1 From: www.kernel.org
2 Subject: Update to 2.6.22
3 Patch-mainline: 2.6.22
4
5 Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 4
11 arch/x86/ia32/ia32entry-xen.S | 18 -
12 arch/x86/kernel/Makefile | 2
13 arch/x86/kernel/acpi/sleep_64-xen.c | 26 -
14 arch/x86/kernel/apic_32-xen.c | 1
15 arch/x86/kernel/apic_64-xen.c | 1
16 arch/x86/kernel/asm-offsets_32.c | 5
17 arch/x86/kernel/cpu/common-xen.c | 224 ++++---------
18 arch/x86/kernel/cpu/mtrr/main-xen.c | 2
19 arch/x86/kernel/e820_32-xen.c | 46 +-
20 arch/x86/kernel/e820_64-xen.c | 28 -
21 arch/x86/kernel/early_printk-xen.c | 27 -
22 arch/x86/kernel/entry_32-xen.S | 30 -
23 arch/x86/kernel/entry_64-xen.S | 7
24 arch/x86/kernel/genapic_64-xen.c | 108 +-----
25 arch/x86/kernel/genapic_xen_64.c | 3
26 arch/x86/kernel/head64-xen.c | 32 +
27 arch/x86/kernel/head_32-xen.S | 101 ------
28 arch/x86/kernel/head_64-xen.S | 52 ---
29 arch/x86/kernel/io_apic_32-xen.c | 43 --
30 arch/x86/kernel/io_apic_64-xen.c | 39 --
31 arch/x86/kernel/ioport_32-xen.c | 2
32 arch/x86/kernel/ioport_64-xen.c | 2
33 arch/x86/kernel/irq_32-xen.c | 3
34 arch/x86/kernel/irq_64-xen.c | 34 +-
35 arch/x86/kernel/ldt_32-xen.c | 1
36 arch/x86/kernel/ldt_64-xen.c | 1
37 arch/x86/kernel/microcode-xen.c | 2
38 arch/x86/kernel/mpparse_32-xen.c | 3
39 arch/x86/kernel/mpparse_64-xen.c | 3
40 arch/x86/kernel/pci-dma-xen.c | 29 +
41 arch/x86/kernel/process_32-xen.c | 27 +
42 arch/x86/kernel/process_64-xen.c | 16
43 arch/x86/kernel/quirks-xen.c | 63 ---
44 arch/x86/kernel/setup64-xen.c | 17 -
45 arch/x86/kernel/setup_64-xen.c | 30 -
46 arch/x86/kernel/smp_32-xen.c | 191 ++++-------
47 arch/x86/kernel/smp_64-xen.c | 29 -
48 arch/x86/kernel/time_32-xen.c | 165 ++++++----
49 arch/x86/kernel/traps_32-xen.c | 46 +-
50 arch/x86/kernel/traps_64-xen.c | 55 +--
51 arch/x86/kernel/vsyscall_64-xen.c | 73 +++-
52 arch/x86/mm/fault_32-xen.c | 42 +-
53 arch/x86/mm/fault_64-xen.c | 15
54 arch/x86/mm/highmem_32-xen.c | 14
55 arch/x86/mm/init_32-xen.c | 157 ++++++---
56 arch/x86/mm/init_64-xen.c | 132 ++++----
57 arch/x86/mm/ioremap_32-xen.c | 1
58 arch/x86/mm/pageattr_64-xen.c | 27 +
59 arch/x86/mm/pgtable_32-xen.c | 206 +++++++-----
60 drivers/char/tpm/tpm_xen.c | 2
61 drivers/pci/msi-xen.c | 127 +++++--
62 drivers/xen/blkfront/blkfront.c | 2
63 drivers/xen/char/mem.c | 1
64 drivers/xen/core/hypervisor_sysfs.c | 2
65 drivers/xen/core/smpboot.c | 45 +-
66 drivers/xen/core/xen_sysfs.c | 24 -
67 drivers/xen/netback/netback.c | 14
68 drivers/xen/netfront/netfront.c | 2
69 drivers/xen/pciback/xenbus.c | 2
70 drivers/xen/pcifront/xenbus.c | 4
71 drivers/xen/scsifront/xenbus.c | 2
72 drivers/xen/sfc_netback/accel_fwd.c | 7
73 drivers/xen/sfc_netback/accel_solarflare.c | 2
74 drivers/xen/sfc_netfront/accel_tso.c | 28 -
75 drivers/xen/sfc_netfront/accel_vi.c | 4
76 drivers/xen/sfc_netfront/accel_xenbus.c | 4
77 fs/aio.c | 7
78 include/asm-x86/mach-xen/asm/desc_32.h | 119 ++++---
79 include/asm-x86/mach-xen/asm/desc_64.h | 30 -
80 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 2
81 include/asm-x86/mach-xen/asm/fixmap_32.h | 9
82 include/asm-x86/mach-xen/asm/fixmap_64.h | 1
83 include/asm-x86/mach-xen/asm/highmem.h | 6
84 include/asm-x86/mach-xen/asm/io_32.h | 13
85 include/asm-x86/mach-xen/asm/irqflags_32.h | 75 ++--
86 include/asm-x86/mach-xen/asm/irqflags_64.h | 19 -
87 include/asm-x86/mach-xen/asm/mmu_context_32.h | 29 +
88 include/asm-x86/mach-xen/asm/mmu_context_64.h | 3
89 include/asm-x86/mach-xen/asm/page_64.h | 61 +--
90 include/asm-x86/mach-xen/asm/pgalloc_32.h | 3
91 include/asm-x86/mach-xen/asm/pgalloc_64.h | 15
92 include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | 2
93 include/asm-x86/mach-xen/asm/pgtable-3level.h | 61 ++-
94 include/asm-x86/mach-xen/asm/pgtable_32.h | 80 ++--
95 include/asm-x86/mach-xen/asm/pgtable_64.h | 83 ++---
96 include/asm-x86/mach-xen/asm/processor_32.h | 141 +++-----
97 include/asm-x86/mach-xen/asm/processor_64.h | 55 ---
98 include/asm-x86/mach-xen/asm/segment_32.h | 10
99 include/asm-x86/mach-xen/asm/smp_32.h | 117 +++++--
100 include/asm-x86/mach-xen/asm/smp_64.h | 20 -
101 include/asm-x86/mach-xen/asm/system_32.h | 342 ++++-----------------
102 include/asm-x86/mach-xen/asm/system_64.h | 106 ------
103 include/asm-x86/mach-xen/asm/tlbflush_32.h | 11
104 include/asm-x86/mach-xen/asm/tlbflush_64.h | 2
105 include/linux/pci.h | 2
106 lib/swiotlb-xen.c | 1
107 net/core/dev.c | 15
108 scripts/Makefile.xen.awk | 2
109 99 files changed, 1771 insertions(+), 2128 deletions(-)
110
111 --- a/arch/x86/ia32/ia32entry-xen.S
112 +++ b/arch/x86/ia32/ia32entry-xen.S
113 @@ -431,11 +431,7 @@ ia32_sys_call_table:
114 .quad sys_symlink
115 .quad sys_lstat
116 .quad sys_readlink /* 85 */
117 -#ifdef CONFIG_IA32_AOUT
118 .quad sys_uselib
119 -#else
120 - .quad quiet_ni_syscall
121 -#endif
122 .quad sys_swapon
123 .quad sys_reboot
124 .quad compat_sys_old_readdir
125 @@ -574,7 +570,7 @@ ia32_sys_call_table:
126 .quad quiet_ni_syscall /* tux */
127 .quad quiet_ni_syscall /* security */
128 .quad sys_gettid
129 - .quad sys_readahead /* 225 */
130 + .quad sys32_readahead /* 225 */
131 .quad sys_setxattr
132 .quad sys_lsetxattr
133 .quad sys_fsetxattr
134 @@ -599,7 +595,7 @@ ia32_sys_call_table:
135 .quad compat_sys_io_getevents
136 .quad compat_sys_io_submit
137 .quad sys_io_cancel
138 - .quad sys_fadvise64 /* 250 */
139 + .quad sys32_fadvise64 /* 250 */
140 .quad quiet_ni_syscall /* free_huge_pages */
141 .quad sys_exit_group
142 .quad sys32_lookup_dcookie
143 @@ -663,10 +659,14 @@ ia32_sys_call_table:
144 .quad compat_sys_set_robust_list
145 .quad compat_sys_get_robust_list
146 .quad sys_splice
147 - .quad sys_sync_file_range
148 - .quad sys_tee
149 + .quad sys32_sync_file_range
150 + .quad sys_tee /* 315 */
151 .quad compat_sys_vmsplice
152 .quad compat_sys_move_pages
153 .quad sys_getcpu
154 .quad sys_epoll_pwait
155 -ia32_syscall_end:
156 + .quad compat_sys_utimensat /* 320 */
157 + .quad compat_sys_signalfd
158 + .quad compat_sys_timerfd
159 + .quad sys_eventfd
160 +ia32_syscall_end:
161 --- a/arch/x86/Kconfig
162 +++ b/arch/x86/Kconfig
163 @@ -1429,7 +1429,7 @@ config PHYSICAL_START
164
165 config RELOCATABLE
166 bool "Build a relocatable kernel (EXPERIMENTAL)"
167 - depends on EXPERIMENTAL && !X86_XEN
168 + depends on EXPERIMENTAL && !X86_XEN && !X86_64_XEN
169 help
170 This builds a kernel image that retains relocation information
171 so it can be loaded someplace besides the default 1MB.
172 @@ -1483,7 +1483,6 @@ config COMPAT_VDSO
173 def_bool y
174 prompt "Compat VDSO support"
175 depends on X86_32 || IA32_EMULATION
176 - depends on !X86_XEN
177 help
178 Map the 32-bit VDSO to the predictable old-style address too.
179 ---help---
180 @@ -1662,6 +1661,7 @@ config PCI
181 bool "PCI support"
182 default y
183 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
184 + select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
185 help
186 Find out whether you have a PCI motherboard. PCI is the name of a
187 bus system, i.e. the way the CPU talks to the other stuff inside
188 --- a/arch/x86/kernel/acpi/sleep_64-xen.c
189 +++ b/arch/x86/kernel/acpi/sleep_64-xen.c
190 @@ -60,19 +60,6 @@ unsigned long acpi_video_flags;
191 extern char wakeup_start, wakeup_end;
192
193 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
194 -
195 -static pgd_t low_ptr;
196 -
197 -static void init_low_mapping(void)
198 -{
199 - pgd_t *slot0 = pgd_offset(current->mm, 0UL);
200 - low_ptr = *slot0;
201 - /* FIXME: We're playing with the current task's page tables here, which
202 - * is potentially dangerous on SMP systems.
203 - */
204 - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
205 - local_flush_tlb();
206 -}
207 #endif
208
209 /**
210 @@ -84,8 +71,6 @@ static void init_low_mapping(void)
211 int acpi_save_state_mem(void)
212 {
213 #ifndef CONFIG_ACPI_PV_SLEEP
214 - init_low_mapping();
215 -
216 memcpy((void *)acpi_wakeup_address, &wakeup_start,
217 &wakeup_end - &wakeup_start);
218 acpi_copy_wakeup_routine(acpi_wakeup_address);
219 @@ -98,10 +83,6 @@ int acpi_save_state_mem(void)
220 */
221 void acpi_restore_state_mem(void)
222 {
223 -#ifndef CONFIG_ACPI_PV_SLEEP
224 - set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
225 - local_flush_tlb();
226 -#endif
227 }
228
229 /**
230 @@ -115,10 +96,11 @@ void acpi_restore_state_mem(void)
231 void __init acpi_reserve_bootmem(void)
232 {
233 #ifndef CONFIG_ACPI_PV_SLEEP
234 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
235 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
236 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
237 + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
238 printk(KERN_CRIT
239 - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
240 + "ACPI: Wakeup code way too big, will crash on attempt"
241 + " to suspend\n");
242 #endif
243 }
244
245 --- a/arch/x86/kernel/apic_32-xen.c
246 +++ b/arch/x86/kernel/apic_32-xen.c
247 @@ -19,7 +19,6 @@
248 #include <linux/mm.h>
249 #include <linux/delay.h>
250 #include <linux/bootmem.h>
251 -#include <linux/smp_lock.h>
252 #include <linux/interrupt.h>
253 #include <linux/mc146818rtc.h>
254 #include <linux/kernel_stat.h>
255 --- a/arch/x86/kernel/apic_64-xen.c
256 +++ b/arch/x86/kernel/apic_64-xen.c
257 @@ -19,7 +19,6 @@
258 #include <linux/mm.h>
259 #include <linux/delay.h>
260 #include <linux/bootmem.h>
261 -#include <linux/smp_lock.h>
262 #include <linux/interrupt.h>
263 #include <linux/mc146818rtc.h>
264 #include <linux/kernel_stat.h>
265 --- a/arch/x86/kernel/asm-offsets_32.c
266 +++ b/arch/x86/kernel/asm-offsets_32.c
267 @@ -109,11 +109,6 @@ void foo(void)
268
269 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
270
271 -#ifdef CONFIG_XEN
272 - BLANK();
273 - OFFSET(XEN_START_mfn_list, start_info, mfn_list);
274 -#endif
275 -
276 #ifdef CONFIG_PARAVIRT
277 BLANK();
278 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
279 --- a/arch/x86/kernel/cpu/common-xen.c
280 +++ b/arch/x86/kernel/cpu/common-xen.c
281 @@ -22,16 +22,40 @@
282 #define phys_pkg_id(a,b) a
283 #endif
284 #endif
285 -#include <asm/pda.h>
286 #include <asm/hypervisor.h>
287
288 #include "cpu.h"
289
290 -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
291 -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
292 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
293 + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
294 + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
295 + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
296 + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
297 +#ifndef CONFIG_XEN
298 + /*
299 + * Segments used for calling PnP BIOS have byte granularity.
300 + * They code segments and data segments have fixed 64k limits,
301 + * the transfer segment sizes are set at run time.
302 + */
303 + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
304 + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
305 + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
306 + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
307 + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
308 + /*
309 + * The APM segments have byte granularity and their bases
310 + * are set at run time. All have 64k limits.
311 + */
312 + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
313 + /* 16-bit code */
314 + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
315 + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
316
317 -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
318 -EXPORT_SYMBOL(_cpu_pda);
319 + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
320 +#endif
321 + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
322 +} };
323 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
324
325 static int cachesize_override __cpuinitdata = -1;
326 static int disable_x86_fxsr __cpuinitdata;
327 @@ -373,7 +397,7 @@ __setup("serialnumber", x86_serial_nr_se
328 /*
329 * This does the hard work of actually picking apart the CPU stuff...
330 */
331 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
332 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
333 {
334 int i;
335
336 @@ -484,15 +508,22 @@ void __cpuinit identify_cpu(struct cpuin
337
338 /* Init Machine Check Exception if available. */
339 mcheck_init(c);
340 +}
341
342 - if (c == &boot_cpu_data)
343 - sysenter_setup();
344 +void __init identify_boot_cpu(void)
345 +{
346 + identify_cpu(&boot_cpu_data);
347 + sysenter_setup();
348 enable_sep_cpu();
349 + mtrr_bp_init();
350 +}
351
352 - if (c == &boot_cpu_data)
353 - mtrr_bp_init();
354 - else
355 - mtrr_ap_init();
356 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
357 +{
358 + BUG_ON(c == &boot_cpu_data);
359 + identify_cpu(c);
360 + enable_sep_cpu();
361 + mtrr_ap_init();
362 }
363
364 #ifdef CONFIG_X86_HT
365 @@ -606,136 +637,47 @@ void __init early_cpu_init(void)
366 #endif
367 }
368
369 -/* Make sure %gs is initialized properly in idle threads */
370 +/* Make sure %fs is initialized properly in idle threads */
371 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
372 {
373 memset(regs, 0, sizeof(struct pt_regs));
374 - regs->xfs = __KERNEL_PDA;
375 + regs->xfs = __KERNEL_PERCPU;
376 return regs;
377 }
378
379 -static __cpuinit int alloc_gdt(int cpu)
380 +/* Current gdt points %fs at the "master" per-cpu area: after this,
381 + * it's on the real one. */
382 +void switch_to_new_gdt(void)
383 {
384 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
385 - struct desc_struct *gdt;
386 - struct i386_pda *pda;
387 -
388 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
389 - pda = cpu_pda(cpu);
390 -
391 - /*
392 - * This is a horrible hack to allocate the GDT. The problem
393 - * is that cpu_init() is called really early for the boot CPU
394 - * (and hence needs bootmem) but much later for the secondary
395 - * CPUs, when bootmem will have gone away
396 - */
397 - if (NODE_DATA(0)->bdata->node_bootmem_map) {
398 - BUG_ON(gdt != NULL || pda != NULL);
399 -
400 - gdt = alloc_bootmem_pages(PAGE_SIZE);
401 - pda = alloc_bootmem(sizeof(*pda));
402 - /* alloc_bootmem(_pages) panics on failure, so no check */
403 -
404 - memset(gdt, 0, PAGE_SIZE);
405 - memset(pda, 0, sizeof(*pda));
406 - } else {
407 - /* GDT and PDA might already have been allocated if
408 - this is a CPU hotplug re-insertion. */
409 - if (gdt == NULL)
410 - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
411 -
412 - if (pda == NULL)
413 - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
414 -
415 - if (unlikely(!gdt || !pda)) {
416 - free_pages((unsigned long)gdt, 0);
417 - kfree(pda);
418 - return 0;
419 - }
420 - }
421 -
422 - cpu_gdt_descr->address = (unsigned long)gdt;
423 - cpu_pda(cpu) = pda;
424 -
425 - return 1;
426 -}
427 -
428 -/* Initial PDA used by boot CPU */
429 -struct i386_pda boot_pda = {
430 - ._pda = &boot_pda,
431 - .cpu_number = 0,
432 - .pcurrent = &init_task,
433 -};
434 -
435 -static inline void set_kernel_fs(void)
436 -{
437 - /* Set %fs for this CPU's PDA. Memory clobber is to create a
438 - barrier with respect to any PDA operations, so the compiler
439 - doesn't move any before here. */
440 - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
441 -}
442 -
443 -/* Initialize the CPU's GDT and PDA. The boot CPU does this for
444 - itself, but secondaries find this done for them. */
445 -__cpuinit int init_gdt(int cpu, struct task_struct *idle)
446 -{
447 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
448 - struct desc_struct *gdt;
449 - struct i386_pda *pda;
450 -
451 - /* For non-boot CPUs, the GDT and PDA should already have been
452 - allocated. */
453 - if (!alloc_gdt(cpu)) {
454 - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
455 - return 0;
456 - }
457 -
458 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
459 - pda = cpu_pda(cpu);
460 -
461 - BUG_ON(gdt == NULL || pda == NULL);
462 -
463 - /*
464 - * Initialize the per-CPU GDT with the boot GDT,
465 - * and set up the GDT descriptor:
466 - */
467 - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
468 - cpu_gdt_descr->size = GDT_SIZE - 1;
469 -
470 - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
471 - (u32 *)&gdt[GDT_ENTRY_PDA].b,
472 - (unsigned long)pda, sizeof(*pda) - 1,
473 - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
474 -
475 - memset(pda, 0, sizeof(*pda));
476 - pda->_pda = pda;
477 - pda->cpu_number = cpu;
478 - pda->pcurrent = idle;
479 -
480 - return 1;
481 -}
482 -
483 -void __cpuinit cpu_set_gdt(int cpu)
484 -{
485 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
486 + struct Xgt_desc_struct gdt_descr;
487 unsigned long va, frames[16];
488 int f;
489
490 - for (va = cpu_gdt_descr->address, f = 0;
491 - va < cpu_gdt_descr->address + cpu_gdt_descr->size;
492 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
493 + gdt_descr.size = GDT_SIZE - 1;
494 +
495 + for (va = gdt_descr.address, f = 0;
496 + va < gdt_descr.address + gdt_descr.size;
497 va += PAGE_SIZE, f++) {
498 frames[f] = virt_to_mfn(va);
499 make_lowmem_page_readonly(
500 (void *)va, XENFEAT_writable_descriptor_tables);
501 }
502 - BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
503 -
504 - set_kernel_fs();
505 + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
506 + BUG();
507 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
508 }
509
510 -/* Common CPU init for both boot and secondary CPUs */
511 -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
512 +/*
513 + * cpu_init() initializes state that is per-CPU. Some data is already
514 + * initialized (naturally) in the bootstrap process, such as the GDT
515 + * and IDT. We reload them nevertheless, this function acts as a
516 + * 'CPU state barrier', nothing should get across.
517 + */
518 +void __cpuinit cpu_init(void)
519 {
520 + int cpu = smp_processor_id();
521 + struct task_struct *curr = current;
522 #ifndef CONFIG_X86_NO_TSS
523 struct tss_struct * t = &per_cpu(init_tss, cpu);
524 #endif
525 @@ -757,6 +699,8 @@ static void __cpuinit _cpu_init(int cpu,
526 set_in_cr4(X86_CR4_TSD);
527 }
528
529 + switch_to_new_gdt();
530 +
531 /*
532 * Set up and load the per-CPU TSS and LDT
533 */
534 @@ -794,38 +738,6 @@ static void __cpuinit _cpu_init(int cpu,
535 mxcsr_feature_mask_init();
536 }
537
538 -/* Entrypoint to initialize secondary CPU */
539 -void __cpuinit secondary_cpu_init(void)
540 -{
541 - int cpu = smp_processor_id();
542 - struct task_struct *curr = current;
543 -
544 - _cpu_init(cpu, curr);
545 -}
546 -
547 -/*
548 - * cpu_init() initializes state that is per-CPU. Some data is already
549 - * initialized (naturally) in the bootstrap process, such as the GDT
550 - * and IDT. We reload them nevertheless, this function acts as a
551 - * 'CPU state barrier', nothing should get across.
552 - */
553 -void __cpuinit cpu_init(void)
554 -{
555 - int cpu = smp_processor_id();
556 - struct task_struct *curr = current;
557 -
558 - /* Set up the real GDT and PDA, so we can transition from the
559 - boot versions. */
560 - if (!init_gdt(cpu, curr)) {
561 - /* failed to allocate something; not much we can do... */
562 - for (;;)
563 - local_irq_enable();
564 - }
565 -
566 - cpu_set_gdt(cpu);
567 - _cpu_init(cpu, curr);
568 -}
569 -
570 #ifdef CONFIG_HOTPLUG_CPU
571 void __cpuinit cpu_uninit(void)
572 {
573 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
574 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
575 @@ -167,7 +167,7 @@ mtrr_del(int reg, unsigned long base, un
576 EXPORT_SYMBOL(mtrr_add);
577 EXPORT_SYMBOL(mtrr_del);
578
579 -void __init mtrr_bp_init(void)
580 +__init void mtrr_bp_init(void)
581 {
582 }
583
584 --- a/arch/x86/kernel/e820_32-xen.c
585 +++ b/arch/x86/kernel/e820_32-xen.c
586 @@ -162,26 +162,27 @@ static struct resource standard_io_resou
587
588 static int __init romsignature(const unsigned char *rom)
589 {
590 + const unsigned short * const ptr = (const unsigned short *)rom;
591 unsigned short sig;
592
593 - return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
594 - sig == ROMSIGNATURE;
595 + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
596 }
597
598 -static int __init romchecksum(unsigned char *rom, unsigned long length)
599 +static int __init romchecksum(const unsigned char *rom, unsigned long length)
600 {
601 - unsigned char sum;
602 + unsigned char sum, c;
603
604 - for (sum = 0; length; length--)
605 - sum += *rom++;
606 - return sum == 0;
607 + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
608 + sum += c;
609 + return !length && !sum;
610 }
611
612 static void __init probe_roms(void)
613 {
614 + const unsigned char *rom;
615 unsigned long start, length, upper;
616 - unsigned char *rom;
617 - int i;
618 + unsigned char c;
619 + int i;
620
621 #ifdef CONFIG_XEN
622 /* Nothing to do if not running in dom0. */
623 @@ -198,8 +199,11 @@ static void __init probe_roms(void)
624
625 video_rom_resource.start = start;
626
627 + if (probe_kernel_address(rom + 2, c) != 0)
628 + continue;
629 +
630 /* 0 < length <= 0x7f * 512, historically */
631 - length = rom[2] * 512;
632 + length = c * 512;
633
634 /* if checksum okay, trust length byte */
635 if (length && romchecksum(rom, length))
636 @@ -233,8 +237,11 @@ static void __init probe_roms(void)
637 if (!romsignature(rom))
638 continue;
639
640 + if (probe_kernel_address(rom + 2, c) != 0)
641 + continue;
642 +
643 /* 0 < length <= 0x7f * 512, historically */
644 - length = rom[2] * 512;
645 + length = c * 512;
646
647 /* but accept any length that fits if checksum okay */
648 if (!length || start + length > upper || !romchecksum(rom, length))
649 @@ -249,7 +256,7 @@ static void __init probe_roms(void)
650 }
651
652 #ifdef CONFIG_XEN
653 -static struct e820map machine_e820 __initdata;
654 +static struct e820map machine_e820;
655 #define e820 machine_e820
656 #endif
657
658 @@ -409,10 +416,8 @@ int __init sanitize_e820_map(struct e820
659 ____________________33__
660 ______________________4_
661 */
662 - printk("sanitize start\n");
663 /* if there's only one memory region, don't bother */
664 if (*pnr_map < 2) {
665 - printk("sanitize bail 0\n");
666 return -1;
667 }
668
669 @@ -421,7 +426,6 @@ int __init sanitize_e820_map(struct e820
670 /* bail out if we find any unreasonable addresses in bios map */
671 for (i=0; i<old_nr; i++)
672 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
673 - printk("sanitize bail 1\n");
674 return -1;
675 }
676
677 @@ -517,7 +521,6 @@ int __init sanitize_e820_map(struct e820
678 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
679 *pnr_map = new_nr;
680
681 - printk("sanitize end\n");
682 return 0;
683 }
684
685 @@ -552,7 +555,6 @@ int __init copy_e820_map(struct e820entr
686 unsigned long long size = biosmap->size;
687 unsigned long long end = start + size;
688 unsigned long type = biosmap->type;
689 - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
690
691 /* Overflow in 64 bits? Ignore the memory map. */
692 if (start > end)
693 @@ -564,17 +566,11 @@ int __init copy_e820_map(struct e820entr
694 * Not right. Fix it up.
695 */
696 if (type == E820_RAM) {
697 - printk("copy_e820_map() type is E820_RAM\n");
698 if (start < 0x100000ULL && end > 0xA0000ULL) {
699 - printk("copy_e820_map() lies in range...\n");
700 - if (start < 0xA0000ULL) {
701 - printk("copy_e820_map() start < 0xA0000ULL\n");
702 + if (start < 0xA0000ULL)
703 add_memory_region(start, 0xA0000ULL-start, type);
704 - }
705 - if (end <= 0x100000ULL) {
706 - printk("copy_e820_map() end <= 0x100000ULL\n");
707 + if (end <= 0x100000ULL)
708 continue;
709 - }
710 start = 0x100000ULL;
711 size = end - start;
712 }
713 --- a/arch/x86/kernel/e820_64-xen.c
714 +++ b/arch/x86/kernel/e820_64-xen.c
715 @@ -17,6 +17,8 @@
716 #include <linux/kexec.h>
717 #include <linux/module.h>
718 #include <linux/mm.h>
719 +#include <linux/suspend.h>
720 +#include <linux/pfn.h>
721
722 #include <asm/pgtable.h>
723 #include <asm/page.h>
724 @@ -28,7 +30,7 @@
725
726 struct e820map e820 __initdata;
727 #ifdef CONFIG_XEN
728 -struct e820map machine_e820 __initdata;
729 +struct e820map machine_e820;
730 #endif
731
732 /*
733 @@ -291,22 +293,6 @@ void __init e820_reserve_resources(struc
734 }
735
736 #ifndef CONFIG_XEN
737 -/* Mark pages corresponding to given address range as nosave */
738 -static void __init
739 -e820_mark_nosave_range(unsigned long start, unsigned long end)
740 -{
741 - unsigned long pfn, max_pfn;
742 -
743 - if (start >= end)
744 - return;
745 -
746 - printk("Nosave address range: %016lx - %016lx\n", start, end);
747 - max_pfn = end >> PAGE_SHIFT;
748 - for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
749 - if (pfn_valid(pfn))
750 - SetPageNosave(pfn_to_page(pfn));
751 -}
752 -
753 /*
754 * Find the ranges of physical addresses that do not correspond to
755 * e820 RAM areas and mark the corresponding pages as nosave for software
756 @@ -325,13 +311,13 @@ void __init e820_mark_nosave_regions(voi
757 struct e820entry *ei = &e820.map[i];
758
759 if (paddr < ei->addr)
760 - e820_mark_nosave_range(paddr,
761 - round_up(ei->addr, PAGE_SIZE));
762 + register_nosave_region(PFN_DOWN(paddr),
763 + PFN_UP(ei->addr));
764
765 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
766 if (ei->type != E820_RAM)
767 - e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
768 - paddr);
769 + register_nosave_region(PFN_UP(ei->addr),
770 + PFN_DOWN(paddr));
771
772 if (paddr >= (end_pfn << PAGE_SHIFT))
773 break;
774 --- a/arch/x86/kernel/early_printk-xen.c
775 +++ b/arch/x86/kernel/early_printk-xen.c
776 @@ -11,11 +11,10 @@
777
778 #ifdef __i386__
779 #include <asm/setup.h>
780 -#define VGABASE (__ISA_IO_base + 0xb8000)
781 #else
782 #include <asm/bootsetup.h>
783 -#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
784 #endif
785 +#define VGABASE (__ISA_IO_base + 0xb8000)
786
787 #ifndef CONFIG_XEN
788 static int max_ypos = 25, max_xpos = 80;
789 @@ -93,9 +92,9 @@ static int early_serial_putc(unsigned ch
790 static void early_serial_write(struct console *con, const char *s, unsigned n)
791 {
792 while (*s && n-- > 0) {
793 - early_serial_putc(*s);
794 if (*s == '\n')
795 early_serial_putc('\r');
796 + early_serial_putc(*s);
797 s++;
798 }
799 }
800 @@ -205,7 +204,7 @@ static noinline long simnow(long cmd, lo
801 return ret;
802 }
803
804 -void __init simnow_init(char *str)
805 +static void __init simnow_init(char *str)
806 {
807 char *fn = "klog";
808 if (*str == '=')
809 @@ -277,22 +276,12 @@ static int __init setup_early_printk(cha
810 early_console = &simnow_console;
811 keep_early = 1;
812 }
813 +
814 + if (keep_early)
815 + early_console->flags &= ~CON_BOOT;
816 + else
817 + early_console->flags |= CON_BOOT;
818 register_console(early_console);
819 return 0;
820 }
821 -
822 early_param("earlyprintk", setup_early_printk);
823 -
824 -void __init disable_early_printk(void)
825 -{
826 - if (!early_console_initialized || !early_console)
827 - return;
828 - if (!keep_early) {
829 - printk("disabling early console\n");
830 - unregister_console(early_console);
831 - early_console_initialized = 0;
832 - } else {
833 - printk("keeping early console\n");
834 - }
835 -}
836 -
837 --- a/arch/x86/kernel/entry_32-xen.S
838 +++ b/arch/x86/kernel/entry_32-xen.S
839 @@ -15,7 +15,7 @@
840 * I changed all the .align's to 4 (16 byte alignment), as that's faster
841 * on a 486.
842 *
843 - * Stack layout in 'ret_from_system_call':
844 + * Stack layout in 'syscall_exit':
845 * ptrace needs to have all regs on the stack.
846 * if the order here is changed, it needs to be
847 * updated in fork.c:copy_process, signal.c:do_signal,
848 @@ -135,7 +135,7 @@ NMI_MASK = 0x80000000
849 movl $(__USER_DS), %edx; \
850 movl %edx, %ds; \
851 movl %edx, %es; \
852 - movl $(__KERNEL_PDA), %edx; \
853 + movl $(__KERNEL_PERCPU), %edx; \
854 movl %edx, %fs
855
856 #define RESTORE_INT_REGS \
857 @@ -308,16 +308,12 @@ sysenter_past_esp:
858 pushl $(__USER_CS)
859 CFI_ADJUST_CFA_OFFSET 4
860 /*CFI_REL_OFFSET cs, 0*/
861 -#ifndef CONFIG_COMPAT_VDSO
862 /*
863 * Push current_thread_info()->sysenter_return to the stack.
864 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
865 * pushed above; +8 corresponds to copy_thread's esp0 setting.
866 */
867 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
868 -#else
869 - pushl $SYSENTER_RETURN
870 -#endif
871 CFI_ADJUST_CFA_OFFSET 4
872 CFI_REL_OFFSET eip, 0
873
874 @@ -345,7 +341,7 @@ sysenter_past_esp:
875 jae syscall_badsys
876 call *sys_call_table(,%eax,4)
877 movl %eax,PT_EAX(%esp)
878 - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
879 + DISABLE_INTERRUPTS(CLBR_ANY)
880 TRACE_IRQS_OFF
881 movl TI_flags(%ebp), %ecx
882 testw $_TIF_ALLWORK_MASK, %cx
883 @@ -400,10 +396,6 @@ ENTRY(system_call)
884 CFI_ADJUST_CFA_OFFSET 4
885 SAVE_ALL
886 GET_THREAD_INFO(%ebp)
887 - testl $TF_MASK,PT_EFLAGS(%esp)
888 - jz no_singlestep
889 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
890 -no_singlestep:
891 # system call tracing in operation / emulation
892 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
893 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
894 @@ -418,6 +410,10 @@ syscall_exit:
895 # setting need_resched or sigpending
896 # between sampling and the iret
897 TRACE_IRQS_OFF
898 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
899 + jz no_singlestep
900 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
901 +no_singlestep:
902 movl TI_flags(%ebp), %ecx
903 testw $_TIF_ALLWORK_MASK, %cx # current->work
904 jne syscall_exit_work
905 @@ -635,9 +631,7 @@ END(syscall_badsys)
906 #ifndef CONFIG_XEN
907 #define FIXUP_ESPFIX_STACK \
908 /* since we are on a wrong stack, we cant make it a C code :( */ \
909 - movl %fs:PDA_cpu, %ebx; \
910 - PER_CPU(cpu_gdt_descr, %ebx); \
911 - movl GDS_address(%ebx), %ebx; \
912 + PER_CPU(gdt_page, %ebx); \
913 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
914 addl %esp, %eax; \
915 pushl $__KERNEL_DS; \
916 @@ -710,7 +704,7 @@ ENTRY(name) \
917 SAVE_ALL; \
918 TRACE_IRQS_OFF \
919 movl %esp,%eax; \
920 - call smp_/**/name; \
921 + call smp_##name; \
922 jmp ret_from_intr; \
923 CFI_ENDPROC; \
924 ENDPROC(name)
925 @@ -718,10 +712,6 @@ ENDPROC(name)
926 /* The include is where all of the SMP etc. interrupts come from */
927 #include "entry_arch.h"
928
929 -/* This alternate entry is needed because we hijack the apic LVTT */
930 -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
931 -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
932 -#endif
933 #else
934 #define UNWIND_ESPFIX_STACK
935 #endif
936 @@ -764,7 +754,7 @@ error_code:
937 pushl %fs
938 CFI_ADJUST_CFA_OFFSET 4
939 /*CFI_REL_OFFSET fs, 0*/
940 - movl $(__KERNEL_PDA), %ecx
941 + movl $(__KERNEL_PERCPU), %ecx
942 movl %ecx, %fs
943 UNWIND_ESPFIX_STACK
944 popl %ecx
945 --- a/arch/x86/kernel/entry_64-xen.S
946 +++ b/arch/x86/kernel/entry_64-xen.S
947 @@ -1254,3 +1254,10 @@ ENTRY(call_softirq)
948 ret
949 CFI_ENDPROC
950 ENDPROC(call_softirq)
951 +
952 +KPROBE_ENTRY(ignore_sysret)
953 + CFI_STARTPROC
954 + mov $-ENOSYS,%eax
955 + HYPERVISOR_IRET 0
956 + CFI_ENDPROC
957 +ENDPROC(ignore_sysret)
958 --- a/arch/x86/kernel/genapic_64-xen.c
959 +++ b/arch/x86/kernel/genapic_64-xen.c
960 @@ -11,123 +11,57 @@
961 #include <linux/threads.h>
962 #include <linux/cpumask.h>
963 #include <linux/string.h>
964 +#include <linux/module.h>
965 #include <linux/kernel.h>
966 #include <linux/ctype.h>
967 #include <linux/init.h>
968 -#include <linux/module.h>
969
970 #include <asm/smp.h>
971 #include <asm/ipi.h>
972 +#include <asm/genapic.h>
973
974 -#if defined(CONFIG_ACPI)
975 +#ifdef CONFIG_ACPI
976 #include <acpi/acpi_bus.h>
977 #endif
978
979 /* which logical CPU number maps to which CPU (physical APIC ID) */
980 -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
981 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
982 + = { [0 ... NR_CPUS-1] = BAD_APICID };
983 EXPORT_SYMBOL(x86_cpu_to_apicid);
984 -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
985
986 -extern struct genapic apic_cluster;
987 -extern struct genapic apic_flat;
988 -extern struct genapic apic_physflat;
989 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
990
991 #ifndef CONFIG_XEN
992 -struct genapic *genapic = &apic_flat;
993 -struct genapic *genapic_force;
994 +struct genapic __read_mostly *genapic = &apic_flat;
995 #else
996 extern struct genapic apic_xen;
997 -struct genapic *genapic = &apic_xen;
998 +struct genapic __read_mostly *genapic = &apic_xen;
999 #endif
1000
1001
1002 /*
1003 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
1004 */
1005 -void __init clustered_apic_check(void)
1006 +void __init setup_apic_routing(void)
1007 {
1008 #ifndef CONFIG_XEN
1009 - long i;
1010 - u8 clusters, max_cluster;
1011 - u8 id;
1012 - u8 cluster_cnt[NUM_APIC_CLUSTERS];
1013 - int max_apic = 0;
1014 -
1015 - /* genapic selection can be forced because of certain quirks.
1016 - */
1017 - if (genapic_force) {
1018 - genapic = genapic_force;
1019 - goto print;
1020 - }
1021 -
1022 -#if defined(CONFIG_ACPI)
1023 +#ifdef CONFIG_ACPI
1024 /*
1025 - * Some x86_64 machines use physical APIC mode regardless of how many
1026 - * procs/clusters are present (x86_64 ES7000 is an example).
1027 + * Quirk: some x86_64 machines can only use physical APIC mode
1028 + * regardless of how many processors are present (x86_64 ES7000
1029 + * is an example).
1030 */
1031 - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
1032 - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
1033 - genapic = &apic_cluster;
1034 - goto print;
1035 - }
1036 -#endif
1037 -
1038 - memset(cluster_cnt, 0, sizeof(cluster_cnt));
1039 - for (i = 0; i < NR_CPUS; i++) {
1040 - id = bios_cpu_apicid[i];
1041 - if (id == BAD_APICID)
1042 - continue;
1043 - if (id > max_apic)
1044 - max_apic = id;
1045 - cluster_cnt[APIC_CLUSTERID(id)]++;
1046 - }
1047 -
1048 - /* Don't use clustered mode on AMD platforms. */
1049 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
1050 + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
1051 + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
1052 genapic = &apic_physflat;
1053 -#ifndef CONFIG_HOTPLUG_CPU
1054 - /* In the CPU hotplug case we cannot use broadcast mode
1055 - because that opens a race when a CPU is removed.
1056 - Stay at physflat mode in this case.
1057 - It is bad to do this unconditionally though. Once
1058 - we have ACPI platform support for CPU hotplug
1059 - we should detect hotplug capablity from ACPI tables and
1060 - only do this when really needed. -AK */
1061 - if (max_apic <= 8)
1062 - genapic = &apic_flat;
1063 -#endif
1064 - goto print;
1065 - }
1066 -
1067 - clusters = 0;
1068 - max_cluster = 0;
1069 -
1070 - for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1071 - if (cluster_cnt[i] > 0) {
1072 - ++clusters;
1073 - if (cluster_cnt[i] > max_cluster)
1074 - max_cluster = cluster_cnt[i];
1075 - }
1076 - }
1077 + else
1078 +#endif
1079
1080 - /*
1081 - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
1082 - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
1083 - * else physical mode.
1084 - * (We don't use lowest priority delivery + HW APIC IRQ steering, so
1085 - * can ignore the clustered logical case and go straight to physical.)
1086 - */
1087 - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
1088 -#ifdef CONFIG_HOTPLUG_CPU
1089 - /* Don't use APIC shortcuts in CPU hotplug to avoid races */
1090 - genapic = &apic_physflat;
1091 -#else
1092 + if (cpus_weight(cpu_possible_map) <= 8)
1093 genapic = &apic_flat;
1094 -#endif
1095 - } else
1096 - genapic = &apic_cluster;
1097 + else
1098 + genapic = &apic_physflat;
1099
1100 -print:
1101 #else
1102 /* hardcode to xen apic functions */
1103 genapic = &apic_xen;
1104 @@ -135,7 +69,7 @@ print:
1105 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
1106 }
1107
1108 -/* Same for both flat and clustered. */
1109 +/* Same for both flat and physical. */
1110
1111 #ifdef CONFIG_XEN
1112 extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
1113 --- a/arch/x86/kernel/genapic_xen_64.c
1114 +++ b/arch/x86/kernel/genapic_xen_64.c
1115 @@ -21,9 +21,8 @@
1116 #include <asm/ipi.h>
1117 #else
1118 #include <asm/apic.h>
1119 -#include <asm/apicdef.h>
1120 -#include <asm/genapic.h>
1121 #endif
1122 +#include <asm/genapic.h>
1123 #include <xen/evtchn.h>
1124
1125 DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
1126 --- a/arch/x86/kernel/head_32-xen.S
1127 +++ b/arch/x86/kernel/head_32-xen.S
1128 @@ -37,7 +37,8 @@ ENTRY(startup_32)
1129 /* Set up the stack pointer */
1130 movl $(init_thread_union+THREAD_SIZE),%esp
1131
1132 - call setup_pda
1133 + movl %ss,%eax
1134 + movl %eax,%fs # gets reset once there's real percpu
1135
1136 /* get vendor info */
1137 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
1138 @@ -64,55 +65,11 @@ ENTRY(startup_32)
1139 xorl %eax,%eax # Clear GS
1140 movl %eax,%gs
1141
1142 - movl $(__KERNEL_PDA),%eax
1143 - mov %eax,%fs
1144 -
1145 cld # gcc2 wants the direction flag cleared at all times
1146
1147 pushl $0 # fake return address for unwinder
1148 jmp start_kernel
1149
1150 -/*
1151 - * Point the GDT at this CPU's PDA. This will be
1152 - * cpu_gdt_table and boot_pda.
1153 - */
1154 -ENTRY(setup_pda)
1155 - /* get the PDA pointer */
1156 - movl $boot_pda, %eax
1157 -
1158 - /* slot the PDA address into the GDT */
1159 - mov $cpu_gdt_table, %ecx
1160 - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
1161 - shr $16, %eax
1162 - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
1163 - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
1164 -
1165 - # %esi still points to start_info, and no registers
1166 - # need to be preserved.
1167 -
1168 - movl XEN_START_mfn_list(%esi), %ebx
1169 - movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
1170 - shrl $PAGE_SHIFT, %eax
1171 - movl (%ebx,%eax,4), %ecx
1172 - pushl %ecx # frame number for set_gdt below
1173 -
1174 - xorl %esi, %esi
1175 - xorl %edx, %edx
1176 - shldl $PAGE_SHIFT, %ecx, %edx
1177 - shll $PAGE_SHIFT, %ecx
1178 - orl $0x61, %ecx
1179 - movl $cpu_gdt_table, %ebx
1180 - movl $__HYPERVISOR_update_va_mapping, %eax
1181 - int $0x82
1182 -
1183 - movl $(PAGE_SIZE_asm / 8), %ecx
1184 - movl %esp, %ebx
1185 - movl $__HYPERVISOR_set_gdt, %eax
1186 - int $0x82
1187 -
1188 - popl %ecx
1189 - ret
1190 -
1191 #define HYPERCALL_PAGE_OFFSET 0x1000
1192 .org HYPERCALL_PAGE_OFFSET
1193 ENTRY(hypercall_page)
1194 @@ -138,60 +95,6 @@ ENTRY(empty_zero_page)
1195 */
1196 .data
1197
1198 -/*
1199 - * The Global Descriptor Table contains 28 quadwords, per-CPU.
1200 - */
1201 - .section .data.page_aligned, "aw"
1202 - .align PAGE_SIZE_asm
1203 -ENTRY(cpu_gdt_table)
1204 - .quad 0x0000000000000000 /* NULL descriptor */
1205 - .quad 0x0000000000000000 /* 0x0b reserved */
1206 - .quad 0x0000000000000000 /* 0x13 reserved */
1207 - .quad 0x0000000000000000 /* 0x1b reserved */
1208 - .quad 0x0000000000000000 /* 0x20 unused */
1209 - .quad 0x0000000000000000 /* 0x28 unused */
1210 - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
1211 - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
1212 - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
1213 - .quad 0x0000000000000000 /* 0x4b reserved */
1214 - .quad 0x0000000000000000 /* 0x53 reserved */
1215 - .quad 0x0000000000000000 /* 0x5b reserved */
1216 -
1217 - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
1218 - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
1219 - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
1220 - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
1221 -
1222 - .quad 0x0000000000000000 /* 0x80 TSS descriptor */
1223 - .quad 0x0000000000000000 /* 0x88 LDT descriptor */
1224 -
1225 - /*
1226 - * Segments used for calling PnP BIOS have byte granularity.
1227 - * They code segments and data segments have fixed 64k limits,
1228 - * the transfer segment sizes are set at run time.
1229 - */
1230 - .quad 0x0000000000000000 /* 0x90 32-bit code */
1231 - .quad 0x0000000000000000 /* 0x98 16-bit code */
1232 - .quad 0x0000000000000000 /* 0xa0 16-bit data */
1233 - .quad 0x0000000000000000 /* 0xa8 16-bit data */
1234 - .quad 0x0000000000000000 /* 0xb0 16-bit data */
1235 -
1236 - /*
1237 - * The APM segments have byte granularity and their bases
1238 - * are set at run time. All have 64k limits.
1239 - */
1240 - .quad 0x0000000000000000 /* 0xb8 APM CS code */
1241 - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
1242 - .quad 0x0000000000000000 /* 0xc8 APM DS data */
1243 -
1244 - .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
1245 - .quad 0x00cf92000000ffff /* 0xd8 - PDA */
1246 - .quad 0x0000000000000000 /* 0xe0 - unused */
1247 - .quad 0x0000000000000000 /* 0xe8 - unused */
1248 - .quad 0x0000000000000000 /* 0xf0 - unused */
1249 - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
1250 - .align PAGE_SIZE_asm
1251 -
1252 #if CONFIG_XEN_COMPAT <= 0x030002
1253 /*
1254 * __xen_guest information
1255 --- a/arch/x86/kernel/head64-xen.c
1256 +++ b/arch/x86/kernel/head64-xen.c
1257 @@ -25,13 +25,21 @@
1258 #include <asm/setup.h>
1259 #include <asm/desc.h>
1260 #include <asm/pgtable.h>
1261 +#include <asm/tlbflush.h>
1262 #include <asm/sections.h>
1263
1264 unsigned long start_pfn;
1265
1266 +#ifndef CONFIG_XEN
1267 +static void __init zap_identity_mappings(void)
1268 +{
1269 + pgd_t *pgd = pgd_offset_k(0UL);
1270 + pgd_clear(pgd);
1271 + __flush_tlb();
1272 +}
1273 +
1274 /* Don't add a printk in there. printk relies on the PDA which is not initialized
1275 yet. */
1276 -#if 0
1277 static void __init clear_bss(void)
1278 {
1279 memset(__bss_start, 0,
1280 @@ -40,26 +48,25 @@ static void __init clear_bss(void)
1281 #endif
1282
1283 #define NEW_CL_POINTER 0x228 /* Relative to real mode data */
1284 -#define OLD_CL_MAGIC_ADDR 0x90020
1285 +#define OLD_CL_MAGIC_ADDR 0x20
1286 #define OLD_CL_MAGIC 0xA33F
1287 -#define OLD_CL_BASE_ADDR 0x90000
1288 -#define OLD_CL_OFFSET 0x90022
1289 +#define OLD_CL_OFFSET 0x22
1290
1291 static void __init copy_bootdata(char *real_mode_data)
1292 {
1293 #ifndef CONFIG_XEN
1294 - int new_data;
1295 + unsigned long new_data;
1296 char * command_line;
1297
1298 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
1299 - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1300 + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
1301 if (!new_data) {
1302 - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1303 + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
1304 return;
1305 }
1306 - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1307 + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
1308 }
1309 - command_line = (char *) ((u64)(new_data));
1310 + command_line = __va(new_data);
1311 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
1312 #else
1313 int max_cmdline;
1314 @@ -101,10 +108,13 @@ void __init x86_64_start_kernel(char * r
1315 while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
1316 machine_to_phys_order++;
1317
1318 -#if 0
1319 +#ifndef CONFIG_XEN
1320 /* clear bss before set_intr_gate with early_idt_handler */
1321 clear_bss();
1322
1323 + /* Make NULL pointers segfault */
1324 + zap_identity_mappings();
1325 +
1326 for (i = 0; i < IDT_ENTRIES; i++)
1327 set_intr_gate(i, early_idt_handler);
1328 asm volatile("lidt %0" :: "m" (idt_descr));
1329 @@ -116,7 +126,7 @@ void __init x86_64_start_kernel(char * r
1330 cpu_pda(i) = &boot_cpu_pda[i];
1331
1332 pda_init(0);
1333 - copy_bootdata(real_mode_data);
1334 + copy_bootdata(__va(real_mode_data));
1335 #ifdef CONFIG_SMP
1336 cpu_set(0, cpu_online_map);
1337 #endif
1338 --- a/arch/x86/kernel/head_64-xen.S
1339 +++ b/arch/x86/kernel/head_64-xen.S
1340 @@ -5,6 +5,7 @@
1341 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1342 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
1343 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
1344 + * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
1345 * Jun Nakajima <jun.nakajima@intel.com>
1346 * Modified for Xen
1347 */
1348 @@ -34,27 +35,15 @@ startup_64:
1349 pushq $0 # fake return address
1350 jmp x86_64_start_kernel
1351
1352 -#ifdef CONFIG_ACPI_SLEEP
1353 -.org 0xf00
1354 - .globl pGDT32
1355 -pGDT32:
1356 - .word gdt_end-cpu_gdt_table-1
1357 - .long cpu_gdt_table-__START_KERNEL_map
1358 -#endif
1359 -ENTRY(stext)
1360 -ENTRY(_stext)
1361 +.balign PAGE_SIZE
1362
1363 - $page = 0
1364 #define NEXT_PAGE(name) \
1365 - $page = $page + 1; \
1366 - .org $page * 0x1000; \
1367 - phys_##name = $page * 0x1000 + __PHYSICAL_START; \
1368 + .balign PAGE_SIZE; \
1369 + phys_##name = . - .bootstrap.text; \
1370 ENTRY(name)
1371
1372 NEXT_PAGE(init_level4_pgt)
1373 - /* This gets initialized in x86_64_start_kernel */
1374 .fill 512,8,0
1375 -NEXT_PAGE(init_level4_user_pgt)
1376 /*
1377 * We update two pgd entries to make kernel and user pgd consistent
1378 * at pgd_populate(). It can be used for kernel modules. So we place
1379 @@ -101,14 +90,6 @@ NEXT_PAGE(hypercall_page)
1380 #undef NEXT_PAGE
1381
1382 .data
1383 -/* Just dummy symbol to allow compilation. Not used in sleep path */
1384 -#ifdef CONFIG_ACPI_SLEEP
1385 - .align PAGE_SIZE
1386 -ENTRY(wakeup_level4_pgt)
1387 - .fill 512,8,0
1388 -#endif
1389 -
1390 - .data
1391
1392 .align 16
1393 .globl cpu_gdt_descr
1394 @@ -136,13 +117,13 @@ gdt:
1395
1396 ENTRY(cpu_gdt_table)
1397 .quad 0x0000000000000000 /* NULL descriptor */
1398 + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
1399 + .quad 0x00af9b000000ffff /* __KERNEL_CS */
1400 + .quad 0x00cf93000000ffff /* __KERNEL_DS */
1401 + .quad 0x00cffb000000ffff /* __USER32_CS */
1402 + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
1403 + .quad 0x00affb000000ffff /* __USER_CS */
1404 .quad 0x0 /* unused */
1405 - .quad 0x00af9a000000ffff /* __KERNEL_CS */
1406 - .quad 0x00cf92000000ffff /* __KERNEL_DS */
1407 - .quad 0x00cffa000000ffff /* __USER32_CS */
1408 - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
1409 - .quad 0x00affa000000ffff /* __USER_CS */
1410 - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
1411 .quad 0,0 /* TSS */
1412 .quad 0,0 /* LDT */
1413 .quad 0,0,0 /* three TLS descriptors */
1414 @@ -165,14 +146,11 @@ ENTRY(empty_zero_page)
1415 * __xen_guest information
1416 */
1417 .macro utoh value
1418 - .if (\value) < 0 || (\value) >= 0x10
1419 - utoh (((\value)>>4)&0x0fffffffffffffff)
1420 - .endif
1421 - .if ((\value) & 0xf) < 10
1422 - .byte '0' + ((\value) & 0xf)
1423 - .else
1424 - .byte 'A' + ((\value) & 0xf) - 10
1425 - .endif
1426 + i = 64
1427 + .rept 16
1428 + i = i - 4
1429 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
1430 + .endr
1431 .endm
1432
1433 .section __xen_guest
1434 --- a/arch/x86/kernel/io_apic_32-xen.c
1435 +++ b/arch/x86/kernel/io_apic_32-xen.c
1436 @@ -25,7 +25,6 @@
1437 #include <linux/init.h>
1438 #include <linux/delay.h>
1439 #include <linux/sched.h>
1440 -#include <linux/smp_lock.h>
1441 #include <linux/mc146818rtc.h>
1442 #include <linux/compiler.h>
1443 #include <linux/acpi.h>
1444 @@ -35,6 +34,7 @@
1445 #include <linux/msi.h>
1446 #include <linux/htirq.h>
1447 #include <linux/freezer.h>
1448 +#include <linux/kthread.h>
1449
1450 #include <asm/io.h>
1451 #include <asm/smp.h>
1452 @@ -710,8 +710,6 @@ static int balanced_irq(void *unused)
1453 unsigned long prev_balance_time = jiffies;
1454 long time_remaining = balanced_irq_interval;
1455
1456 - daemonize("kirqd");
1457 -
1458 /* push everything to CPU 0 to give us a starting point. */
1459 for (i = 0 ; i < NR_IRQS ; i++) {
1460 irq_desc[i].pending_mask = cpumask_of_cpu(0);
1461 @@ -771,10 +769,9 @@ static int __init balanced_irq_init(void
1462 }
1463
1464 printk(KERN_INFO "Starting balanced_irq\n");
1465 - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
1466 + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
1467 return 0;
1468 - else
1469 - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1470 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1471 failed:
1472 for_each_possible_cpu(i) {
1473 kfree(irq_cpu_data[i].irq_delta);
1474 @@ -1455,10 +1452,6 @@ static void __init setup_ExtINT_IRQ0_pin
1475 enable_8259A_irq(0);
1476 }
1477
1478 -static inline void UNEXPECTED_IO_APIC(void)
1479 -{
1480 -}
1481 -
1482 void __init print_IO_APIC(void)
1483 {
1484 int apic, i;
1485 @@ -1498,34 +1491,12 @@ void __init print_IO_APIC(void)
1486 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1487 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1488 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1489 - if (reg_00.bits.ID >= get_physical_broadcast())
1490 - UNEXPECTED_IO_APIC();
1491 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1492 - UNEXPECTED_IO_APIC();
1493
1494 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1495 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1496 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1497 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1498 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1499 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1500 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1501 - (reg_01.bits.entries != 0x2E) &&
1502 - (reg_01.bits.entries != 0x3F)
1503 - )
1504 - UNEXPECTED_IO_APIC();
1505
1506 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1507 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1508 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1509 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1510 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1511 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1512 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1513 - )
1514 - UNEXPECTED_IO_APIC();
1515 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1516 - UNEXPECTED_IO_APIC();
1517
1518 /*
1519 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1520 @@ -1535,8 +1506,6 @@ void __init print_IO_APIC(void)
1521 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1522 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1523 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1524 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1525 - UNEXPECTED_IO_APIC();
1526 }
1527
1528 /*
1529 @@ -1548,8 +1517,6 @@ void __init print_IO_APIC(void)
1530 reg_03.raw != reg_01.raw) {
1531 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1532 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1533 - if (reg_03.bits.__reserved_1)
1534 - UNEXPECTED_IO_APIC();
1535 }
1536
1537 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1538 @@ -2686,19 +2653,19 @@ int arch_setup_msi_irq(struct pci_dev *d
1539 if (irq < 0)
1540 return irq;
1541
1542 - set_irq_msi(irq, desc);
1543 ret = msi_compose_msg(dev, irq, &msg);
1544 if (ret < 0) {
1545 destroy_irq(irq);
1546 return ret;
1547 }
1548
1549 + set_irq_msi(irq, desc);
1550 write_msi_msg(irq, &msg);
1551
1552 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
1553 "edge");
1554
1555 - return irq;
1556 + return 0;
1557 }
1558
1559 void arch_teardown_msi_irq(unsigned int irq)
1560 --- a/arch/x86/kernel/io_apic_64-xen.c
1561 +++ b/arch/x86/kernel/io_apic_64-xen.c
1562 @@ -25,7 +25,6 @@
1563 #include <linux/init.h>
1564 #include <linux/delay.h>
1565 #include <linux/sched.h>
1566 -#include <linux/smp_lock.h>
1567 #include <linux/pci.h>
1568 #include <linux/mc146818rtc.h>
1569 #include <linux/acpi.h>
1570 @@ -904,10 +903,6 @@ static void __init setup_ExtINT_IRQ0_pin
1571 enable_8259A_irq(0);
1572 }
1573
1574 -void __init UNEXPECTED_IO_APIC(void)
1575 -{
1576 -}
1577 -
1578 void __apicdebuginit print_IO_APIC(void)
1579 {
1580 int apic, i;
1581 @@ -943,40 +938,16 @@ void __apicdebuginit print_IO_APIC(void)
1582 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1583 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1584 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1585 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1586 - UNEXPECTED_IO_APIC();
1587
1588 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1589 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1590 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1591 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1592 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1593 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1594 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1595 - (reg_01.bits.entries != 0x2E) &&
1596 - (reg_01.bits.entries != 0x3F) &&
1597 - (reg_01.bits.entries != 0x03)
1598 - )
1599 - UNEXPECTED_IO_APIC();
1600
1601 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1602 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1603 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1604 - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
1605 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1606 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1607 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1608 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1609 - )
1610 - UNEXPECTED_IO_APIC();
1611 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1612 - UNEXPECTED_IO_APIC();
1613
1614 if (reg_01.bits.version >= 0x10) {
1615 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1616 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1617 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1618 - UNEXPECTED_IO_APIC();
1619 }
1620
1621 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1622 @@ -1408,8 +1379,7 @@ static void irq_complete_move(unsigned i
1623
1624 vector = ~get_irq_regs()->orig_rax;
1625 me = smp_processor_id();
1626 - if ((vector == cfg->vector) &&
1627 - cpu_isset(smp_processor_id(), cfg->domain)) {
1628 + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1629 cpumask_t cleanup_mask;
1630
1631 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1632 @@ -1444,7 +1414,7 @@ static void ack_apic_level(unsigned int
1633
1634 /*
1635 * We must acknowledge the irq before we move it or the acknowledge will
1636 - * not propogate properly.
1637 + * not propagate properly.
1638 */
1639 ack_APIC_irq();
1640
1641 @@ -1527,6 +1497,7 @@ static void ack_lapic_irq (unsigned int
1642 static void end_lapic_irq (unsigned int i) { /* nothing */ }
1643
1644 static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1645 + .name = "local-APIC",
1646 .typename = "local-APIC-edge",
1647 .startup = NULL, /* startup_irq() not used for IRQ0 */
1648 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1649 @@ -1998,18 +1969,18 @@ int arch_setup_msi_irq(struct pci_dev *d
1650 if (irq < 0)
1651 return irq;
1652
1653 - set_irq_msi(irq, desc);
1654 ret = msi_compose_msg(dev, irq, &msg);
1655 if (ret < 0) {
1656 destroy_irq(irq);
1657 return ret;
1658 }
1659
1660 + set_irq_msi(irq, desc);
1661 write_msi_msg(irq, &msg);
1662
1663 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
1664
1665 - return irq;
1666 + return 0;
1667 }
1668
1669 void arch_teardown_msi_irq(unsigned int irq)
1670 --- a/arch/x86/kernel/ioport_32-xen.c
1671 +++ b/arch/x86/kernel/ioport_32-xen.c
1672 @@ -12,10 +12,10 @@
1673 #include <linux/types.h>
1674 #include <linux/ioport.h>
1675 #include <linux/smp.h>
1676 -#include <linux/smp_lock.h>
1677 #include <linux/stddef.h>
1678 #include <linux/slab.h>
1679 #include <linux/thread_info.h>
1680 +#include <linux/syscalls.h>
1681 #include <xen/interface/physdev.h>
1682
1683 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1684 --- a/arch/x86/kernel/ioport_64-xen.c
1685 +++ b/arch/x86/kernel/ioport_64-xen.c
1686 @@ -13,10 +13,10 @@
1687 #include <linux/ioport.h>
1688 #include <linux/mm.h>
1689 #include <linux/smp.h>
1690 -#include <linux/smp_lock.h>
1691 #include <linux/stddef.h>
1692 #include <linux/slab.h>
1693 #include <linux/thread_info.h>
1694 +#include <linux/syscalls.h>
1695 #include <xen/interface/physdev.h>
1696
1697 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1698 --- a/arch/x86/kernel/irq_32-xen.c
1699 +++ b/arch/x86/kernel/irq_32-xen.c
1700 @@ -24,6 +24,9 @@
1701 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
1702 EXPORT_PER_CPU_SYMBOL(irq_stat);
1703
1704 +DEFINE_PER_CPU(struct pt_regs *, irq_regs);
1705 +EXPORT_PER_CPU_SYMBOL(irq_regs);
1706 +
1707 /*
1708 * 'what should we do if we get a hw irq event on an illegal vector'.
1709 * each architecture has to answer this themselves.
1710 --- a/arch/x86/kernel/irq_64-xen.c
1711 +++ b/arch/x86/kernel/irq_64-xen.c
1712 @@ -32,7 +32,7 @@ atomic_t irq_err_count;
1713 */
1714 static inline void stack_overflow_check(struct pt_regs *regs)
1715 {
1716 - u64 curbase = (u64) current->thread_info;
1717 + u64 curbase = (u64)task_stack_page(current);
1718 static unsigned long warned = -60*HZ;
1719
1720 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
1721 @@ -145,17 +145,43 @@ void fixup_irqs(cpumask_t map)
1722
1723 for (irq = 0; irq < NR_IRQS; irq++) {
1724 cpumask_t mask;
1725 + int break_affinity = 0;
1726 + int set_affinity = 1;
1727 +
1728 if (irq == 2)
1729 continue;
1730
1731 + /* interrupts are disabled at this point */
1732 + spin_lock(&irq_desc[irq].lock);
1733 +
1734 + if (!irq_has_action(irq) ||
1735 + cpus_equal(irq_desc[irq].affinity, map)) {
1736 + spin_unlock(&irq_desc[irq].lock);
1737 + continue;
1738 + }
1739 +
1740 cpus_and(mask, irq_desc[irq].affinity, map);
1741 - if (any_online_cpu(mask) == NR_CPUS) {
1742 - /*printk("Breaking affinity for irq %i\n", irq);*/
1743 + if (cpus_empty(mask)) {
1744 + break_affinity = 1;
1745 mask = map;
1746 }
1747 +
1748 + if (irq_desc[irq].chip->mask)
1749 + irq_desc[irq].chip->mask(irq);
1750 +
1751 if (irq_desc[irq].chip->set_affinity)
1752 irq_desc[irq].chip->set_affinity(irq, mask);
1753 - else if (irq_desc[irq].action && !(warned++))
1754 + else if (!(warned++))
1755 + set_affinity = 0;
1756 +
1757 + if (irq_desc[irq].chip->unmask)
1758 + irq_desc[irq].chip->unmask(irq);
1759 +
1760 + spin_unlock(&irq_desc[irq].lock);
1761 +
1762 + if (break_affinity && set_affinity)
1763 + /*printk("Broke affinity for irq %i\n", irq)*/;
1764 + else if (!set_affinity)
1765 printk("Cannot set affinity for irq %i\n", irq);
1766 }
1767
1768 --- a/arch/x86/kernel/ldt_32-xen.c
1769 +++ b/arch/x86/kernel/ldt_32-xen.c
1770 @@ -10,7 +10,6 @@
1771 #include <linux/string.h>
1772 #include <linux/mm.h>
1773 #include <linux/smp.h>
1774 -#include <linux/smp_lock.h>
1775 #include <linux/vmalloc.h>
1776 #include <linux/slab.h>
1777
1778 --- a/arch/x86/kernel/ldt_64-xen.c
1779 +++ b/arch/x86/kernel/ldt_64-xen.c
1780 @@ -13,7 +13,6 @@
1781 #include <linux/string.h>
1782 #include <linux/mm.h>
1783 #include <linux/smp.h>
1784 -#include <linux/smp_lock.h>
1785 #include <linux/vmalloc.h>
1786 #include <linux/slab.h>
1787
1788 --- a/arch/x86/kernel/Makefile
1789 +++ b/arch/x86/kernel/Makefile
1790 @@ -127,4 +127,4 @@ endif
1791 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
1792 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
1793 disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
1794 -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
1795 +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
1796 --- a/arch/x86/kernel/microcode-xen.c
1797 +++ b/arch/x86/kernel/microcode-xen.c
1798 @@ -135,7 +135,7 @@ static int __init microcode_dev_init (vo
1799 return 0;
1800 }
1801
1802 -static void __exit microcode_dev_exit (void)
1803 +static void microcode_dev_exit (void)
1804 {
1805 misc_deregister(&microcode_dev);
1806 }
1807 --- a/arch/x86/kernel/mpparse_32-xen.c
1808 +++ b/arch/x86/kernel/mpparse_32-xen.c
1809 @@ -18,7 +18,6 @@
1810 #include <linux/acpi.h>
1811 #include <linux/delay.h>
1812 #include <linux/bootmem.h>
1813 -#include <linux/smp_lock.h>
1814 #include <linux/kernel_stat.h>
1815 #include <linux/mc146818rtc.h>
1816 #include <linux/bitops.h>
1817 @@ -484,7 +483,7 @@ static int __init smp_read_mpc(struct mp
1818 }
1819 ++mpc_record;
1820 }
1821 - clustered_apic_check();
1822 + setup_apic_routing();
1823 if (!num_processors)
1824 printk(KERN_ERR "SMP mptable: no processors registered!\n");
1825 return num_processors;
1826 --- a/arch/x86/kernel/mpparse_64-xen.c
1827 +++ b/arch/x86/kernel/mpparse_64-xen.c
1828 @@ -17,7 +17,6 @@
1829 #include <linux/init.h>
1830 #include <linux/delay.h>
1831 #include <linux/bootmem.h>
1832 -#include <linux/smp_lock.h>
1833 #include <linux/kernel_stat.h>
1834 #include <linux/mc146818rtc.h>
1835 #include <linux/acpi.h>
1836 @@ -307,7 +306,7 @@ static int __init smp_read_mpc(struct mp
1837 }
1838 }
1839 }
1840 - clustered_apic_check();
1841 + setup_apic_routing();
1842 if (!num_processors)
1843 printk(KERN_ERR "MPTABLE: no processors registered!\n");
1844 return num_processors;
1845 --- a/arch/x86/kernel/pci-dma-xen.c
1846 +++ b/arch/x86/kernel/pci-dma-xen.c
1847 @@ -13,6 +13,7 @@
1848 #include <linux/pci.h>
1849 #include <linux/module.h>
1850 #include <linux/version.h>
1851 +#include <linux/pci.h>
1852 #include <asm/io.h>
1853 #include <xen/balloon.h>
1854 #include <xen/gnttab.h>
1855 @@ -278,7 +279,7 @@ int dma_declare_coherent_memory(struct d
1856 {
1857 void __iomem *mem_base = NULL;
1858 int pages = size >> PAGE_SHIFT;
1859 - int bitmap_size = (pages + 31)/32;
1860 + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
1861
1862 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
1863 goto out;
1864 @@ -351,6 +352,32 @@ void *dma_mark_declared_memory_occupied(
1865 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
1866 #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
1867
1868 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
1869 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
1870 +
1871 +int forbid_dac;
1872 +EXPORT_SYMBOL(forbid_dac);
1873 +
1874 +static __devinit void via_no_dac(struct pci_dev *dev)
1875 +{
1876 + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
1877 + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
1878 + forbid_dac = 1;
1879 + }
1880 +}
1881 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
1882 +
1883 +static int check_iommu(char *s)
1884 +{
1885 + if (!strcmp(s, "usedac")) {
1886 + forbid_dac = -1;
1887 + return 1;
1888 + }
1889 + return 0;
1890 +}
1891 +__setup("iommu=", check_iommu);
1892 +#endif
1893 +
1894 dma_addr_t
1895 dma_map_single(struct device *dev, void *ptr, size_t size,
1896 enum dma_data_direction direction)
1897 --- a/arch/x86/kernel/process_32-xen.c
1898 +++ b/arch/x86/kernel/process_32-xen.c
1899 @@ -21,7 +21,6 @@
1900 #include <linux/mm.h>
1901 #include <linux/elfcore.h>
1902 #include <linux/smp.h>
1903 -#include <linux/smp_lock.h>
1904 #include <linux/stddef.h>
1905 #include <linux/slab.h>
1906 #include <linux/vmalloc.h>
1907 @@ -39,6 +38,7 @@
1908 #include <linux/random.h>
1909 #include <linux/personality.h>
1910 #include <linux/tick.h>
1911 +#include <linux/percpu.h>
1912
1913 #include <asm/uaccess.h>
1914 #include <asm/pgtable.h>
1915 @@ -61,7 +61,6 @@
1916
1917 #include <asm/tlbflush.h>
1918 #include <asm/cpu.h>
1919 -#include <asm/pda.h>
1920
1921 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
1922
1923 @@ -70,6 +69,12 @@ static int hlt_counter;
1924 unsigned long boot_option_idle_override = 0;
1925 EXPORT_SYMBOL(boot_option_idle_override);
1926
1927 +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1928 +EXPORT_PER_CPU_SYMBOL(current_task);
1929 +
1930 +DEFINE_PER_CPU(int, cpu_number);
1931 +EXPORT_PER_CPU_SYMBOL(cpu_number);
1932 +
1933 /*
1934 * Return saved PC of a blocked thread.
1935 */
1936 @@ -168,6 +173,7 @@ void cpu_idle(void)
1937 if (__get_cpu_var(cpu_idle_state))
1938 __get_cpu_var(cpu_idle_state) = 0;
1939
1940 + check_pgt_cache();
1941 rmb();
1942 idle = xen_idle; /* no alternatives */
1943
1944 @@ -218,18 +224,19 @@ void __devinit select_idle_routine(const
1945 {
1946 }
1947
1948 -static int __init idle_setup (char *str)
1949 +static int __init idle_setup(char *str)
1950 {
1951 - if (!strncmp(str, "poll", 4)) {
1952 + if (!strcmp(str, "poll")) {
1953 printk("using polling idle threads.\n");
1954 pm_idle = poll_idle;
1955 }
1956 + else
1957 + return -1;
1958
1959 boot_option_idle_override = 1;
1960 - return 1;
1961 + return 0;
1962 }
1963 -
1964 -__setup("idle=", idle_setup);
1965 +early_param("idle", idle_setup);
1966
1967 void show_regs(struct pt_regs * regs)
1968 {
1969 @@ -282,7 +289,7 @@ int kernel_thread(int (*fn)(void *), voi
1970
1971 regs.xds = __USER_DS;
1972 regs.xes = __USER_DS;
1973 - regs.xfs = __KERNEL_PDA;
1974 + regs.xfs = __KERNEL_PERCPU;
1975 regs.orig_eax = -1;
1976 regs.eip = (unsigned long) kernel_thread_helper;
1977 regs.xcs = __KERNEL_CS | get_kernel_rpl();
1978 @@ -562,7 +569,7 @@ struct task_struct fastcall * __switch_t
1979 * multicall to indicate FPU task switch, rather than
1980 * synchronously trapping to Xen.
1981 */
1982 - if (prev_p->thread_info->status & TS_USEDFPU) {
1983 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
1984 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
1985 mcl->op = __HYPERVISOR_fpu_taskswitch;
1986 mcl->args[0] = 1;
1987 @@ -669,7 +676,7 @@ struct task_struct fastcall * __switch_t
1988 if (prev->gs | next->gs)
1989 loadsegment(gs, next->gs);
1990
1991 - write_pda(pcurrent, next_p);
1992 + x86_write_percpu(current_task, next_p);
1993
1994 return prev_p;
1995 }
1996 --- a/arch/x86/kernel/process_64-xen.c
1997 +++ b/arch/x86/kernel/process_64-xen.c
1998 @@ -39,6 +39,7 @@
1999 #include <linux/random.h>
2000 #include <linux/notifier.h>
2001 #include <linux/kprobes.h>
2002 +#include <linux/kdebug.h>
2003
2004 #include <asm/uaccess.h>
2005 #include <asm/pgtable.h>
2006 @@ -49,7 +50,6 @@
2007 #include <asm/mmu_context.h>
2008 #include <asm/pda.h>
2009 #include <asm/prctl.h>
2010 -#include <asm/kdebug.h>
2011 #include <xen/interface/platform.h>
2012 #include <xen/interface/physdev.h>
2013 #include <xen/interface/vcpu.h>
2014 @@ -232,16 +232,18 @@ void __cpuinit select_idle_routine(const
2015
2016 static int __init idle_setup (char *str)
2017 {
2018 - if (!strncmp(str, "poll", 4)) {
2019 + if (!strcmp(str, "poll")) {
2020 printk("using polling idle threads.\n");
2021 pm_idle = poll_idle;
2022 - }
2023 + } else if (!strcmp(str, "mwait"))
2024 + force_mwait = 1;
2025 + else
2026 + return -1;
2027
2028 boot_option_idle_override = 1;
2029 - return 1;
2030 + return 0;
2031 }
2032 -
2033 -__setup("idle=", idle_setup);
2034 +early_param("idle", idle_setup);
2035
2036 /* Prints also some state that isn't saved in the pt_regs */
2037 void __show_regs(struct pt_regs * regs)
2038 @@ -546,7 +548,7 @@ __switch_to(struct task_struct *prev_p,
2039 * The AMD workaround requires it to be after DS reload, or
2040 * after DS has been cleared, which we do in __prepare_arch_switch.
2041 */
2042 - if (prev_p->thread_info->status & TS_USEDFPU) {
2043 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
2044 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
2045 mcl->op = __HYPERVISOR_fpu_taskswitch;
2046 mcl->args[0] = 1;
2047 --- a/arch/x86/kernel/quirks-xen.c
2048 +++ b/arch/x86/kernel/quirks-xen.c
2049 @@ -3,12 +3,10 @@
2050 */
2051 #include <linux/pci.h>
2052 #include <linux/irq.h>
2053 -#include <asm/pci-direct.h>
2054 -#include <asm/genapic.h>
2055 -#include <asm/cpu.h>
2056
2057 #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
2058 -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
2059 +
2060 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
2061 {
2062 u8 config, rev;
2063 u32 word;
2064 @@ -16,7 +14,7 @@ static void __devinit verify_quirk_intel
2065 /* BIOS may enable hardware IRQ balancing for
2066 * E7520/E7320/E7525(revision ID 0x9 and below)
2067 * based platforms.
2068 - * For those platforms, make sure that the genapic is set to 'flat'
2069 + * Disable SW irqbalance/affinity on those platforms.
2070 */
2071 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
2072 if (rev > 0x9)
2073 @@ -30,59 +28,20 @@ static void __devinit verify_quirk_intel
2074 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
2075
2076 if (!(word & (1 << 13))) {
2077 -#ifndef CONFIG_XEN
2078 -#ifdef CONFIG_X86_64
2079 - if (genapic != &apic_flat)
2080 - panic("APIC mode must be flat on this system\n");
2081 -#elif defined(CONFIG_X86_GENERICARCH)
2082 - if (genapic != &apic_default)
2083 - panic("APIC mode must be default(flat) on this system. Use apic=default\n");
2084 -#endif
2085 -#endif
2086 - }
2087 -
2088 - /* put back the original value for config space*/
2089 - if (!(config & 0x2))
2090 - pci_write_config_byte(dev, 0xf4, config);
2091 -}
2092 -
2093 -void __init quirk_intel_irqbalance(void)
2094 -{
2095 - u8 config, rev;
2096 - u32 word;
2097 -
2098 - /* BIOS may enable hardware IRQ balancing for
2099 - * E7520/E7320/E7525(revision ID 0x9 and below)
2100 - * based platforms.
2101 - * Disable SW irqbalance/affinity on those platforms.
2102 - */
2103 - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
2104 - if (rev > 0x9)
2105 - return;
2106 -
2107 - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2108 -
2109 - /* enable access to config space */
2110 - config = read_pci_config_byte(0, 0, 0, 0xf4);
2111 - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
2112 -
2113 - /* read xTPR register */
2114 - word = read_pci_config_16(0, 0, 0x40, 0x4c);
2115 -
2116 - if (!(word & (1 << 13))) {
2117 struct xen_platform_op op;
2118 - printk(KERN_INFO "Disabling irq balancing and affinity\n");
2119 +
2120 + printk(KERN_INFO "Intel E7520/7320/7525 detected. "
2121 + "Disabling irq balancing and affinity\n");
2122 op.cmd = XENPF_platform_quirk;
2123 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
2124 WARN_ON(HYPERVISOR_platform_op(&op));
2125 }
2126
2127 - /* put back the original value for config space */
2128 + /* put back the original value for config space*/
2129 if (!(config & 0x2))
2130 - write_pci_config_byte(0, 0, 0, 0xf4, config);
2131 + pci_write_config_byte(dev, 0xf4, config);
2132 }
2133 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
2134 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
2135 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
2136 -
2137 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
2138 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
2139 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
2140 #endif
2141 --- a/arch/x86/kernel/setup_64-xen.c
2142 +++ b/arch/x86/kernel/setup_64-xen.c
2143 @@ -120,6 +120,8 @@ int bootloader_type;
2144
2145 unsigned long saved_video_mode;
2146
2147 +int force_mwait __cpuinitdata;
2148 +
2149 /*
2150 * Early DMI memory
2151 */
2152 @@ -253,10 +255,10 @@ static void discover_ebda(void)
2153 * there is a real-mode segmented pointer pointing to the
2154 * 4K EBDA area at 0x40E
2155 */
2156 - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
2157 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
2158 ebda_addr <<= 4;
2159
2160 - ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
2161 + ebda_size = *(unsigned short *)__va(ebda_addr);
2162
2163 /* Round EBDA up to pages */
2164 if (ebda_size == 0)
2165 @@ -410,15 +412,8 @@ void __init setup_arch(char **cmdline_p)
2166 #endif
2167
2168 #ifdef CONFIG_SMP
2169 - /*
2170 - * But first pinch a few for the stack/trampoline stuff
2171 - * FIXME: Don't need the extra page at 4K, but need to fix
2172 - * trampoline before removing it. (see the GDT stuff)
2173 - */
2174 - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
2175 -
2176 /* Reserve SMP trampoline */
2177 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
2178 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
2179 #endif
2180 #endif
2181
2182 @@ -570,8 +565,6 @@ void __init setup_arch(char **cmdline_p)
2183 early_quirks();
2184 #endif
2185
2186 - zap_low_mappings(0);
2187 -
2188 /*
2189 * set this early, so we dont allocate cpu0
2190 * if MADT list doesnt list BSP first
2191 @@ -864,6 +857,10 @@ static void __cpuinit init_amd(struct cp
2192
2193 /* RDTSC can be speculated around */
2194 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
2195 +
2196 + /* Family 10 doesn't support C states in MWAIT so don't use it */
2197 + if (c->x86 == 0x10 && !force_mwait)
2198 + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
2199 }
2200
2201 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2202 @@ -1146,9 +1143,7 @@ void __cpuinit identify_cpu(struct cpuin
2203 #ifdef CONFIG_X86_MCE
2204 mcheck_init(c);
2205 #endif
2206 - if (c == &boot_cpu_data)
2207 - mtrr_bp_init();
2208 - else
2209 + if (c != &boot_cpu_data)
2210 mtrr_ap_init();
2211 #ifdef CONFIG_NUMA
2212 numa_add_cpu(smp_processor_id());
2213 @@ -1239,9 +1234,8 @@ static int show_cpuinfo(struct seq_file
2214 "stc",
2215 "100mhzsteps",
2216 "hwpstate",
2217 - NULL, /* tsc invariant mapped to constant_tsc */
2218 - NULL,
2219 - /* nothing */ /* constant_tsc - moved to flags */
2220 + "", /* tsc invariant mapped to constant_tsc */
2221 + /* nothing */
2222 };
2223
2224
2225 --- a/arch/x86/kernel/setup64-xen.c
2226 +++ b/arch/x86/kernel/setup64-xen.c
2227 @@ -113,9 +113,9 @@ void __init setup_per_cpu_areas(void)
2228 if (!NODE_DATA(cpu_to_node(i))) {
2229 printk("cpu with no node %d, num_online_nodes %d\n",
2230 i, num_online_nodes());
2231 - ptr = alloc_bootmem(size);
2232 + ptr = alloc_bootmem_pages(size);
2233 } else {
2234 - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
2235 + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
2236 }
2237 if (!ptr)
2238 panic("Cannot allocate cpu data for CPU %d\n", i);
2239 @@ -208,6 +208,8 @@ char boot_exception_stacks[(N_EXCEPTION_
2240 __attribute__((section(".bss.page_aligned")));
2241 #endif
2242
2243 +extern asmlinkage void ignore_sysret(void);
2244 +
2245 /* May not be marked __init: used by software suspend */
2246 void syscall_init(void)
2247 {
2248 @@ -219,12 +221,22 @@ void syscall_init(void)
2249 */
2250 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
2251 wrmsrl(MSR_LSTAR, system_call);
2252 + wrmsrl(MSR_CSTAR, ignore_sysret);
2253
2254 /* Flags to clear on syscall */
2255 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
2256 #endif
2257 #ifdef CONFIG_IA32_EMULATION
2258 syscall32_cpu_init ();
2259 +#else
2260 + {
2261 + static const struct callback_register cstar = {
2262 + .type = CALLBACKTYPE_syscall32,
2263 + .address = (unsigned long)ignore_sysret
2264 + };
2265 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
2266 + printk(KERN_WARNING "Unable to register CSTAR callback\n");
2267 + }
2268 #endif
2269 }
2270
2271 @@ -262,7 +274,6 @@ void __cpuinit cpu_init (void)
2272 /* CPU 0 is initialised in head64.c */
2273 if (cpu != 0) {
2274 pda_init(cpu);
2275 - zap_low_mappings(cpu);
2276 }
2277 #ifndef CONFIG_X86_NO_TSS
2278 else
2279 --- a/arch/x86/kernel/smp_32-xen.c
2280 +++ b/arch/x86/kernel/smp_32-xen.c
2281 @@ -13,7 +13,6 @@
2282 #include <linux/mm.h>
2283 #include <linux/delay.h>
2284 #include <linux/spinlock.h>
2285 -#include <linux/smp_lock.h>
2286 #include <linux/kernel_stat.h>
2287 #include <linux/mc146818rtc.h>
2288 #include <linux/cache.h>
2289 @@ -216,7 +215,6 @@ static cpumask_t flush_cpumask;
2290 static struct mm_struct * flush_mm;
2291 static unsigned long flush_va;
2292 static DEFINE_SPINLOCK(tlbstate_lock);
2293 -#define FLUSH_ALL 0xffffffff
2294
2295 /*
2296 * We cannot call mmdrop() because we are in interrupt context,
2297 @@ -298,7 +296,7 @@ irqreturn_t smp_invalidate_interrupt(int
2298
2299 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
2300 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
2301 - if (flush_va == FLUSH_ALL)
2302 + if (flush_va == TLB_FLUSH_ALL)
2303 local_flush_tlb();
2304 else
2305 __flush_tlb_one(flush_va);
2306 @@ -314,9 +312,11 @@ out:
2307 return IRQ_HANDLED;
2308 }
2309
2310 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
2311 - unsigned long va)
2312 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
2313 + unsigned long va)
2314 {
2315 + cpumask_t cpumask = *cpumaskp;
2316 +
2317 /*
2318 * A couple of (to be removed) sanity checks:
2319 *
2320 @@ -327,10 +327,12 @@ static void flush_tlb_others(cpumask_t c
2321 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
2322 BUG_ON(!mm);
2323
2324 +#ifdef CONFIG_HOTPLUG_CPU
2325 /* If a CPU which we ran on has gone down, OK. */
2326 cpus_and(cpumask, cpumask, cpu_online_map);
2327 - if (cpus_empty(cpumask))
2328 + if (unlikely(cpus_empty(cpumask)))
2329 return;
2330 +#endif
2331
2332 /*
2333 * i'm not happy about this global shared spinlock in the
2334 @@ -341,17 +343,7 @@ static void flush_tlb_others(cpumask_t c
2335
2336 flush_mm = mm;
2337 flush_va = va;
2338 -#if NR_CPUS <= BITS_PER_LONG
2339 - atomic_set_mask(cpumask, &flush_cpumask);
2340 -#else
2341 - {
2342 - int k;
2343 - unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
2344 - unsigned long *cpu_mask = (unsigned long *)&cpumask;
2345 - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
2346 - atomic_set_mask(cpu_mask[k], &flush_mask[k]);
2347 - }
2348 -#endif
2349 + cpus_or(flush_cpumask, cpumask, flush_cpumask);
2350 /*
2351 * We have to send the IPI only to
2352 * CPUs affected.
2353 @@ -378,7 +370,7 @@ void flush_tlb_current_task(void)
2354
2355 local_flush_tlb();
2356 if (!cpus_empty(cpu_mask))
2357 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2358 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2359 preempt_enable();
2360 }
2361
2362 @@ -397,7 +389,7 @@ void flush_tlb_mm (struct mm_struct * mm
2363 leave_mm(smp_processor_id());
2364 }
2365 if (!cpus_empty(cpu_mask))
2366 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2367 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2368
2369 preempt_enable();
2370 }
2371 @@ -446,7 +438,7 @@ void flush_tlb_all(void)
2372 * it goes straight through and wastes no time serializing
2373 * anything. Worst case is that we lose a reschedule ...
2374 */
2375 -void smp_send_reschedule(int cpu)
2376 +void xen_smp_send_reschedule(int cpu)
2377 {
2378 WARN_ON(cpu_is_offline(cpu));
2379 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
2380 @@ -478,36 +470,79 @@ void unlock_ipi_call_lock(void)
2381
2382 static struct call_data_struct *call_data;
2383
2384 +static void __smp_call_function(void (*func) (void *info), void *info,
2385 + int nonatomic, int wait)
2386 +{
2387 + struct call_data_struct data;
2388 + int cpus = num_online_cpus() - 1;
2389 +
2390 + if (!cpus)
2391 + return;
2392 +
2393 + data.func = func;
2394 + data.info = info;
2395 + atomic_set(&data.started, 0);
2396 + data.wait = wait;
2397 + if (wait)
2398 + atomic_set(&data.finished, 0);
2399 +
2400 + call_data = &data;
2401 + mb();
2402 +
2403 + /* Send a message to all other CPUs and wait for them to respond */
2404 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2405 +
2406 + /* Wait for response */
2407 + while (atomic_read(&data.started) != cpus)
2408 + cpu_relax();
2409 +
2410 + if (wait)
2411 + while (atomic_read(&data.finished) != cpus)
2412 + cpu_relax();
2413 +}
2414 +
2415 +
2416 /**
2417 - * smp_call_function(): Run a function on all other CPUs.
2418 + * smp_call_function_mask(): Run a function on a set of other CPUs.
2419 + * @mask: The set of cpus to run on. Must not include the current cpu.
2420 * @func: The function to run. This must be fast and non-blocking.
2421 * @info: An arbitrary pointer to pass to the function.
2422 - * @nonatomic: currently unused.
2423 * @wait: If true, wait (atomically) until function has completed on other CPUs.
2424 *
2425 - * Returns 0 on success, else a negative status code. Does not return until
2426 - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
2427 + * Returns 0 on success, else a negative status code.
2428 + *
2429 + * If @wait is true, then returns once @func has returned; otherwise
2430 + * it returns just before the target cpu calls @func.
2431 *
2432 * You must not call this function with disabled interrupts or from a
2433 * hardware interrupt handler or from a bottom half handler.
2434 */
2435 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
2436 - int wait)
2437 +int
2438 +xen_smp_call_function_mask(cpumask_t mask,
2439 + void (*func)(void *), void *info,
2440 + int wait)
2441 {
2442 struct call_data_struct data;
2443 + cpumask_t allbutself;
2444 int cpus;
2445
2446 + /* Can deadlock when called with interrupts disabled */
2447 + WARN_ON(irqs_disabled());
2448 +
2449 /* Holding any lock stops cpus from going down. */
2450 spin_lock(&call_lock);
2451 - cpus = num_online_cpus() - 1;
2452 +
2453 + allbutself = cpu_online_map;
2454 + cpu_clear(smp_processor_id(), allbutself);
2455 +
2456 + cpus_and(mask, mask, allbutself);
2457 + cpus = cpus_weight(mask);
2458 +
2459 if (!cpus) {
2460 spin_unlock(&call_lock);
2461 return 0;
2462 }
2463
2464 - /* Can deadlock when called with interrupts disabled */
2465 - WARN_ON(irqs_disabled());
2466 -
2467 data.func = func;
2468 data.info = info;
2469 atomic_set(&data.started, 0);
2470 @@ -517,9 +552,12 @@ int smp_call_function (void (*func) (voi
2471
2472 call_data = &data;
2473 mb();
2474 -
2475 - /* Send a message to all other CPUs and wait for them to respond */
2476 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2477 +
2478 + /* Send a message to other CPUs */
2479 + if (cpus_equal(mask, allbutself))
2480 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2481 + else
2482 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
2483
2484 /* Wait for response */
2485 while (atomic_read(&data.started) != cpus)
2486 @@ -532,15 +570,14 @@ int smp_call_function (void (*func) (voi
2487
2488 return 0;
2489 }
2490 -EXPORT_SYMBOL(smp_call_function);
2491
2492 static void stop_this_cpu (void * dummy)
2493 {
2494 + local_irq_disable();
2495 /*
2496 * Remove this CPU:
2497 */
2498 cpu_clear(smp_processor_id(), cpu_online_map);
2499 - local_irq_disable();
2500 disable_all_local_evtchn();
2501 if (cpu_data[smp_processor_id()].hlt_works_ok)
2502 for(;;) halt();
2503 @@ -551,13 +588,18 @@ static void stop_this_cpu (void * dummy)
2504 * this function calls the 'stop' function on all other CPUs in the system.
2505 */
2506
2507 -void smp_send_stop(void)
2508 +void xen_smp_send_stop(void)
2509 {
2510 - smp_call_function(stop_this_cpu, NULL, 1, 0);
2511 + /* Don't deadlock on the call lock in panic */
2512 + int nolock = !spin_trylock(&call_lock);
2513 + unsigned long flags;
2514
2515 - local_irq_disable();
2516 + local_irq_save(flags);
2517 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2518 + if (!nolock)
2519 + spin_unlock(&call_lock);
2520 disable_all_local_evtchn();
2521 - local_irq_enable();
2522 + local_irq_restore(flags);
2523 }
2524
2525 /*
2526 @@ -598,74 +640,3 @@ irqreturn_t smp_call_function_interrupt(
2527
2528 return IRQ_HANDLED;
2529 }
2530 -
2531 -/*
2532 - * this function sends a 'generic call function' IPI to one other CPU
2533 - * in the system.
2534 - *
2535 - * cpu is a standard Linux logical CPU number.
2536 - */
2537 -static void
2538 -__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2539 - int nonatomic, int wait)
2540 -{
2541 - struct call_data_struct data;
2542 - int cpus = 1;
2543 -
2544 - data.func = func;
2545 - data.info = info;
2546 - atomic_set(&data.started, 0);
2547 - data.wait = wait;
2548 - if (wait)
2549 - atomic_set(&data.finished, 0);
2550 -
2551 - call_data = &data;
2552 - wmb();
2553 - /* Send a message to all other CPUs and wait for them to respond */
2554 - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
2555 -
2556 - /* Wait for response */
2557 - while (atomic_read(&data.started) != cpus)
2558 - cpu_relax();
2559 -
2560 - if (!wait)
2561 - return;
2562 -
2563 - while (atomic_read(&data.finished) != cpus)
2564 - cpu_relax();
2565 -}
2566 -
2567 -/*
2568 - * smp_call_function_single - Run a function on another CPU
2569 - * @func: The function to run. This must be fast and non-blocking.
2570 - * @info: An arbitrary pointer to pass to the function.
2571 - * @nonatomic: Currently unused.
2572 - * @wait: If true, wait until function has completed on other CPUs.
2573 - *
2574 - * Retrurns 0 on success, else a negative status code.
2575 - *
2576 - * Does not return until the remote CPU is nearly ready to execute <func>
2577 - * or is or has executed.
2578 - */
2579 -
2580 -int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2581 - int nonatomic, int wait)
2582 -{
2583 - /* prevent preemption and reschedule on another processor */
2584 - int me = get_cpu();
2585 - if (cpu == me) {
2586 - WARN_ON(1);
2587 - put_cpu();
2588 - return -EBUSY;
2589 - }
2590 -
2591 - /* Can deadlock when called with interrupts disabled */
2592 - WARN_ON(irqs_disabled());
2593 -
2594 - spin_lock_bh(&call_lock);
2595 - __smp_call_function_single(cpu, func, info, nonatomic, wait);
2596 - spin_unlock_bh(&call_lock);
2597 - put_cpu();
2598 - return 0;
2599 -}
2600 -EXPORT_SYMBOL(smp_call_function_single);
2601 --- a/arch/x86/kernel/smp_64-xen.c
2602 +++ b/arch/x86/kernel/smp_64-xen.c
2603 @@ -14,7 +14,6 @@
2604 #include <linux/mm.h>
2605 #include <linux/delay.h>
2606 #include <linux/spinlock.h>
2607 -#include <linux/smp_lock.h>
2608 #include <linux/smp.h>
2609 #include <linux/kernel_stat.h>
2610 #include <linux/mc146818rtc.h>
2611 @@ -457,44 +456,36 @@ int smp_call_function (void (*func) (voi
2612 }
2613 EXPORT_SYMBOL(smp_call_function);
2614
2615 -void smp_stop_cpu(void)
2616 +static void stop_this_cpu(void *dummy)
2617 {
2618 - unsigned long flags;
2619 + local_irq_disable();
2620 /*
2621 * Remove this CPU:
2622 */
2623 cpu_clear(smp_processor_id(), cpu_online_map);
2624 - local_irq_save(flags);
2625 disable_all_local_evtchn();
2626 - local_irq_restore(flags);
2627 -}
2628 -
2629 -static void smp_really_stop_cpu(void *dummy)
2630 -{
2631 - smp_stop_cpu();
2632 for (;;)
2633 halt();
2634 }
2635
2636 void smp_send_stop(void)
2637 {
2638 - int nolock = 0;
2639 + int nolock;
2640 + unsigned long flags;
2641 +
2642 #ifndef CONFIG_XEN
2643 if (reboot_force)
2644 return;
2645 #endif
2646 +
2647 /* Don't deadlock on the call lock in panic */
2648 - if (!spin_trylock(&call_lock)) {
2649 - /* ignore locking because we have panicked anyways */
2650 - nolock = 1;
2651 - }
2652 - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
2653 + nolock = !spin_trylock(&call_lock);
2654 + local_irq_save(flags);
2655 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2656 if (!nolock)
2657 spin_unlock(&call_lock);
2658 -
2659 - local_irq_disable();
2660 disable_all_local_evtchn();
2661 - local_irq_enable();
2662 + local_irq_restore(flags);
2663 }
2664
2665 /*
2666 --- a/arch/x86/kernel/time_32-xen.c
2667 +++ b/arch/x86/kernel/time_32-xen.c
2668 @@ -80,7 +80,6 @@
2669 #include <asm/i8253.h>
2670 DEFINE_SPINLOCK(i8253_lock);
2671 EXPORT_SYMBOL(i8253_lock);
2672 -int pit_latch_buggy; /* extern */
2673 #else
2674 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
2675 #endif
2676 @@ -218,6 +217,26 @@ static inline u64 scale_delta(u64 delta,
2677 return product;
2678 }
2679
2680 +static inline u64 get64(volatile u64 *ptr)
2681 +{
2682 +#ifndef CONFIG_64BIT
2683 + return cmpxchg64(ptr, 0, 0);
2684 +#else
2685 + return *ptr;
2686 +#define cmpxchg64 cmpxchg
2687 +#endif
2688 +}
2689 +
2690 +static inline u64 get64_local(volatile u64 *ptr)
2691 +{
2692 +#ifndef CONFIG_64BIT
2693 + return cmpxchg64_local(ptr, 0, 0);
2694 +#else
2695 + return *ptr;
2696 +#define cmpxchg64_local cmpxchg_local
2697 +#endif
2698 +}
2699 +
2700 static void init_cpu_khz(void)
2701 {
2702 u64 __cpu_khz = 1000000ULL << 32;
2703 @@ -399,7 +418,7 @@ static int set_rtc_mmss(unsigned long no
2704 return retval;
2705 }
2706
2707 -unsigned long long sched_clock(void)
2708 +static unsigned long long local_clock(void)
2709 {
2710 unsigned int cpu = get_cpu();
2711 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
2712 @@ -420,6 +439,61 @@ unsigned long long sched_clock(void)
2713 return time;
2714 }
2715
2716 +/*
2717 + * Runstate accounting
2718 + */
2719 +static void get_runstate_snapshot(struct vcpu_runstate_info *res)
2720 +{
2721 + u64 state_time;
2722 + struct vcpu_runstate_info *state;
2723 +
2724 + BUG_ON(preemptible());
2725 +
2726 + state = &__get_cpu_var(runstate);
2727 +
2728 + do {
2729 + state_time = get64_local(&state->state_entry_time);
2730 + *res = *state;
2731 + } while (get64_local(&state->state_entry_time) != state_time);
2732 +
2733 + WARN_ON_ONCE(res->state != RUNSTATE_running);
2734 +}
2735 +
2736 +/*
2737 + * Xen sched_clock implementation. Returns the number of unstolen
2738 + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
2739 + * states.
2740 + */
2741 +unsigned long long sched_clock(void)
2742 +{
2743 + struct vcpu_runstate_info runstate;
2744 + cycle_t now;
2745 + u64 ret;
2746 + s64 offset;
2747 +
2748 + /*
2749 + * Ideally sched_clock should be called on a per-cpu basis
2750 + * anyway, so preempt should already be disabled, but that's
2751 + * not current practice at the moment.
2752 + */
2753 + preempt_disable();
2754 +
2755 + now = local_clock();
2756 +
2757 + get_runstate_snapshot(&runstate);
2758 +
2759 + offset = now - runstate.state_entry_time;
2760 + if (offset < 0)
2761 + offset = 0;
2762 +
2763 + ret = offset + runstate.time[RUNSTATE_running]
2764 + + runstate.time[RUNSTATE_blocked];
2765 +
2766 + preempt_enable();
2767 +
2768 + return ret;
2769 +}
2770 +
2771 unsigned long profile_pc(struct pt_regs *regs)
2772 {
2773 unsigned long pc = instruction_pointer(regs);
2774 @@ -467,10 +541,9 @@ EXPORT_SYMBOL(profile_pc);
2775 irqreturn_t timer_interrupt(int irq, void *dev_id)
2776 {
2777 s64 delta, delta_cpu, stolen, blocked;
2778 - u64 sched_time;
2779 unsigned int i, cpu = smp_processor_id();
2780 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
2781 - struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
2782 + struct vcpu_runstate_info runstate;
2783
2784 /*
2785 * Here we are in the timer irq handler. We just have irqs locally
2786 @@ -490,20 +563,7 @@ irqreturn_t timer_interrupt(int irq, voi
2787 delta -= processed_system_time;
2788 delta_cpu -= per_cpu(processed_system_time, cpu);
2789
2790 - /*
2791 - * Obtain a consistent snapshot of stolen/blocked cycles. We
2792 - * can use state_entry_time to detect if we get preempted here.
2793 - */
2794 - do {
2795 - sched_time = runstate->state_entry_time;
2796 - barrier();
2797 - stolen = runstate->time[RUNSTATE_runnable] +
2798 - runstate->time[RUNSTATE_offline] -
2799 - per_cpu(processed_stolen_time, cpu);
2800 - blocked = runstate->time[RUNSTATE_blocked] -
2801 - per_cpu(processed_blocked_time, cpu);
2802 - barrier();
2803 - } while (sched_time != runstate->state_entry_time);
2804 + get_runstate_snapshot(&runstate);
2805 } while (!time_values_up_to_date(cpu));
2806
2807 if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
2808 @@ -545,6 +605,9 @@ irqreturn_t timer_interrupt(int irq, voi
2809 * HACK: Passing NULL to account_steal_time()
2810 * ensures that the ticks are accounted as stolen.
2811 */
2812 + stolen = runstate.time[RUNSTATE_runnable]
2813 + + runstate.time[RUNSTATE_offline]
2814 + - per_cpu(processed_stolen_time, cpu);
2815 if ((stolen > 0) && (delta_cpu > 0)) {
2816 delta_cpu -= stolen;
2817 if (unlikely(delta_cpu < 0))
2818 @@ -560,6 +623,8 @@ irqreturn_t timer_interrupt(int irq, voi
2819 * HACK: Passing idle_task to account_steal_time()
2820 * ensures that the ticks are accounted as idle/wait.
2821 */
2822 + blocked = runstate.time[RUNSTATE_blocked]
2823 + - per_cpu(processed_blocked_time, cpu);
2824 if ((blocked > 0) && (delta_cpu > 0)) {
2825 delta_cpu -= blocked;
2826 if (unlikely(delta_cpu < 0))
2827 @@ -596,7 +661,7 @@ irqreturn_t timer_interrupt(int irq, voi
2828 return IRQ_HANDLED;
2829 }
2830
2831 -void mark_tsc_unstable(void)
2832 +void mark_tsc_unstable(char *reason)
2833 {
2834 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
2835 tsc_unstable = 1;
2836 @@ -604,17 +669,13 @@ void mark_tsc_unstable(void)
2837 }
2838 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
2839
2840 +static cycle_t cs_last;
2841 +
2842 static cycle_t xen_clocksource_read(void)
2843 {
2844 #ifdef CONFIG_SMP
2845 - static cycle_t last_ret;
2846 -#ifndef CONFIG_64BIT
2847 - cycle_t last = cmpxchg64(&last_ret, 0, 0);
2848 -#else
2849 - cycle_t last = last_ret;
2850 -#define cmpxchg64 cmpxchg
2851 -#endif
2852 - cycle_t ret = sched_clock();
2853 + cycle_t last = get64(&cs_last);
2854 + cycle_t ret = local_clock();
2855
2856 if (unlikely((s64)(ret - last) < 0)) {
2857 if (last - ret > permitted_clock_jitter
2858 @@ -633,17 +694,25 @@ static cycle_t xen_clocksource_read(void
2859 }
2860
2861 for (;;) {
2862 - cycle_t cur = cmpxchg64(&last_ret, last, ret);
2863 + cycle_t cur = cmpxchg64(&cs_last, last, ret);
2864
2865 if (cur == last || (s64)(ret - cur) < 0)
2866 return ret;
2867 last = cur;
2868 }
2869 #else
2870 - return sched_clock();
2871 + return local_clock();
2872 #endif
2873 }
2874
2875 +static void xen_clocksource_resume(void)
2876 +{
2877 + extern void time_resume(void);
2878 +
2879 + time_resume();
2880 + cs_last = local_clock();
2881 +}
2882 +
2883 static struct clocksource clocksource_xen = {
2884 .name = "xen",
2885 .rating = 400,
2886 @@ -652,6 +721,7 @@ static struct clocksource clocksource_xe
2887 .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
2888 .shift = XEN_SHIFT,
2889 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
2890 + .resume = xen_clocksource_resume,
2891 };
2892
2893 static void init_missing_ticks_accounting(unsigned int cpu)
2894 @@ -740,35 +810,6 @@ void notify_arch_cmos_timer(void)
2895 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
2896 }
2897
2898 -static int timer_resume(struct sys_device *dev)
2899 -{
2900 - extern void time_resume(void);
2901 - time_resume();
2902 - return 0;
2903 -}
2904 -
2905 -static struct sysdev_class timer_sysclass = {
2906 - .resume = timer_resume,
2907 - set_kset_name("timer"),
2908 -};
2909 -
2910 -
2911 -/* XXX this driverfs stuff should probably go elsewhere later -john */
2912 -static struct sys_device device_timer = {
2913 - .id = 0,
2914 - .cls = &timer_sysclass,
2915 -};
2916 -
2917 -static int time_init_device(void)
2918 -{
2919 - int error = sysdev_class_register(&timer_sysclass);
2920 - if (!error)
2921 - error = sysdev_register(&device_timer);
2922 - return error;
2923 -}
2924 -
2925 -device_initcall(time_init_device);
2926 -
2927 extern void (*late_time_init)(void);
2928
2929 /* Dynamically-mapped IRQ. */
2930 @@ -899,21 +940,21 @@ static void start_hz_timer(void)
2931 cpu_clear(smp_processor_id(), nohz_cpu_mask);
2932 }
2933
2934 -void raw_safe_halt(void)
2935 +void xen_safe_halt(void)
2936 {
2937 stop_hz_timer();
2938 /* Blocking includes an implicit local_irq_enable(). */
2939 HYPERVISOR_block();
2940 start_hz_timer();
2941 }
2942 -EXPORT_SYMBOL(raw_safe_halt);
2943 +EXPORT_SYMBOL(xen_safe_halt);
2944
2945 -void halt(void)
2946 +void xen_halt(void)
2947 {
2948 if (irqs_disabled())
2949 VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
2950 }
2951 -EXPORT_SYMBOL(halt);
2952 +EXPORT_SYMBOL(xen_halt);
2953
2954 /* No locking required. Interrupts are disabled on all CPUs. */
2955 void time_resume(void)
2956 --- a/arch/x86/kernel/traps_32-xen.c
2957 +++ b/arch/x86/kernel/traps_32-xen.c
2958 @@ -52,7 +52,7 @@
2959 #include <asm/unwind.h>
2960 #include <asm/smp.h>
2961 #include <asm/arch_hooks.h>
2962 -#include <asm/kdebug.h>
2963 +#include <linux/kdebug.h>
2964 #include <asm/stacktrace.h>
2965
2966 #include <linux/module.h>
2967 @@ -101,20 +101,6 @@ asmlinkage void machine_check(void);
2968
2969 int kstack_depth_to_print = 24;
2970 static unsigned int code_bytes = 64;
2971 -ATOMIC_NOTIFIER_HEAD(i386die_chain);
2972 -
2973 -int register_die_notifier(struct notifier_block *nb)
2974 -{
2975 - vmalloc_sync_all();
2976 - return atomic_notifier_chain_register(&i386die_chain, nb);
2977 -}
2978 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2979 -
2980 -int unregister_die_notifier(struct notifier_block *nb)
2981 -{
2982 - return atomic_notifier_chain_unregister(&i386die_chain, nb);
2983 -}
2984 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2985
2986 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
2987 {
2988 @@ -325,7 +311,7 @@ void show_registers(struct pt_regs *regs
2989 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
2990 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
2991 TASK_COMM_LEN, current->comm, current->pid,
2992 - current_thread_info(), current, current->thread_info);
2993 + current_thread_info(), current, task_thread_info(current));
2994 /*
2995 * When in-kernel, we also print out the stack and code at the
2996 * time of the fault..
2997 @@ -482,8 +468,6 @@ static void __kprobes do_trap(int trapnr
2998 siginfo_t *info)
2999 {
3000 struct task_struct *tsk = current;
3001 - tsk->thread.error_code = error_code;
3002 - tsk->thread.trap_no = trapnr;
3003
3004 if (regs->eflags & VM_MASK) {
3005 if (vm86)
3006 @@ -495,6 +479,18 @@ static void __kprobes do_trap(int trapnr
3007 goto kernel_trap;
3008
3009 trap_signal: {
3010 + /*
3011 + * We want error_code and trap_no set for userspace faults and
3012 + * kernelspace faults which result in die(), but not
3013 + * kernelspace faults which are fixed up. die() gives the
3014 + * process no chance to handle the signal and notice the
3015 + * kernel fault information, so that won't result in polluting
3016 + * the information about previously queued, but not yet
3017 + * delivered, faults. See also do_general_protection below.
3018 + */
3019 + tsk->thread.error_code = error_code;
3020 + tsk->thread.trap_no = trapnr;
3021 +
3022 if (info)
3023 force_sig_info(signr, info, tsk);
3024 else
3025 @@ -503,8 +499,11 @@ static void __kprobes do_trap(int trapnr
3026 }
3027
3028 kernel_trap: {
3029 - if (!fixup_exception(regs))
3030 + if (!fixup_exception(regs)) {
3031 + tsk->thread.error_code = error_code;
3032 + tsk->thread.trap_no = trapnr;
3033 die(str, regs, error_code);
3034 + }
3035 return;
3036 }
3037
3038 @@ -578,9 +577,6 @@ DO_ERROR_INFO(32, SIGSEGV, "iret excepti
3039 fastcall void __kprobes do_general_protection(struct pt_regs * regs,
3040 long error_code)
3041 {
3042 - current->thread.error_code = error_code;
3043 - current->thread.trap_no = 13;
3044 -
3045 if (regs->eflags & VM_MASK)
3046 goto gp_in_vm86;
3047
3048 @@ -599,6 +595,8 @@ gp_in_vm86:
3049
3050 gp_in_kernel:
3051 if (!fixup_exception(regs)) {
3052 + current->thread.error_code = error_code;
3053 + current->thread.trap_no = 13;
3054 if (notify_die(DIE_GPF, "general protection fault", regs,
3055 error_code, 13, SIGSEGV) == NOTIFY_STOP)
3056 return;
3057 @@ -987,9 +985,7 @@ fastcall void do_spurious_interrupt_bug(
3058 fastcall unsigned long patch_espfix_desc(unsigned long uesp,
3059 unsigned long kesp)
3060 {
3061 - int cpu = smp_processor_id();
3062 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
3063 - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
3064 + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
3065 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
3066 unsigned long new_kesp = kesp - base;
3067 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
3068 --- a/arch/x86/kernel/traps_64-xen.c
3069 +++ b/arch/x86/kernel/traps_64-xen.c
3070 @@ -32,6 +32,7 @@
3071 #include <linux/unwind.h>
3072 #include <linux/uaccess.h>
3073 #include <linux/bug.h>
3074 +#include <linux/kdebug.h>
3075
3076 #include <asm/system.h>
3077 #include <asm/io.h>
3078 @@ -39,7 +40,6 @@
3079 #include <asm/debugreg.h>
3080 #include <asm/desc.h>
3081 #include <asm/i387.h>
3082 -#include <asm/kdebug.h>
3083 #include <asm/processor.h>
3084 #include <asm/unwind.h>
3085 #include <asm/smp.h>
3086 @@ -71,22 +71,6 @@ asmlinkage void alignment_check(void);
3087 asmlinkage void machine_check(void);
3088 asmlinkage void spurious_interrupt_bug(void);
3089
3090 -ATOMIC_NOTIFIER_HEAD(die_chain);
3091 -EXPORT_SYMBOL(die_chain);
3092 -
3093 -int register_die_notifier(struct notifier_block *nb)
3094 -{
3095 - vmalloc_sync_all();
3096 - return atomic_notifier_chain_register(&die_chain, nb);
3097 -}
3098 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
3099 -
3100 -int unregister_die_notifier(struct notifier_block *nb)
3101 -{
3102 - return atomic_notifier_chain_unregister(&die_chain, nb);
3103 -}
3104 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
3105 -
3106 static inline void conditional_sti(struct pt_regs *regs)
3107 {
3108 if (regs->eflags & X86_EFLAGS_IF)
3109 @@ -428,8 +412,7 @@ void show_registers(struct pt_regs *regs
3110 const int cpu = smp_processor_id();
3111 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
3112
3113 - rsp = regs->rsp;
3114 -
3115 + rsp = regs->rsp;
3116 printk("CPU %d ", cpu);
3117 __show_regs(regs);
3118 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
3119 @@ -440,7 +423,6 @@ void show_registers(struct pt_regs *regs
3120 * time of the fault..
3121 */
3122 if (in_kernel) {
3123 -
3124 printk("Stack: ");
3125 _show_stack(NULL, regs, (unsigned long*)rsp);
3126
3127 @@ -485,13 +467,14 @@ static unsigned int die_nest_count;
3128
3129 unsigned __kprobes long oops_begin(void)
3130 {
3131 - int cpu = smp_processor_id();
3132 + int cpu;
3133 unsigned long flags;
3134
3135 oops_enter();
3136
3137 /* racy, but better than risking deadlock. */
3138 local_irq_save(flags);
3139 + cpu = smp_processor_id();
3140 if (!spin_trylock(&die_lock)) {
3141 if (cpu == die_owner)
3142 /* nested oops. should stop eventually */;
3143 @@ -585,10 +568,20 @@ static void __kprobes do_trap(int trapnr
3144 {
3145 struct task_struct *tsk = current;
3146
3147 - tsk->thread.error_code = error_code;
3148 - tsk->thread.trap_no = trapnr;
3149 -
3150 if (user_mode(regs)) {
3151 + /*
3152 + * We want error_code and trap_no set for userspace
3153 + * faults and kernelspace faults which result in
3154 + * die(), but not kernelspace faults which are fixed
3155 + * up. die() gives the process no chance to handle
3156 + * the signal and notice the kernel fault information,
3157 + * so that won't result in polluting the information
3158 + * about previously queued, but not yet delivered,
3159 + * faults. See also do_general_protection below.
3160 + */
3161 + tsk->thread.error_code = error_code;
3162 + tsk->thread.trap_no = trapnr;
3163 +
3164 if (exception_trace && unhandled_signal(tsk, signr))
3165 printk(KERN_INFO
3166 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
3167 @@ -609,8 +602,11 @@ static void __kprobes do_trap(int trapnr
3168 fixup = search_exception_tables(regs->rip);
3169 if (fixup)
3170 regs->rip = fixup->fixup;
3171 - else
3172 + else {
3173 + tsk->thread.error_code = error_code;
3174 + tsk->thread.trap_no = trapnr;
3175 die(str, regs, error_code);
3176 + }
3177 return;
3178 }
3179 }
3180 @@ -686,10 +682,10 @@ asmlinkage void __kprobes do_general_pro
3181
3182 conditional_sti(regs);
3183
3184 - tsk->thread.error_code = error_code;
3185 - tsk->thread.trap_no = 13;
3186 -
3187 if (user_mode(regs)) {
3188 + tsk->thread.error_code = error_code;
3189 + tsk->thread.trap_no = 13;
3190 +
3191 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
3192 printk(KERN_INFO
3193 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
3194 @@ -708,6 +704,9 @@ asmlinkage void __kprobes do_general_pro
3195 regs->rip = fixup->fixup;
3196 return;
3197 }
3198 +
3199 + tsk->thread.error_code = error_code;
3200 + tsk->thread.trap_no = 13;
3201 if (notify_die(DIE_GPF, "general protection fault", regs,
3202 error_code, 13, SIGSEGV) == NOTIFY_STOP)
3203 return;
3204 --- a/arch/x86/kernel/vsyscall_64-xen.c
3205 +++ b/arch/x86/kernel/vsyscall_64-xen.c
3206 @@ -45,14 +45,34 @@
3207
3208 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
3209 #define __syscall_clobber "r11","rcx","memory"
3210 +#define __pa_vsymbol(x) \
3211 + ({unsigned long v; \
3212 + extern char __vsyscall_0; \
3213 + asm("" : "=r" (v) : "0" (x)); \
3214 + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
3215
3216 +/*
3217 + * vsyscall_gtod_data contains data that is:
3218 + * - readonly from vsyscalls
3219 + * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
3220 + * Try to keep this structure as small as possible to avoid cache line ping pongs
3221 + */
3222 struct vsyscall_gtod_data_t {
3223 - seqlock_t lock;
3224 - int sysctl_enabled;
3225 - struct timeval wall_time_tv;
3226 + seqlock_t lock;
3227 +
3228 + /* open coded 'struct timespec' */
3229 + time_t wall_time_sec;
3230 + u32 wall_time_nsec;
3231 +
3232 + int sysctl_enabled;
3233 struct timezone sys_tz;
3234 - cycle_t offset_base;
3235 - struct clocksource clock;
3236 + struct { /* extract of a clocksource struct */
3237 + cycle_t (*vread)(void);
3238 + cycle_t cycle_last;
3239 + cycle_t mask;
3240 + u32 mult;
3241 + u32 shift;
3242 + } clock;
3243 };
3244 int __vgetcpu_mode __section_vgetcpu_mode;
3245
3246 @@ -68,9 +88,13 @@ void update_vsyscall(struct timespec *wa
3247
3248 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
3249 /* copy vsyscall data */
3250 - vsyscall_gtod_data.clock = *clock;
3251 - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
3252 - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
3253 + vsyscall_gtod_data.clock.vread = clock->vread;
3254 + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
3255 + vsyscall_gtod_data.clock.mask = clock->mask;
3256 + vsyscall_gtod_data.clock.mult = clock->mult;
3257 + vsyscall_gtod_data.clock.shift = clock->shift;
3258 + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
3259 + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
3260 vsyscall_gtod_data.sys_tz = sys_tz;
3261 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
3262 }
3263 @@ -105,7 +129,8 @@ static __always_inline long time_syscall
3264 static __always_inline void do_vgettimeofday(struct timeval * tv)
3265 {
3266 cycle_t now, base, mask, cycle_delta;
3267 - unsigned long seq, mult, shift, nsec_delta;
3268 + unsigned seq;
3269 + unsigned long mult, shift, nsec;
3270 cycle_t (*vread)(void);
3271 do {
3272 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
3273 @@ -121,21 +146,20 @@ static __always_inline void do_vgettimeo
3274 mult = __vsyscall_gtod_data.clock.mult;
3275 shift = __vsyscall_gtod_data.clock.shift;
3276
3277 - *tv = __vsyscall_gtod_data.wall_time_tv;
3278 -
3279 + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
3280 + nsec = __vsyscall_gtod_data.wall_time_nsec;
3281 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
3282
3283 /* calculate interval: */
3284 cycle_delta = (now - base) & mask;
3285 /* convert to nsecs: */
3286 - nsec_delta = (cycle_delta * mult) >> shift;
3287 + nsec += (cycle_delta * mult) >> shift;
3288
3289 - /* convert to usecs and add to timespec: */
3290 - tv->tv_usec += nsec_delta / NSEC_PER_USEC;
3291 - while (tv->tv_usec > USEC_PER_SEC) {
3292 + while (nsec >= NSEC_PER_SEC) {
3293 tv->tv_sec += 1;
3294 - tv->tv_usec -= USEC_PER_SEC;
3295 + nsec -= NSEC_PER_SEC;
3296 }
3297 + tv->tv_usec = nsec / NSEC_PER_USEC;
3298 }
3299
3300 int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
3301 @@ -151,11 +175,16 @@ int __vsyscall(0) vgettimeofday(struct t
3302 * unlikely */
3303 time_t __vsyscall(1) vtime(time_t *t)
3304 {
3305 + struct timeval tv;
3306 + time_t result;
3307 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
3308 return time_syscall(t);
3309 - else if (t)
3310 - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
3311 - return __vsyscall_gtod_data.wall_time_tv.tv_sec;
3312 +
3313 + vgettimeofday(&tv, 0);
3314 + result = tv.tv_sec;
3315 + if (t)
3316 + *t = result;
3317 + return result;
3318 }
3319
3320 /* Fast way to get current CPU and node.
3321 @@ -224,10 +253,10 @@ static int vsyscall_sysctl_change(ctl_ta
3322 return ret;
3323 /* gcc has some trouble with __va(__pa()), so just do it this
3324 way. */
3325 - map1 = ioremap(__pa_symbol(&vsysc1), 2);
3326 + map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
3327 if (!map1)
3328 return -ENOMEM;
3329 - map2 = ioremap(__pa_symbol(&vsysc2), 2);
3330 + map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
3331 if (!map2) {
3332 ret = -ENOMEM;
3333 goto out;
3334 @@ -304,7 +333,7 @@ static int __cpuinit
3335 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
3336 {
3337 long cpu = (long)arg;
3338 - if (action == CPU_ONLINE)
3339 + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
3340 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
3341 return NOTIFY_DONE;
3342 }
3343 --- a/arch/x86/mm/fault_32-xen.c
3344 +++ b/arch/x86/mm/fault_32-xen.c
3345 @@ -14,19 +14,20 @@
3346 #include <linux/mman.h>
3347 #include <linux/mm.h>
3348 #include <linux/smp.h>
3349 -#include <linux/smp_lock.h>
3350 #include <linux/interrupt.h>
3351 #include <linux/init.h>
3352 #include <linux/tty.h>
3353 #include <linux/vt_kern.h> /* For unblank_screen() */
3354 #include <linux/highmem.h>
3355 +#include <linux/bootmem.h> /* for max_low_pfn */
3356 +#include <linux/vmalloc.h>
3357 #include <linux/module.h>
3358 #include <linux/kprobes.h>
3359 #include <linux/uaccess.h>
3360 +#include <linux/kdebug.h>
3361
3362 #include <asm/system.h>
3363 #include <asm/desc.h>
3364 -#include <asm/kdebug.h>
3365 #include <asm/segment.h>
3366
3367 extern void die(const char *,struct pt_regs *,long);
3368 @@ -259,25 +260,20 @@ static void dump_fault_path(unsigned lon
3369 unsigned long page;
3370
3371 page = read_cr3();
3372 - page = ((unsigned long *) __va(page))[address >> 22];
3373 - if (oops_may_print())
3374 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3375 - machine_to_phys(page));
3376 + page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
3377 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3378 + machine_to_phys(page));
3379 /*
3380 * We must not directly access the pte in the highpte
3381 * case if the page table is located in highmem.
3382 * And lets rather not kmap-atomic the pte, just in case
3383 * it's allocated already.
3384 */
3385 -#ifdef CONFIG_HIGHPTE
3386 - if ((page >> PAGE_SHIFT) >= highstart_pfn)
3387 - return;
3388 -#endif
3389 - if ((page & 1) && oops_may_print()) {
3390 - page &= PAGE_MASK;
3391 - address &= 0x003ff000;
3392 - page = machine_to_phys(page);
3393 - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
3394 + if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
3395 + && (page & _PAGE_PRESENT)) {
3396 + page = machine_to_phys(page & PAGE_MASK);
3397 + page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
3398 + & (PTRS_PER_PTE - 1)];
3399 printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
3400 machine_to_phys(page));
3401 }
3402 @@ -581,6 +577,11 @@ bad_area:
3403 bad_area_nosemaphore:
3404 /* User mode accesses just cause a SIGSEGV */
3405 if (error_code & 4) {
3406 + /*
3407 + * It's possible to have interrupts off here.
3408 + */
3409 + local_irq_enable();
3410 +
3411 /*
3412 * Valid to do another page fault here because this one came
3413 * from user space.
3414 @@ -633,7 +634,7 @@ no_context:
3415 bust_spinlocks(1);
3416
3417 if (oops_may_print()) {
3418 - #ifdef CONFIG_X86_PAE
3419 +#ifdef CONFIG_X86_PAE
3420 if (error_code & 16) {
3421 pte_t *pte = lookup_address(address);
3422
3423 @@ -642,7 +643,7 @@ no_context:
3424 "NX-protected page - exploit attempt? "
3425 "(uid: %d)\n", current->uid);
3426 }
3427 - #endif
3428 +#endif
3429 if (address < PAGE_SIZE)
3430 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
3431 "pointer dereference");
3432 @@ -652,8 +653,8 @@ no_context:
3433 printk(" at virtual address %08lx\n",address);
3434 printk(KERN_ALERT " printing eip:\n");
3435 printk("%08lx\n", regs->eip);
3436 + dump_fault_path(address);
3437 }
3438 - dump_fault_path(address);
3439 tsk->thread.cr2 = address;
3440 tsk->thread.trap_no = 14;
3441 tsk->thread.error_code = error_code;
3442 @@ -694,7 +695,6 @@ do_sigbus:
3443 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
3444 }
3445
3446 -#if !HAVE_SHARED_KERNEL_PMD
3447 void vmalloc_sync_all(void)
3448 {
3449 /*
3450 @@ -710,6 +710,9 @@ void vmalloc_sync_all(void)
3451 static unsigned long start = TASK_SIZE;
3452 unsigned long address;
3453
3454 + if (SHARED_KERNEL_PMD)
3455 + return;
3456 +
3457 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
3458 for (address = start;
3459 address >= TASK_SIZE && address < hypervisor_virt_start;
3460 @@ -739,4 +742,3 @@ void vmalloc_sync_all(void)
3461 start = address + (1UL << PMD_SHIFT);
3462 }
3463 }
3464 -#endif
3465 --- a/arch/x86/mm/fault_64-xen.c
3466 +++ b/arch/x86/mm/fault_64-xen.c
3467 @@ -15,22 +15,22 @@
3468 #include <linux/mman.h>
3469 #include <linux/mm.h>
3470 #include <linux/smp.h>
3471 -#include <linux/smp_lock.h>
3472 #include <linux/interrupt.h>
3473 #include <linux/init.h>
3474 #include <linux/tty.h>
3475 #include <linux/vt_kern.h> /* For unblank_screen() */
3476 #include <linux/compiler.h>
3477 +#include <linux/vmalloc.h>
3478 #include <linux/module.h>
3479 #include <linux/kprobes.h>
3480 #include <linux/uaccess.h>
3481 +#include <linux/kdebug.h>
3482
3483 #include <asm/system.h>
3484 #include <asm/pgalloc.h>
3485 #include <asm/smp.h>
3486 #include <asm/tlbflush.h>
3487 #include <asm/proto.h>
3488 -#include <asm/kdebug.h>
3489 #include <asm-generic/sections.h>
3490
3491 /* Page fault error code bits */
3492 @@ -537,6 +537,12 @@ bad_area:
3493 bad_area_nosemaphore:
3494 /* User mode accesses just cause a SIGSEGV */
3495 if (error_code & PF_USER) {
3496 +
3497 + /*
3498 + * It's possible to have interrupts off here.
3499 + */
3500 + local_irq_enable();
3501 +
3502 if (is_prefetch(regs, address, error_code))
3503 return;
3504
3505 @@ -646,7 +652,7 @@ do_sigbus:
3506 }
3507
3508 DEFINE_SPINLOCK(pgd_lock);
3509 -struct page *pgd_list;
3510 +LIST_HEAD(pgd_list);
3511
3512 void vmalloc_sync_all(void)
3513 {
3514 @@ -666,8 +672,7 @@ void vmalloc_sync_all(void)
3515 if (pgd_none(*pgd_ref))
3516 continue;
3517 spin_lock(&pgd_lock);
3518 - for (page = pgd_list; page;
3519 - page = (struct page *)page->index) {
3520 + list_for_each_entry(page, &pgd_list, lru) {
3521 pgd_t *pgd;
3522 pgd = (pgd_t *)page_address(page) + pgd_index(address);
3523 if (pgd_none(*pgd))
3524 --- a/arch/x86/mm/highmem_32-xen.c
3525 +++ b/arch/x86/mm/highmem_32-xen.c
3526 @@ -26,7 +26,7 @@ void kunmap(struct page *page)
3527 * However when holding an atomic kmap is is not legal to sleep, so atomic
3528 * kmaps are appropriate for short, tight code paths only.
3529 */
3530 -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
3531 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
3532 {
3533 enum fixed_addresses idx;
3534 unsigned long vaddr;
3535 @@ -49,15 +49,7 @@ static void *__kmap_atomic(struct page *
3536
3537 void *kmap_atomic(struct page *page, enum km_type type)
3538 {
3539 - return __kmap_atomic(page, type, kmap_prot);
3540 -}
3541 -
3542 -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
3543 -void *kmap_atomic_pte(struct page *page, enum km_type type)
3544 -{
3545 - return __kmap_atomic(page, type,
3546 - test_bit(PG_pinned, &page->flags)
3547 - ? PAGE_KERNEL_RO : kmap_prot);
3548 + return kmap_atomic_prot(page, type, kmap_prot);
3549 }
3550
3551 void kunmap_atomic(void *kvaddr, enum km_type type)
3552 @@ -80,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km
3553 #endif
3554 }
3555
3556 + /*arch_flush_lazy_mmu_mode();*/
3557 pagefault_enable();
3558 }
3559
3560 @@ -162,7 +155,6 @@ void copy_highpage(struct page *to, stru
3561 EXPORT_SYMBOL(kmap);
3562 EXPORT_SYMBOL(kunmap);
3563 EXPORT_SYMBOL(kmap_atomic);
3564 -EXPORT_SYMBOL(kmap_atomic_pte);
3565 EXPORT_SYMBOL(kunmap_atomic);
3566 EXPORT_SYMBOL(kmap_atomic_to_page);
3567 EXPORT_SYMBOL(clear_highpage);
3568 --- a/arch/x86/mm/init_32-xen.c
3569 +++ b/arch/x86/mm/init_32-xen.c
3570 @@ -22,6 +22,7 @@
3571 #include <linux/init.h>
3572 #include <linux/highmem.h>
3573 #include <linux/pagemap.h>
3574 +#include <linux/pfn.h>
3575 #include <linux/poison.h>
3576 #include <linux/bootmem.h>
3577 #include <linux/slab.h>
3578 @@ -65,17 +66,19 @@ static pmd_t * __init one_md_table_init(
3579 pmd_t *pmd_table;
3580
3581 #ifdef CONFIG_X86_PAE
3582 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3583 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3584 - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3585 - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3586 - pud = pud_offset(pgd, 0);
3587 - if (pmd_table != pmd_offset(pud, 0))
3588 - BUG();
3589 -#else
3590 + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
3591 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3592 +
3593 + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3594 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3595 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3596 + pud = pud_offset(pgd, 0);
3597 + if (pmd_table != pmd_offset(pud, 0))
3598 + BUG();
3599 + }
3600 +#endif
3601 pud = pud_offset(pgd, 0);
3602 pmd_table = pmd_offset(pud, 0);
3603 -#endif
3604
3605 return pmd_table;
3606 }
3607 @@ -86,16 +89,18 @@ static pmd_t * __init one_md_table_init(
3608 */
3609 static pte_t * __init one_page_table_init(pmd_t *pmd)
3610 {
3611 +#if CONFIG_XEN_COMPAT <= 0x030002
3612 if (pmd_none(*pmd)) {
3613 +#else
3614 + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
3615 +#endif
3616 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3617 +
3618 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
3619 make_lowmem_page_readonly(page_table,
3620 XENFEAT_writable_page_tables);
3621 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
3622 - if (page_table != pte_offset_kernel(pmd, 0))
3623 - BUG();
3624 -
3625 - return page_table;
3626 + BUG_ON(page_table != pte_offset_kernel(pmd, 0));
3627 }
3628
3629 return pte_offset_kernel(pmd, 0);
3630 @@ -115,7 +120,6 @@ static pte_t * __init one_page_table_ini
3631 static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
3632 {
3633 pgd_t *pgd;
3634 - pud_t *pud;
3635 pmd_t *pmd;
3636 int pgd_idx, pmd_idx;
3637 unsigned long vaddr;
3638 @@ -126,12 +130,10 @@ static void __init page_table_range_init
3639 pgd = pgd_base + pgd_idx;
3640
3641 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
3642 - if (pgd_none(*pgd))
3643 - one_md_table_init(pgd);
3644 - pud = pud_offset(pgd, vaddr);
3645 - pmd = pmd_offset(pud, vaddr);
3646 + pmd = one_md_table_init(pgd);
3647 + pmd = pmd + pmd_index(vaddr);
3648 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
3649 - if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
3650 + if (vaddr < hypervisor_virt_start)
3651 one_page_table_init(pmd);
3652
3653 vaddr += PMD_SIZE;
3654 @@ -194,24 +196,25 @@ static void __init kernel_physical_mappi
3655 /* Map with big pages if possible, otherwise create normal page tables. */
3656 if (cpu_has_pse) {
3657 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
3658 -
3659 if (is_kernel_text(address) || is_kernel_text(address2))
3660 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
3661 else
3662 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
3663 +
3664 pfn += PTRS_PER_PTE;
3665 } else {
3666 pte = one_page_table_init(pmd);
3667
3668 - pte += pte_ofs;
3669 - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
3670 - /* XEN: Only map initial RAM allocation. */
3671 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
3672 - continue;
3673 - if (is_kernel_text(address))
3674 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3675 - else
3676 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3677 + for (pte += pte_ofs;
3678 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
3679 + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
3680 + /* XEN: Only map initial RAM allocation. */
3681 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
3682 + continue;
3683 + if (is_kernel_text(address))
3684 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3685 + else
3686 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3687 }
3688 pte_ofs = 0;
3689 }
3690 @@ -381,15 +384,44 @@ extern void __init remap_numa_kva(void);
3691
3692 pgd_t *swapper_pg_dir;
3693
3694 +static void __init xen_pagetable_setup_start(pgd_t *base)
3695 +{
3696 +}
3697 +
3698 +static void __init xen_pagetable_setup_done(pgd_t *base)
3699 +{
3700 +}
3701 +
3702 +/*
3703 + * Build a proper pagetable for the kernel mappings. Up until this
3704 + * point, we've been running on some set of pagetables constructed by
3705 + * the boot process.
3706 + *
3707 + * If we're booting on native hardware, this will be a pagetable
3708 + * constructed in arch/i386/kernel/head.S, and not running in PAE mode
3709 + * (even if we'll end up running in PAE). The root of the pagetable
3710 + * will be swapper_pg_dir.
3711 + *
3712 + * If we're booting paravirtualized under a hypervisor, then there are
3713 + * more options: we may already be running PAE, and the pagetable may
3714 + * or may not be based in swapper_pg_dir. In any case,
3715 + * paravirt_pagetable_setup_start() will set up swapper_pg_dir
3716 + * appropriately for the rest of the initialization to work.
3717 + *
3718 + * In general, pagetable_init() assumes that the pagetable may already
3719 + * be partially populated, and so it avoids stomping on any existing
3720 + * mappings.
3721 + */
3722 static void __init pagetable_init (void)
3723 {
3724 - unsigned long vaddr;
3725 + unsigned long vaddr, end;
3726 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
3727
3728 + xen_pagetable_setup_start(pgd_base);
3729 +
3730 /* Enable PSE if available */
3731 - if (cpu_has_pse) {
3732 + if (cpu_has_pse)
3733 set_in_cr4(X86_CR4_PSE);
3734 - }
3735
3736 /* Enable PGE if available */
3737 if (cpu_has_pge) {
3738 @@ -406,9 +438,12 @@ static void __init pagetable_init (void)
3739 * created - mappings will be set by set_fixmap():
3740 */
3741 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
3742 - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
3743 + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
3744 + page_table_range_init(vaddr, end, pgd_base);
3745
3746 permanent_kmaps_init(pgd_base);
3747 +
3748 + xen_pagetable_setup_done(pgd_base);
3749 }
3750
3751 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
3752 @@ -750,34 +785,29 @@ int remove_memory(u64 start, u64 size)
3753 EXPORT_SYMBOL_GPL(remove_memory);
3754 #endif
3755
3756 -struct kmem_cache *pgd_cache;
3757 struct kmem_cache *pmd_cache;
3758
3759 void __init pgtable_cache_init(void)
3760 {
3761 + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
3762 +
3763 if (PTRS_PER_PMD > 1) {
3764 pmd_cache = kmem_cache_create("pmd",
3765 PTRS_PER_PMD*sizeof(pmd_t),
3766 PTRS_PER_PMD*sizeof(pmd_t),
3767 - 0,
3768 + SLAB_PANIC,
3769 pmd_ctor,
3770 NULL);
3771 - if (!pmd_cache)
3772 - panic("pgtable_cache_init(): cannot create pmd cache");
3773 + if (!SHARED_KERNEL_PMD) {
3774 + /* If we're in PAE mode and have a non-shared
3775 + kernel pmd, then the pgd size must be a
3776 + page size. This is because the pgd_list
3777 + links through the page structure, so there
3778 + can only be one pgd per page for this to
3779 + work. */
3780 + pgd_size = PAGE_SIZE;
3781 + }
3782 }
3783 - pgd_cache = kmem_cache_create("pgd",
3784 -#ifndef CONFIG_XEN
3785 - PTRS_PER_PGD*sizeof(pgd_t),
3786 - PTRS_PER_PGD*sizeof(pgd_t),
3787 -#else
3788 - PAGE_SIZE,
3789 - PAGE_SIZE,
3790 -#endif
3791 - 0,
3792 - pgd_ctor,
3793 - PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
3794 - if (!pgd_cache)
3795 - panic("pgtable_cache_init(): Cannot create pgd cache");
3796 }
3797
3798 /*
3799 @@ -811,13 +841,26 @@ static int noinline do_test_wp_bit(void)
3800
3801 void mark_rodata_ro(void)
3802 {
3803 - unsigned long addr = (unsigned long)__start_rodata;
3804 -
3805 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3806 - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
3807 + unsigned long start = PFN_ALIGN(_text);
3808 + unsigned long size = PFN_ALIGN(_etext) - start;
3809
3810 - printk("Write protecting the kernel read-only data: %uk\n",
3811 - (__end_rodata - __start_rodata) >> 10);
3812 +#ifndef CONFIG_KPROBES
3813 +#ifdef CONFIG_HOTPLUG_CPU
3814 + /* It must still be possible to apply SMP alternatives. */
3815 + if (num_possible_cpus() <= 1)
3816 +#endif
3817 + {
3818 + change_page_attr(virt_to_page(start),
3819 + size >> PAGE_SHIFT, PAGE_KERNEL_RX);
3820 + printk("Write protecting the kernel text: %luk\n", size >> 10);
3821 + }
3822 +#endif
3823 + start += size;
3824 + size = (unsigned long)__end_rodata - start;
3825 + change_page_attr(virt_to_page(start),
3826 + size >> PAGE_SHIFT, PAGE_KERNEL_RO);
3827 + printk("Write protecting the kernel read-only data: %luk\n",
3828 + size >> 10);
3829
3830 /*
3831 * change_page_attr() requires a global_flush_tlb() call after it.
3832 @@ -840,7 +883,7 @@ void free_init_pages(char *what, unsigne
3833 free_page(addr);
3834 totalram_pages++;
3835 }
3836 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3837 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3838 }
3839
3840 void free_initmem(void)
3841 --- a/arch/x86/mm/init_64-xen.c
3842 +++ b/arch/x86/mm/init_64-xen.c
3843 @@ -25,10 +25,12 @@
3844 #include <linux/bootmem.h>
3845 #include <linux/proc_fs.h>
3846 #include <linux/pci.h>
3847 +#include <linux/pfn.h>
3848 #include <linux/poison.h>
3849 #include <linux/dma-mapping.h>
3850 #include <linux/module.h>
3851 #include <linux/memory_hotplug.h>
3852 +#include <linux/nmi.h>
3853
3854 #include <asm/processor.h>
3855 #include <asm/system.h>
3856 @@ -51,7 +53,7 @@
3857 #define Dprintk(x...)
3858 #endif
3859
3860 -struct dma_mapping_ops* dma_ops;
3861 +const struct dma_mapping_ops* dma_ops;
3862 EXPORT_SYMBOL(dma_ops);
3863
3864 #if CONFIG_XEN_COMPAT <= 0x030002
3865 @@ -191,6 +193,13 @@ void show_mem(void)
3866
3867 for_each_online_pgdat(pgdat) {
3868 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
3869 + /* this loop can take a while with 256 GB and 4k pages
3870 + so update the NMI watchdog */
3871 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
3872 + touch_nmi_watchdog();
3873 + }
3874 + if (!pfn_valid(pgdat->node_start_pfn + i))
3875 + continue;
3876 page = pfn_to_page(pgdat->node_start_pfn + i);
3877 total++;
3878 if (PageReserved(page))
3879 @@ -359,7 +368,7 @@ __set_fixmap (enum fixed_addresses idx,
3880 }
3881 }
3882
3883 -unsigned long __initdata table_start, table_end;
3884 +unsigned long __meminitdata table_start, table_end;
3885
3886 static __meminit void *alloc_static_page(unsigned long *phys)
3887 {
3888 @@ -376,7 +385,7 @@ static __meminit void *alloc_static_page
3889 start_pfn++;
3890 memset((void *)va, 0, PAGE_SIZE);
3891 return (void *)va;
3892 -}
3893 +}
3894
3895 #define PTE_SIZE PAGE_SIZE
3896
3897 @@ -412,28 +421,46 @@ static inline int make_readonly(unsigned
3898
3899 #ifndef CONFIG_XEN
3900 /* Must run before zap_low_mappings */
3901 -__init void *early_ioremap(unsigned long addr, unsigned long size)
3902 +__meminit void *early_ioremap(unsigned long addr, unsigned long size)
3903 {
3904 - unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
3905 -
3906 - /* actually usually some more */
3907 - if (size >= LARGE_PAGE_SIZE) {
3908 - return NULL;
3909 + unsigned long vaddr;
3910 + pmd_t *pmd, *last_pmd;
3911 + int i, pmds;
3912 +
3913 + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3914 + vaddr = __START_KERNEL_map;
3915 + pmd = level2_kernel_pgt;
3916 + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
3917 + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
3918 + for (i = 0; i < pmds; i++) {
3919 + if (pmd_present(pmd[i]))
3920 + goto next;
3921 + }
3922 + vaddr += addr & ~PMD_MASK;
3923 + addr &= PMD_MASK;
3924 + for (i = 0; i < pmds; i++, addr += PMD_SIZE)
3925 + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
3926 + __flush_tlb();
3927 + return (void *)vaddr;
3928 + next:
3929 + ;
3930 }
3931 - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3932 - map += LARGE_PAGE_SIZE;
3933 - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3934 - __flush_tlb();
3935 - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
3936 + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
3937 + return NULL;
3938 }
3939
3940 /* To avoid virtual aliases later */
3941 -__init void early_iounmap(void *addr, unsigned long size)
3942 +__meminit void early_iounmap(void *addr, unsigned long size)
3943 {
3944 - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
3945 - printk("early_iounmap: bad address %p\n", addr);
3946 - set_pmd(temp_mappings[0].pmd, __pmd(0));
3947 - set_pmd(temp_mappings[1].pmd, __pmd(0));
3948 + unsigned long vaddr;
3949 + pmd_t *pmd;
3950 + int i, pmds;
3951 +
3952 + vaddr = (unsigned long)addr;
3953 + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3954 + pmd = level2_kernel_pgt + pmd_index(vaddr);
3955 + for (i = 0; i < pmds; i++)
3956 + pmd_clear(pmd + i);
3957 __flush_tlb();
3958 }
3959 #endif
3960 @@ -767,14 +794,6 @@ void __meminit init_memory_mapping(unsig
3961 __flush_tlb_all();
3962 }
3963
3964 -void __cpuinit zap_low_mappings(int cpu)
3965 -{
3966 - /* this is not required for Xen */
3967 -#if 0
3968 - swap_low_mappings();
3969 -#endif
3970 -}
3971 -
3972 #ifndef CONFIG_NUMA
3973 void __init paging_init(void)
3974 {
3975 @@ -960,17 +979,6 @@ void __init mem_init(void)
3976 reservedpages << (PAGE_SHIFT-10),
3977 datasize >> 10,
3978 initsize >> 10);
3979 -
3980 -#ifndef CONFIG_XEN
3981 -#ifdef CONFIG_SMP
3982 - /*
3983 - * Sync boot_level4_pgt mappings with the init_level4_pgt
3984 - * except for the low identity mappings which are already zapped
3985 - * in init_level4_pgt. This sync-up is essential for AP's bringup
3986 - */
3987 - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
3988 -#endif
3989 -#endif
3990 }
3991
3992 void free_init_pages(char *what, unsigned long begin, unsigned long end)
3993 @@ -980,7 +988,7 @@ void free_init_pages(char *what, unsigne
3994 if (begin >= end)
3995 return;
3996
3997 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3998 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3999 for (addr = begin; addr < end; addr += PAGE_SIZE) {
4000 ClearPageReserved(virt_to_page(addr));
4001 init_page_count(virt_to_page(addr));
4002 @@ -989,24 +997,17 @@ void free_init_pages(char *what, unsigne
4003 if (addr >= __START_KERNEL_map) {
4004 /* make_readonly() reports all kernel addresses. */
4005 __make_page_writable(__va(__pa(addr)));
4006 - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
4007 - pgd_t *pgd = pgd_offset_k(addr);
4008 - pud_t *pud = pud_offset(pgd, addr);
4009 - pmd_t *pmd = pmd_offset(pud, addr);
4010 - pte_t *pte = pte_offset_kernel(pmd, addr);
4011 -
4012 - xen_l1_entry_update(pte, __pte(0)); /* fallback */
4013 - }
4014 + change_page_attr_addr(addr, 1, __pgprot(0));
4015 }
4016 free_page(addr);
4017 totalram_pages++;
4018 }
4019 + if (addr > __START_KERNEL_map)
4020 + global_flush_tlb();
4021 }
4022
4023 void free_initmem(void)
4024 {
4025 - memset(__initdata_begin, POISON_FREE_INITDATA,
4026 - __initdata_end - __initdata_begin);
4027 free_init_pages("unused kernel memory",
4028 (unsigned long)(&__init_begin),
4029 (unsigned long)(&__init_end));
4030 @@ -1016,13 +1017,28 @@ void free_initmem(void)
4031
4032 void mark_rodata_ro(void)
4033 {
4034 - unsigned long addr = (unsigned long)__start_rodata;
4035 + unsigned long start = (unsigned long)_stext, end;
4036 +
4037 +#ifdef CONFIG_HOTPLUG_CPU
4038 + /* It must still be possible to apply SMP alternatives. */
4039 + if (num_possible_cpus() > 1)
4040 + start = (unsigned long)_etext;
4041 +#endif
4042 +
4043 +#ifdef CONFIG_KPROBES
4044 + start = (unsigned long)__start_rodata;
4045 +#endif
4046 +
4047 + end = (unsigned long)__end_rodata;
4048 + start = (start + PAGE_SIZE - 1) & PAGE_MASK;
4049 + end &= PAGE_MASK;
4050 + if (end <= start)
4051 + return;
4052
4053 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
4054 - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
4055 + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
4056
4057 - printk ("Write protecting the kernel read-only data: %luk\n",
4058 - (__end_rodata - __start_rodata) >> 10);
4059 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
4060 + (end - start) >> 10);
4061
4062 /*
4063 * change_page_attr_addr() requires a global_flush_tlb() call after it.
4064 @@ -1175,3 +1191,11 @@ int in_gate_area_no_task(unsigned long a
4065 {
4066 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
4067 }
4068 +
4069 +#ifndef CONFIG_XEN
4070 +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
4071 +{
4072 + return __alloc_bootmem_core(pgdat->bdata, size,
4073 + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
4074 +}
4075 +#endif
4076 --- a/arch/x86/mm/ioremap_32-xen.c
4077 +++ b/arch/x86/mm/ioremap_32-xen.c
4078 @@ -13,6 +13,7 @@
4079 #include <linux/slab.h>
4080 #include <linux/module.h>
4081 #include <linux/io.h>
4082 +#include <linux/sched.h>
4083 #include <asm/fixmap.h>
4084 #include <asm/cacheflush.h>
4085 #include <asm/tlbflush.h>
4086 --- a/arch/x86/mm/pageattr_64-xen.c
4087 +++ b/arch/x86/mm/pageattr_64-xen.c
4088 @@ -215,13 +215,13 @@ void mm_pin_all(void)
4089 preempt_enable();
4090 }
4091
4092 -void _arch_dup_mmap(struct mm_struct *mm)
4093 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
4094 {
4095 if (!mm->context.pinned)
4096 mm_pin(mm);
4097 }
4098
4099 -void _arch_exit_mmap(struct mm_struct *mm)
4100 +void arch_exit_mmap(struct mm_struct *mm)
4101 {
4102 struct task_struct *tsk = current;
4103
4104 @@ -343,10 +343,11 @@ static void flush_kernel_map(void *arg)
4105 struct page *pg;
4106
4107 /* When clflush is available always use it because it is
4108 - much cheaper than WBINVD */
4109 - if (!cpu_has_clflush)
4110 + much cheaper than WBINVD. Disable clflush for now because
4111 + the high level code is not ready yet */
4112 + if (1 || !cpu_has_clflush)
4113 asm volatile("wbinvd" ::: "memory");
4114 - list_for_each_entry(pg, l, lru) {
4115 + else list_for_each_entry(pg, l, lru) {
4116 void *adr = page_address(pg);
4117 if (cpu_has_clflush)
4118 cache_flush_page(adr);
4119 @@ -460,16 +461,24 @@ __change_page_attr(unsigned long address
4120 */
4121 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
4122 {
4123 - int err = 0;
4124 + int err = 0, kernel_map = 0;
4125 int i;
4126
4127 + if (address >= __START_KERNEL_map
4128 + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
4129 + address = (unsigned long)__va(__pa(address));
4130 + kernel_map = 1;
4131 + }
4132 +
4133 down_write(&init_mm.mmap_sem);
4134 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
4135 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
4136
4137 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
4138 - if (err)
4139 - break;
4140 + if (!kernel_map || pte_present(pfn_pte(0, prot))) {
4141 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
4142 + if (err)
4143 + break;
4144 + }
4145 /* Handle kernel mapping too which aliases part of the
4146 * lowmem */
4147 if (__pa(address) < KERNEL_TEXT_SIZE) {
4148 --- a/arch/x86/mm/pgtable_32-xen.c
4149 +++ b/arch/x86/mm/pgtable_32-xen.c
4150 @@ -13,6 +13,7 @@
4151 #include <linux/pagemap.h>
4152 #include <linux/spinlock.h>
4153 #include <linux/module.h>
4154 +#include <linux/quicklist.h>
4155
4156 #include <asm/system.h>
4157 #include <asm/pgtable.h>
4158 @@ -218,8 +219,6 @@ void pmd_ctor(void *pmd, struct kmem_cac
4159 * against pageattr.c; it is the unique case in which a valid change
4160 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
4161 * vmalloc faults work because attached pagetables are never freed.
4162 - * The locking scheme was chosen on the basis of manfred's
4163 - * recommendations and having no core impact whatsoever.
4164 * -- wli
4165 */
4166 DEFINE_SPINLOCK(pgd_lock);
4167 @@ -245,37 +244,54 @@ static inline void pgd_list_del(pgd_t *p
4168 set_page_private(next, (unsigned long)pprev);
4169 }
4170
4171 -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4172 +
4173 +
4174 +#if (PTRS_PER_PMD == 1)
4175 +/* Non-PAE pgd constructor */
4176 +void pgd_ctor(void *pgd)
4177 {
4178 unsigned long flags;
4179
4180 - if (PTRS_PER_PMD > 1) {
4181 - if (HAVE_SHARED_KERNEL_PMD)
4182 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4183 - swapper_pg_dir + USER_PTRS_PER_PGD,
4184 - KERNEL_PGD_PTRS);
4185 - } else {
4186 - spin_lock_irqsave(&pgd_lock, flags);
4187 + /* !PAE, no pagetable sharing */
4188 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4189 +
4190 + spin_lock_irqsave(&pgd_lock, flags);
4191 +
4192 + /* must happen under lock */
4193 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4194 + swapper_pg_dir + USER_PTRS_PER_PGD,
4195 + KERNEL_PGD_PTRS);
4196 +
4197 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4198 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
4199 + USER_PTRS_PER_PGD,
4200 + KERNEL_PGD_PTRS);
4201 + pgd_list_add(pgd);
4202 + spin_unlock_irqrestore(&pgd_lock, flags);
4203 +}
4204 +#else /* PTRS_PER_PMD > 1 */
4205 +/* PAE pgd constructor */
4206 +void pgd_ctor(void *pgd)
4207 +{
4208 + /* PAE, kernel PMD may be shared */
4209 +
4210 + if (SHARED_KERNEL_PMD) {
4211 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4212 swapper_pg_dir + USER_PTRS_PER_PGD,
4213 KERNEL_PGD_PTRS);
4214 + } else {
4215 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4216 -
4217 - /* must happen under lock */
4218 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4219 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
4220 - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
4221 -
4222 - pgd_list_add(pgd);
4223 - spin_unlock_irqrestore(&pgd_lock, flags);
4224 }
4225 }
4226 +#endif /* PTRS_PER_PMD */
4227
4228 -/* never called when PTRS_PER_PMD > 1 */
4229 -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4230 +void pgd_dtor(void *pgd)
4231 {
4232 unsigned long flags; /* can be called from interrupt context */
4233
4234 + if (SHARED_KERNEL_PMD)
4235 + return;
4236 +
4237 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
4238 spin_lock_irqsave(&pgd_lock, flags);
4239 pgd_list_del(pgd);
4240 @@ -284,11 +300,46 @@ void pgd_dtor(void *pgd, struct kmem_cac
4241 pgd_test_and_unpin(pgd);
4242 }
4243
4244 +#define UNSHARED_PTRS_PER_PGD \
4245 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
4246 +
4247 +/* If we allocate a pmd for part of the kernel address space, then
4248 + make sure its initialized with the appropriate kernel mappings.
4249 + Otherwise use a cached zeroed pmd. */
4250 +static pmd_t *pmd_cache_alloc(int idx)
4251 +{
4252 + pmd_t *pmd;
4253 +
4254 + if (idx >= USER_PTRS_PER_PGD) {
4255 + pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
4256 +
4257 +#ifndef CONFIG_XEN
4258 + if (pmd)
4259 + memcpy(pmd,
4260 + (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
4261 + sizeof(pmd_t) * PTRS_PER_PMD);
4262 +#endif
4263 + } else
4264 + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4265 +
4266 + return pmd;
4267 +}
4268 +
4269 +static void pmd_cache_free(pmd_t *pmd, int idx)
4270 +{
4271 + if (idx >= USER_PTRS_PER_PGD) {
4272 + make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
4273 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4274 + free_page((unsigned long)pmd);
4275 + } else
4276 + kmem_cache_free(pmd_cache, pmd);
4277 +}
4278 +
4279 pgd_t *pgd_alloc(struct mm_struct *mm)
4280 {
4281 int i;
4282 - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
4283 - pmd_t **pmd;
4284 + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
4285 + pmd_t **pmds = NULL;
4286 unsigned long flags;
4287
4288 pgd_test_and_unpin(pgd);
4289 @@ -296,37 +347,40 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
4290 if (PTRS_PER_PMD == 1 || !pgd)
4291 return pgd;
4292
4293 - if (HAVE_SHARED_KERNEL_PMD) {
4294 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4295 - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4296 - if (!pmd)
4297 - goto out_oom;
4298 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4299 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4300 +#ifdef CONFIG_XEN
4301 + if (!SHARED_KERNEL_PMD) {
4302 + /*
4303 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
4304 + * allocation). We therefore store virtual addresses of pmds as they
4305 + * do not change across save/restore, and poke the machine addresses
4306 + * into the pgdir under the pgd_lock.
4307 + */
4308 + pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4309 + if (!pmds) {
4310 + quicklist_free(0, pgd_dtor, pgd);
4311 + return NULL;
4312 }
4313 - return pgd;
4314 - }
4315 -
4316 - /*
4317 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
4318 - * allocation). We therefore store virtual addresses of pmds as they
4319 - * do not change across save/restore, and poke the machine addresses
4320 - * into the pgdir under the pgd_lock.
4321 - */
4322 - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4323 - if (!pmd) {
4324 - kmem_cache_free(pgd_cache, pgd);
4325 - return NULL;
4326 }
4327 +#endif
4328
4329 /* Allocate pmds, remember virtual addresses. */
4330 - for (i = 0; i < PTRS_PER_PGD; ++i) {
4331 - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4332 - if (!pmd[i])
4333 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4334 + pmd_t *pmd = pmd_cache_alloc(i);
4335 +
4336 + if (!pmd)
4337 goto out_oom;
4338 +
4339 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4340 + if (pmds)
4341 + pmds[i] = pmd;
4342 + else
4343 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4344 }
4345
4346 +#ifdef CONFIG_XEN
4347 + if (SHARED_KERNEL_PMD)
4348 + return pgd;
4349 +
4350 spin_lock_irqsave(&pgd_lock, flags);
4351
4352 /* Protect against save/restore: move below 4GB under pgd_lock. */
4353 @@ -341,44 +395,43 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
4354
4355 /* Copy kernel pmd contents and write-protect the new pmds. */
4356 for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4357 - unsigned long v = (unsigned long)i << PGDIR_SHIFT;
4358 - pgd_t *kpgd = pgd_offset_k(v);
4359 - pud_t *kpud = pud_offset(kpgd, v);
4360 - pmd_t *kpmd = pmd_offset(kpud, v);
4361 - memcpy(pmd[i], kpmd, PAGE_SIZE);
4362 + memcpy(pmds[i],
4363 + (void *)pgd_page_vaddr(swapper_pg_dir[i]),
4364 + sizeof(pmd_t) * PTRS_PER_PMD);
4365 make_lowmem_page_readonly(
4366 - pmd[i], XENFEAT_writable_page_tables);
4367 + pmds[i], XENFEAT_writable_page_tables);
4368 }
4369
4370 /* It is safe to poke machine addresses of pmds under the pmd_lock. */
4371 for (i = 0; i < PTRS_PER_PGD; i++)
4372 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
4373 + set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
4374
4375 /* Ensure this pgd gets picked up and pinned on save/restore. */
4376 pgd_list_add(pgd);
4377
4378 spin_unlock_irqrestore(&pgd_lock, flags);
4379
4380 - kfree(pmd);
4381 + kfree(pmds);
4382 +#endif
4383
4384 return pgd;
4385
4386 out_oom:
4387 - if (HAVE_SHARED_KERNEL_PMD) {
4388 + if (!pmds) {
4389 for (i--; i >= 0; i--) {
4390 pgd_t pgdent = pgd[i];
4391 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4392 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4393 - kmem_cache_free(pmd_cache, pmd);
4394 + pmd_cache_free(pmd, i);
4395 }
4396 } else {
4397 for (i--; i >= 0; i--) {
4398 - paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT);
4399 - kmem_cache_free(pmd_cache, pmd[i]);
4400 + paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
4401 + pmd_cache_free(pmds[i], i);
4402 }
4403 - kfree(pmd);
4404 + kfree(pmds);
4405 }
4406 - kmem_cache_free(pgd_cache, pgd);
4407 + quicklist_free(0, pgd_dtor, pgd);
4408 return NULL;
4409 }
4410
4411 @@ -398,35 +451,24 @@ void pgd_free(pgd_t *pgd)
4412
4413 /* in the PAE case user pgd entries are overwritten before usage */
4414 if (PTRS_PER_PMD > 1) {
4415 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4416 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4417 pgd_t pgdent = pgd[i];
4418 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4419 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4420 - kmem_cache_free(pmd_cache, pmd);
4421 + pmd_cache_free(pmd, i);
4422 }
4423
4424 - if (!HAVE_SHARED_KERNEL_PMD) {
4425 - unsigned long flags;
4426 - spin_lock_irqsave(&pgd_lock, flags);
4427 - pgd_list_del(pgd);
4428 - spin_unlock_irqrestore(&pgd_lock, flags);
4429 -
4430 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4431 - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
4432 - make_lowmem_page_writable(
4433 - pmd, XENFEAT_writable_page_tables);
4434 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4435 - kmem_cache_free(pmd_cache, pmd);
4436 - }
4437 -
4438 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4439 - xen_destroy_contiguous_region(
4440 - (unsigned long)pgd, 0);
4441 - }
4442 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4443 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
4444 }
4445
4446 /* in the non-PAE case, free_pgtables() clears user pgd entries */
4447 - kmem_cache_free(pgd_cache, pgd);
4448 + quicklist_free(0, pgd_dtor, pgd);
4449 +}
4450 +
4451 +void check_pgt_cache(void)
4452 +{
4453 + quicklist_trim(0, pgd_dtor, 25, 16);
4454 }
4455
4456 void make_lowmem_page_readonly(void *va, unsigned int feature)
4457 @@ -723,13 +765,13 @@ void mm_pin_all(void)
4458 spin_unlock_irqrestore(&pgd_lock, flags);
4459 }
4460
4461 -void _arch_dup_mmap(struct mm_struct *mm)
4462 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
4463 {
4464 if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
4465 mm_pin(mm);
4466 }
4467
4468 -void _arch_exit_mmap(struct mm_struct *mm)
4469 +void arch_exit_mmap(struct mm_struct *mm)
4470 {
4471 struct task_struct *tsk = current;
4472
4473 --- a/drivers/char/tpm/tpm_xen.c
4474 +++ b/drivers/char/tpm/tpm_xen.c
4475 @@ -463,7 +463,7 @@ static int tpmif_connect(struct xenbus_d
4476 tp->backend_id = domid;
4477
4478 err = bind_listening_port_to_irqhandler(
4479 - domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp);
4480 + domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
4481 if (err <= 0) {
4482 WPRINTK("bind_listening_port_to_irqhandler failed "
4483 "(err=%d)\n", err);
4484 --- a/drivers/pci/msi-xen.c
4485 +++ b/drivers/pci/msi-xen.c
4486 @@ -12,16 +12,15 @@
4487 #include <linux/interrupt.h>
4488 #include <linux/init.h>
4489 #include <linux/ioport.h>
4490 -#include <linux/smp_lock.h>
4491 #include <linux/pci.h>
4492 #include <linux/proc_fs.h>
4493 #include <linux/msi.h>
4494 +#include <linux/smp.h>
4495
4496 #include <xen/evtchn.h>
4497
4498 #include <asm/errno.h>
4499 #include <asm/io.h>
4500 -#include <asm/smp.h>
4501
4502 #include "pci.h"
4503 #include "msi.h"
4504 @@ -154,6 +153,7 @@ int register_msi_get_owner(int (*func)(s
4505 get_owner = func;
4506 return 0;
4507 }
4508 +EXPORT_SYMBOL(register_msi_get_owner);
4509
4510 int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
4511 {
4512 @@ -162,6 +162,7 @@ int unregister_msi_get_owner(int (*func)
4513 get_owner = NULL;
4514 return 0;
4515 }
4516 +EXPORT_SYMBOL(unregister_msi_get_owner);
4517
4518 static int msi_get_dev_owner(struct pci_dev *dev)
4519 {
4520 @@ -263,11 +264,6 @@ static int msi_map_vector(struct pci_dev
4521 return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base);
4522 }
4523
4524 -static int msi_init(void)
4525 -{
4526 - return 0;
4527 -}
4528 -
4529 #ifdef CONFIG_PM
4530 static void __pci_restore_msi_state(struct pci_dev *dev)
4531 {
4532 @@ -434,21 +430,32 @@ static int msix_capability_init(struct p
4533 }
4534
4535 /**
4536 - * pci_msi_supported - check whether MSI may be enabled on device
4537 + * pci_msi_check_device - check whether MSI may be enabled on a device
4538 * @dev: pointer to the pci_dev data structure of MSI device function
4539 + * @nvec: how many MSIs have been requested ?
4540 + * @type: are we checking for MSI or MSI-X ?
4541 *
4542 * Look at global flags, the device itself, and its parent busses
4543 - * to return 0 if MSI are supported for the device.
4544 + * to determine if MSI/-X are supported for the device. If MSI/-X is
4545 + * supported return 0, else return an error code.
4546 **/
4547 -static
4548 -int pci_msi_supported(struct pci_dev * dev)
4549 +static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
4550 {
4551 struct pci_bus *bus;
4552 + int ret;
4553
4554 /* MSI must be globally enabled and supported by the device */
4555 if (!pci_msi_enable || !dev || dev->no_msi)
4556 return -EINVAL;
4557
4558 + /*
4559 + * You can't ask to have 0 or less MSIs configured.
4560 + * a) it's stupid ..
4561 + * b) the list manipulation code assumes nvec >= 1.
4562 + */
4563 + if (nvec < 1)
4564 + return -ERANGE;
4565 +
4566 /* Any bridge which does NOT route MSI transactions from it's
4567 * secondary bus to it's primary bus must set NO_MSI flag on
4568 * the secondary pci_bus.
4569 @@ -459,6 +466,13 @@ int pci_msi_supported(struct pci_dev * d
4570 if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
4571 return -EINVAL;
4572
4573 + ret = arch_msi_check_device(dev, nvec, type);
4574 + if (ret)
4575 + return ret;
4576 +
4577 + if (!pci_find_capability(dev, type))
4578 + return -EINVAL;
4579 +
4580 return 0;
4581 }
4582
4583 @@ -476,18 +490,15 @@ extern int pci_frontend_enable_msi(struc
4584 int pci_enable_msi(struct pci_dev* dev)
4585 {
4586 struct pci_bus *bus;
4587 - int pos, temp, status;
4588 -
4589 - if (pci_msi_supported(dev) < 0)
4590 - return -EINVAL;
4591 + int temp, status;
4592
4593 for (bus = dev->bus; bus; bus = bus->parent)
4594 if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
4595 return -EINVAL;
4596
4597 - status = msi_init();
4598 - if (status < 0)
4599 - return status;
4600 + status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
4601 + if (status)
4602 + return status;
4603
4604 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
4605 if (!is_initial_xendomain())
4606 @@ -508,10 +519,6 @@ int pci_enable_msi(struct pci_dev* dev)
4607
4608 temp = dev->irq;
4609
4610 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
4611 - if (!pos)
4612 - return -EINVAL;
4613 -
4614 /* Check whether driver already requested for MSI-X irqs */
4615 if (dev->msix_enabled) {
4616 printk(KERN_INFO "PCI: %s: Can't enable MSI. "
4617 @@ -526,15 +533,14 @@ int pci_enable_msi(struct pci_dev* dev)
4618
4619 return status;
4620 }
4621 +EXPORT_SYMBOL(pci_enable_msi);
4622
4623 extern void pci_frontend_disable_msi(struct pci_dev* dev);
4624 void pci_disable_msi(struct pci_dev* dev)
4625 {
4626 int pirq;
4627
4628 - if (!pci_msi_enable)
4629 - return;
4630 - if (!dev)
4631 + if (!pci_msi_enable || !dev)
4632 return;
4633
4634 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
4635 @@ -559,6 +565,7 @@ void pci_disable_msi(struct pci_dev* dev
4636 pci_intx(dev, 1); /* enable intx */
4637 dev->msi_enabled = 0;
4638 }
4639 +EXPORT_SYMBOL(pci_disable_msi);
4640
4641 /**
4642 * pci_enable_msix - configure device's MSI-X capability structure
4643 @@ -583,7 +590,7 @@ int pci_enable_msix(struct pci_dev* dev,
4644 int i, j, temp;
4645 u16 control;
4646
4647 - if (!entries || pci_msi_supported(dev) < 0)
4648 + if (!entries)
4649 return -EINVAL;
4650
4651 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
4652 @@ -621,14 +628,11 @@ int pci_enable_msix(struct pci_dev* dev,
4653 }
4654 #endif
4655
4656 - status = msi_init();
4657 - if (status < 0)
4658 + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSIX);
4659 + if (status)
4660 return status;
4661
4662 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
4663 - if (!pos)
4664 - return -EINVAL;
4665 -
4666 pci_read_config_word(dev, msi_control_reg(pos), &control);
4667 nr_entries = multi_msix_capable(control);
4668 if (nvec > nr_entries)
4669 @@ -660,6 +664,7 @@ int pci_enable_msix(struct pci_dev* dev,
4670
4671 return status;
4672 }
4673 +EXPORT_SYMBOL(pci_enable_msix);
4674
4675 extern void pci_frontend_disable_msix(struct pci_dev* dev);
4676 void pci_disable_msix(struct pci_dev* dev)
4677 @@ -699,6 +704,7 @@ void pci_disable_msix(struct pci_dev* de
4678 pci_intx(dev, 1); /* enable intx */
4679 dev->msix_enabled = 0;
4680 }
4681 +EXPORT_SYMBOL(pci_disable_msix);
4682
4683 /**
4684 * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state
4685 @@ -742,12 +748,57 @@ void pci_no_msi(void)
4686 pci_msi_enable = 0;
4687 }
4688
4689 -EXPORT_SYMBOL(pci_enable_msi);
4690 -EXPORT_SYMBOL(pci_disable_msi);
4691 -EXPORT_SYMBOL(pci_enable_msix);
4692 -EXPORT_SYMBOL(pci_disable_msix);
4693 -#ifdef CONFIG_XEN
4694 -EXPORT_SYMBOL(register_msi_get_owner);
4695 -EXPORT_SYMBOL(unregister_msi_get_owner);
4696 +void pci_msi_init_pci_dev(struct pci_dev *dev)
4697 +{
4698 +#ifndef CONFIG_XEN
4699 + INIT_LIST_HEAD(&dev->msi_list);
4700 #endif
4701 +}
4702 +
4703 +
4704 +/* Arch hooks */
4705 +
4706 +int __attribute__ ((weak))
4707 +arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
4708 +{
4709 + return 0;
4710 +}
4711 +
4712 +#ifndef CONFIG_XEN
4713 +int __attribute__ ((weak))
4714 +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
4715 +{
4716 + return 0;
4717 +}
4718 +
4719 +int __attribute__ ((weak))
4720 +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
4721 +{
4722 + struct msi_desc *entry;
4723 + int ret;
4724
4725 + list_for_each_entry(entry, &dev->msi_list, list) {
4726 + ret = arch_setup_msi_irq(dev, entry);
4727 + if (ret)
4728 + return ret;
4729 + }
4730 +
4731 + return 0;
4732 +}
4733 +
4734 +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
4735 +{
4736 + return;
4737 +}
4738 +
4739 +void __attribute__ ((weak))
4740 +arch_teardown_msi_irqs(struct pci_dev *dev)
4741 +{
4742 + struct msi_desc *entry;
4743 +
4744 + list_for_each_entry(entry, &dev->msi_list, list) {
4745 + if (entry->irq != 0)
4746 + arch_teardown_msi_irq(entry->irq);
4747 + }
4748 +}
4749 +#endif
4750 --- a/drivers/xen/blkfront/blkfront.c
4751 +++ b/drivers/xen/blkfront/blkfront.c
4752 @@ -241,7 +241,7 @@ static int setup_blkring(struct xenbus_d
4753 info->ring_ref = err;
4754
4755 err = bind_listening_port_to_irqhandler(
4756 - dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
4757 + dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
4758 if (err <= 0) {
4759 xenbus_dev_fatal(dev, err,
4760 "bind_listening_port_to_irqhandler");
4761 --- a/drivers/xen/char/mem.c
4762 +++ b/drivers/xen/char/mem.c
4763 @@ -18,7 +18,6 @@
4764 #include <linux/raw.h>
4765 #include <linux/tty.h>
4766 #include <linux/capability.h>
4767 -#include <linux/smp_lock.h>
4768 #include <linux/ptrace.h>
4769 #include <linux/device.h>
4770 #include <asm/pgalloc.h>
4771 --- a/drivers/xen/core/hypervisor_sysfs.c
4772 +++ b/drivers/xen/core/hypervisor_sysfs.c
4773 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
4774 if (!is_running_on_xen())
4775 return -ENODEV;
4776
4777 - hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
4778 + hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
4779 return 0;
4780 }
4781
4782 --- a/drivers/xen/core/smpboot.c
4783 +++ b/drivers/xen/core/smpboot.c
4784 @@ -165,13 +165,12 @@ static void xen_smp_intr_exit(unsigned i
4785
4786 void __cpuinit cpu_bringup(void)
4787 {
4788 + cpu_init();
4789 #ifdef __i386__
4790 - cpu_set_gdt(current_thread_info()->cpu);
4791 - secondary_cpu_init();
4792 + identify_secondary_cpu(cpu_data + smp_processor_id());
4793 #else
4794 - cpu_init();
4795 -#endif
4796 identify_cpu(cpu_data + smp_processor_id());
4797 +#endif
4798 touch_softlockup_watchdog();
4799 preempt_disable();
4800 local_irq_enable();
4801 @@ -191,11 +190,6 @@ static void __cpuinit cpu_initialize_con
4802 static DEFINE_SPINLOCK(ctxt_lock);
4803
4804 struct task_struct *idle = idle_task(cpu);
4805 -#ifdef __x86_64__
4806 - struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
4807 -#else
4808 - struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4809 -#endif
4810
4811 if (cpu_test_and_set(cpu, cpu_initialized_map))
4812 return;
4813 @@ -218,11 +212,11 @@ static void __cpuinit cpu_initialize_con
4814 smp_trap_init(ctxt.trap_ctxt);
4815
4816 ctxt.ldt_ents = 0;
4817 -
4818 - ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
4819 - ctxt.gdt_ents = gdt_descr->size / 8;
4820 + ctxt.gdt_ents = GDT_SIZE / 8;
4821
4822 #ifdef __i386__
4823 + ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
4824 +
4825 ctxt.user_regs.cs = __KERNEL_CS;
4826 ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
4827
4828 @@ -235,7 +229,11 @@ static void __cpuinit cpu_initialize_con
4829 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
4830
4831 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
4832 +
4833 + ctxt.user_regs.fs = __KERNEL_PERCPU;
4834 #else /* __x86_64__ */
4835 + ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
4836 +
4837 ctxt.user_regs.cs = __KERNEL_CS;
4838 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
4839
4840 @@ -265,9 +263,8 @@ void __init smp_prepare_cpus(unsigned in
4841 struct vcpu_get_physid cpu_id;
4842 #ifdef __x86_64__
4843 struct desc_ptr *gdt_descr;
4844 -#else
4845 - struct Xgt_desc_struct *gdt_descr;
4846 #endif
4847 + void *gdt_addr;
4848
4849 apicid = 0;
4850 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
4851 @@ -317,14 +314,12 @@ void __init smp_prepare_cpus(unsigned in
4852 }
4853 gdt_descr->size = GDT_SIZE;
4854 memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
4855 + gdt_addr = (void *)gdt_descr->address;
4856 #else
4857 - if (unlikely(!init_gdt(cpu, idle)))
4858 - continue;
4859 - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4860 + init_gdt(cpu);
4861 + gdt_addr = get_cpu_gdt_table(cpu);
4862 #endif
4863 - make_page_readonly(
4864 - (void *)gdt_descr->address,
4865 - XENFEAT_writable_descriptor_tables);
4866 + make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
4867
4868 apicid = cpu;
4869 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
4870 @@ -338,7 +333,9 @@ void __init smp_prepare_cpus(unsigned in
4871 #ifdef __x86_64__
4872 cpu_pda(cpu)->pcurrent = idle;
4873 cpu_pda(cpu)->cpunumber = cpu;
4874 - clear_ti_thread_flag(idle->thread_info, TIF_FORK);
4875 + clear_ti_thread_flag(task_thread_info(idle), TIF_FORK);
4876 +#else
4877 + per_cpu(current_task, cpu) = idle;
4878 #endif
4879
4880 irq_ctx_init(cpu);
4881 @@ -363,8 +360,12 @@ void __init smp_prepare_cpus(unsigned in
4882 #endif
4883 }
4884
4885 -void __devinit smp_prepare_boot_cpu(void)
4886 +void __init smp_prepare_boot_cpu(void)
4887 {
4888 +#ifdef __i386__
4889 + init_gdt(smp_processor_id());
4890 + switch_to_new_gdt();
4891 +#endif
4892 prefill_possible_map();
4893 }
4894
4895 --- a/drivers/xen/core/xen_sysfs.c
4896 +++ b/drivers/xen/core/xen_sysfs.c
4897 @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
4898
4899 static int __init xen_sysfs_type_init(void)
4900 {
4901 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4902 + return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
4903 }
4904
4905 static void xen_sysfs_type_destroy(void)
4906 {
4907 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4908 + sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
4909 }
4910
4911 /* xen version attributes */
4912 @@ -90,13 +90,13 @@ static struct attribute_group version_gr
4913
4914 static int __init xen_sysfs_version_init(void)
4915 {
4916 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4917 + return sysfs_create_group(&hypervisor_subsys.kobj,
4918 &version_group);
4919 }
4920
4921 static void xen_sysfs_version_destroy(void)
4922 {
4923 - sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
4924 + sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
4925 }
4926
4927 /* UUID */
4928 @@ -126,12 +126,12 @@ HYPERVISOR_ATTR_RO(uuid);
4929
4930 static int __init xen_sysfs_uuid_init(void)
4931 {
4932 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4933 + return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4934 }
4935
4936 static void xen_sysfs_uuid_destroy(void)
4937 {
4938 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4939 + sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4940 }
4941
4942 /* xen compilation attributes */
4943 @@ -204,13 +204,13 @@ static struct attribute_group xen_compil
4944
4945 int __init static xen_compilation_init(void)
4946 {
4947 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4948 + return sysfs_create_group(&hypervisor_subsys.kobj,
4949 &xen_compilation_group);
4950 }
4951
4952 static void xen_compilation_destroy(void)
4953 {
4954 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4955 + sysfs_remove_group(&hypervisor_subsys.kobj,
4956 &xen_compilation_group);
4957 }
4958
4959 @@ -325,13 +325,13 @@ static struct attribute_group xen_proper
4960
4961 static int __init xen_properties_init(void)
4962 {
4963 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4964 + return sysfs_create_group(&hypervisor_subsys.kobj,
4965 &xen_properties_group);
4966 }
4967
4968 static void xen_properties_destroy(void)
4969 {
4970 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4971 + sysfs_remove_group(&hypervisor_subsys.kobj,
4972 &xen_properties_group);
4973 }
4974
4975 @@ -350,13 +350,13 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
4976
4977 static int __init xen_sysfs_vmcoreinfo_init(void)
4978 {
4979 - return sysfs_create_file(&hypervisor_subsys.kset.kobj,
4980 + return sysfs_create_file(&hypervisor_subsys.kobj,
4981 &vmcoreinfo_attr.attr);
4982 }
4983
4984 static void xen_sysfs_vmcoreinfo_destroy(void)
4985 {
4986 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &vmcoreinfo_attr.attr);
4987 + sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
4988 }
4989
4990 #endif
4991 --- a/drivers/xen/netback/netback.c
4992 +++ b/drivers/xen/netback/netback.c
4993 @@ -179,7 +179,7 @@ static struct sk_buff *netbk_copy_skb(st
4994 goto err;
4995
4996 skb_reserve(nskb, 16 + NET_IP_ALIGN);
4997 - headlen = nskb->end - nskb->data;
4998 + headlen = skb_end_pointer(nskb) - nskb->data;
4999 if (headlen > skb_headlen(skb))
5000 headlen = skb_headlen(skb);
5001 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
5002 @@ -225,11 +225,15 @@ static struct sk_buff *netbk_copy_skb(st
5003 len -= copy;
5004 }
5005
5006 +#ifdef NET_SKBUFF_DATA_USES_OFFSET
5007 + offset = 0;
5008 +#else
5009 offset = nskb->data - skb->data;
5010 +#endif
5011
5012 - nskb->h.raw = skb->h.raw + offset;
5013 - nskb->nh.raw = skb->nh.raw + offset;
5014 - nskb->mac.raw = skb->mac.raw + offset;
5015 + nskb->transport_header = skb->transport_header + offset;
5016 + nskb->network_header = skb->network_header + offset;
5017 + nskb->mac_header = skb->mac_header + offset;
5018
5019 return nskb;
5020
5021 @@ -1601,7 +1605,7 @@ static int __init netback_init(void)
5022 (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
5023 0,
5024 netif_be_dbg,
5025 - SA_SHIRQ,
5026 + IRQF_SHARED,
5027 "net-be-dbg",
5028 &netif_be_dbg);
5029 #endif
5030 --- a/drivers/xen/netfront/netfront.c
5031 +++ b/drivers/xen/netfront/netfront.c
5032 @@ -513,7 +513,7 @@ static int setup_device(struct xenbus_de
5033 memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
5034
5035 err = bind_listening_port_to_irqhandler(
5036 - dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name,
5037 + dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
5038 netdev);
5039 if (err < 0)
5040 goto fail;
5041 --- a/drivers/xen/pciback/xenbus.c
5042 +++ b/drivers/xen/pciback/xenbus.c
5043 @@ -99,7 +99,7 @@ static int pciback_do_attach(struct pcib
5044
5045 err = bind_interdomain_evtchn_to_irqhandler(
5046 pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
5047 - SA_SAMPLE_RANDOM, "pciback", pdev);
5048 + IRQF_SAMPLE_RANDOM, "pciback", pdev);
5049 if (err < 0) {
5050 xenbus_dev_fatal(pdev->xdev, err,
5051 "Error binding event channel to IRQ");
5052 --- a/drivers/xen/pcifront/xenbus.c
5053 +++ b/drivers/xen/pcifront/xenbus.c
5054 @@ -10,10 +10,6 @@
5055 #include <xen/gnttab.h>
5056 #include "pcifront.h"
5057
5058 -#ifndef __init_refok
5059 -#define __init_refok
5060 -#endif
5061 -
5062 #define INVALID_GRANT_REF (0)
5063 #define INVALID_EVTCHN (-1)
5064
5065 --- a/drivers/xen/scsifront/xenbus.c
5066 +++ b/drivers/xen/scsifront/xenbus.c
5067 @@ -96,7 +96,7 @@ static int scsifront_alloc_ring(struct v
5068
5069 err = bind_listening_port_to_irqhandler(
5070 dev->otherend_id, scsifront_intr,
5071 - SA_SAMPLE_RANDOM, "scsifront", info);
5072 + IRQF_SAMPLE_RANDOM, "scsifront", info);
5073
5074 if (err <= 0) {
5075 xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler");
5076 --- a/drivers/xen/sfc_netback/accel_fwd.c
5077 +++ b/drivers/xen/sfc_netback/accel_fwd.c
5078 @@ -308,7 +308,7 @@ static struct netback_accel *for_a_vnic(
5079 static inline int packet_is_arp_reply(struct sk_buff *skb)
5080 {
5081 return skb->protocol == ntohs(ETH_P_ARP)
5082 - && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY);
5083 + && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY);
5084 }
5085
5086
5087 @@ -392,12 +392,13 @@ void netback_accel_tx_packet(struct sk_b
5088
5089 BUG_ON(fwd_priv == NULL);
5090
5091 - if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) {
5092 + if (is_broadcast_ether_addr(skb_mac_header(skb))
5093 + && packet_is_arp_reply(skb)) {
5094 /*
5095 * update our fast path forwarding to reflect this
5096 * gratuitous ARP
5097 */
5098 - mac = skb->mac.raw+ETH_ALEN;
5099 + mac = skb_mac_header(skb)+ETH_ALEN;
5100
5101 DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n",
5102 __FUNCTION__, MAC_ARG(mac));
5103 --- a/drivers/xen/sfc_netback/accel_solarflare.c
5104 +++ b/drivers/xen/sfc_netback/accel_solarflare.c
5105 @@ -114,7 +114,7 @@ bend_dl_tx_packet(struct efx_dl_device *
5106 BUG_ON(port == NULL);
5107
5108 NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
5109 - if (skb->mac.raw != NULL)
5110 + if (skb_mac_header_was_set(skb))
5111 netback_accel_tx_packet(skb, port->fwd_priv);
5112 else {
5113 DPRINTK("Ignoring packet with missing mac address\n");
5114 --- a/drivers/xen/sfc_netfront/accel_tso.c
5115 +++ b/drivers/xen/sfc_netfront/accel_tso.c
5116 @@ -33,10 +33,9 @@
5117
5118 #include "accel_tso.h"
5119
5120 -#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2))
5121 -#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data)
5122 -#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data)
5123 -#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data)
5124 +#define ETH_HDR_LEN(skb) skb_network_offset(skb)
5125 +#define SKB_TCP_OFF(skb) skb_transport_offset(skb)
5126 +#define SKB_IP_OFF(skb) skb_network_offset(skb)
5127
5128 /*
5129 * Set a maximum number of buffers in each output packet to make life
5130 @@ -114,9 +113,8 @@ struct netfront_accel_tso_state {
5131 static inline void tso_check_safe(struct sk_buff *skb) {
5132 EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
5133 EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
5134 - EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP);
5135 - EPRINTK_ON((SKB_TCP_OFF(skb)
5136 - + (skb->h.th->doff << 2u)) > skb_headlen(skb));
5137 + EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
5138 + EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
5139 }
5140
5141
5142 @@ -129,17 +127,17 @@ static inline void tso_start(struct netf
5143 * All ethernet/IP/TCP headers combined size is TCP header size
5144 * plus offset of TCP header relative to start of packet.
5145 */
5146 - st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb);
5147 + st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
5148 st->p.full_packet_size = (st->p.header_length
5149 + skb_shinfo(skb)->gso_size);
5150 st->p.gso_size = skb_shinfo(skb)->gso_size;
5151
5152 - st->p.ip_id = htons(skb->nh.iph->id);
5153 - st->seqnum = ntohl(skb->h.th->seq);
5154 + st->p.ip_id = htons(ip_hdr(skb)->id);
5155 + st->seqnum = ntohl(tcp_hdr(skb)->seq);
5156
5157 - EPRINTK_ON(skb->h.th->urg);
5158 - EPRINTK_ON(skb->h.th->syn);
5159 - EPRINTK_ON(skb->h.th->rst);
5160 + EPRINTK_ON(tcp_hdr(skb)->urg);
5161 + EPRINTK_ON(tcp_hdr(skb)->syn);
5162 + EPRINTK_ON(tcp_hdr(skb)->rst);
5163
5164 st->remaining_len = skb->len - st->p.header_length;
5165
5166 @@ -258,8 +256,8 @@ int tso_start_new_packet(netfront_accel_
5167 /* This packet will be the last in the TSO burst. */
5168 ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
5169 + st->remaining_len);
5170 - tsoh_th->fin = skb->h.th->fin;
5171 - tsoh_th->psh = skb->h.th->psh;
5172 + tsoh_th->fin = tcp_hdr(skb)->fin;
5173 + tsoh_th->psh = tcp_hdr(skb)->psh;
5174 }
5175
5176 tsoh_iph->tot_len = htons(ip_length);
5177 --- a/drivers/xen/sfc_netfront/accel_vi.c
5178 +++ b/drivers/xen/sfc_netfront/accel_vi.c
5179 @@ -463,7 +463,7 @@ netfront_accel_enqueue_skb_multi(netfron
5180
5181 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5182 /* Set to zero to encourage falcon to work it out for us */
5183 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
5184 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
5185 }
5186
5187 if (multi_post_start_new_buffer(vnic, &state)) {
5188 @@ -582,7 +582,7 @@ netfront_accel_enqueue_skb_single(netfro
5189
5190 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5191 /* Set to zero to encourage falcon to work it out for us */
5192 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
5193 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
5194 }
5195 NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
5196 (skb, idx, frag_data, frag_len, {
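
Both checksum hunks in this file reflect the same 2.6.22 change: skb->csum_start is an offset from skb->head and replaces the dropped skb->h.raw pointer when locating the checksum field of a CHECKSUM_PARTIAL skb. A sketch of the pattern (illustration only; the example_ name is invented):

    #include <linux/skbuff.h>

    /* Zero the yet-to-be-computed checksum field so the hardware can fill
     * it in.  In 2.6.22 the field is found from skb->head via csum_start;
     * before that the transport-header pointer skb->h.raw was used. */
    static void example_clear_partial_csum(struct sk_buff *skb)
    {
            if (skb->ip_summed == CHECKSUM_PARTIAL)
                    *(u16 *)(skb->head + skb->csum_start + skb->csum_offset) = 0;
    }
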
5197 --- a/drivers/xen/sfc_netfront/accel_xenbus.c
5198 +++ b/drivers/xen/sfc_netfront/accel_xenbus.c
5199 @@ -356,7 +356,7 @@ static int vnic_setup_domU_shared_state(
5200 /* Create xenbus msg event channel */
5201 err = bind_listening_port_to_irqhandler
5202 (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
5203 - SA_SAMPLE_RANDOM, "vnicctrl", vnic);
5204 + IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
5205 if (err < 0) {
5206 EPRINTK("Couldn't bind msg event channel\n");
5207 goto fail_msg_irq;
5208 @@ -367,7 +367,7 @@ static int vnic_setup_domU_shared_state(
5209 /* Create xenbus net event channel */
5210 err = bind_listening_port_to_irqhandler
5211 (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
5212 - SA_SAMPLE_RANDOM, "vnicfront", vnic);
5213 + IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
5214 if (err < 0) {
5215 EPRINTK("Couldn't bind net event channel\n");
5216 goto fail_net_irq;
5217 --- a/fs/aio.c
5218 +++ b/fs/aio.c
5219 @@ -38,7 +38,7 @@
5220
5221 #ifdef CONFIG_EPOLL
5222 #include <linux/poll.h>
5223 -#include <linux/eventpoll.h>
5224 +#include <linux/anon_inodes.h>
5225 #endif
5226
5227 #if DEBUG > 1
5228 @@ -1309,7 +1309,7 @@ static const struct file_operations aioq
5229
5230 /* make_aio_fd:
5231 * Create a file descriptor that can be used to poll the event queue.
5232 - * Based and piggybacked on the excellent epoll code.
5233 + * Based on the excellent epoll code.
5234 */
5235
5236 static int make_aio_fd(struct kioctx *ioctx)
5237 @@ -1318,7 +1318,8 @@ static int make_aio_fd(struct kioctx *io
5238 struct inode *inode;
5239 struct file *file;
5240
5241 - error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
5242 + error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
5243 + &aioq_fops, ioctx);
5244 if (error)
5245 return error;
5246
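
ep_getfd() was an epoll-private helper; with eventpoll moving to the generic anon-inode layer, the Xen aio-poll extension calls anon_inode_getfd() instead. A sketch of the 2.6.22-era call as used above (illustration only; the signature changed again in later kernels, and the example_ names are invented):

    #include <linux/anon_inodes.h>
    #include <linux/fs.h>

    /* Hypothetical example: create an fd whose file_operations poll the aio
     * event queue; "aioq_fops" and "ctx" stand in for the driver's own. */
    static int example_make_aio_fd(void *ctx, const struct file_operations *aioq_fops)
    {
            int fd, error;
            struct inode *inode;
            struct file *file;

            /* 2.6.22 signature: fills in fd/inode/file on success and stores
             * the private pointer in file->private_data. */
            error = anon_inode_getfd(&fd, &inode, &file, "[aioq]", aioq_fops, ctx);
            if (error)
                    return error;
            return fd;
    }
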
5247 --- a/include/asm-x86/mach-xen/asm/desc_32.h
5248 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
5249 @@ -11,23 +11,24 @@
5250
5251 #include <asm/mmu.h>
5252
5253 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
5254 -
5255 struct Xgt_desc_struct {
5256 unsigned short size;
5257 unsigned long address __attribute__((packed));
5258 unsigned short pad;
5259 } __attribute__ ((packed));
5260
5261 -extern struct Xgt_desc_struct idt_descr;
5262 -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
5263 -extern struct Xgt_desc_struct early_gdt_descr;
5264 +struct gdt_page
5265 +{
5266 + struct desc_struct gdt[GDT_ENTRIES];
5267 +} __attribute__((aligned(PAGE_SIZE)));
5268 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
5269
5270 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
5271 {
5272 - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
5273 + return per_cpu(gdt_page, cpu).gdt;
5274 }
5275
5276 +extern struct Xgt_desc_struct idt_descr;
5277 extern struct desc_struct idt_table[];
5278 extern void set_intr_gate(unsigned int irq, void * addr);
5279
5280 @@ -55,53 +56,32 @@ static inline void pack_gate(__u32 *a, _
5281 #define DESCTYPE_S 0x10 /* !system */
5282
5283 #ifndef CONFIG_XEN
5284 -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
5285 -
5286 -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
5287 -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
5288 +#define load_TR_desc() native_load_tr_desc()
5289 +#define load_gdt(dtr) native_load_gdt(dtr)
5290 +#define load_idt(dtr) native_load_idt(dtr)
5291 #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
5292 #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
5293
5294 -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
5295 -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
5296 -#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
5297 +#define store_gdt(dtr) native_store_gdt(dtr)
5298 +#define store_idt(dtr) native_store_idt(dtr)
5299 +#define store_tr(tr) (tr = native_store_tr())
5300 #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
5301 -#endif
5302
5303 -#if TLS_SIZE != 24
5304 -# error update this code.
5305 -#endif
5306 -
5307 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
5308 -{
5309 -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
5310 - *(u64 *)&t->tls_array[i]) \
5311 - BUG()
5312 - C(0); C(1); C(2);
5313 -#undef C
5314 -}
5315 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
5316 +#define set_ldt native_set_ldt
5317
5318 -#ifndef CONFIG_XEN
5319 #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
5320 #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
5321 #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
5322
5323 -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
5324 +static inline void write_dt_entry(struct desc_struct *dt,
5325 + int entry, u32 entry_low, u32 entry_high)
5326 {
5327 - __u32 *lp = (__u32 *)((char *)dt + entry*8);
5328 - *lp = entry_a;
5329 - *(lp+1) = entry_b;
5330 + dt[entry].a = entry_low;
5331 + dt[entry].b = entry_high;
5332 }
5333 -#define set_ldt native_set_ldt
5334 -#else
5335 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
5336 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
5337 -#define set_ldt xen_set_ldt
5338 -#endif
5339
5340 -#ifndef CONFIG_XEN
5341 -static inline fastcall void native_set_ldt(const void *addr,
5342 - unsigned int entries)
5343 +static inline void native_set_ldt(const void *addr, unsigned int entries)
5344 {
5345 if (likely(entries == 0))
5346 __asm__ __volatile__("lldt %w0"::"q" (0));
5347 @@ -116,6 +96,65 @@ static inline fastcall void native_set_l
5348 __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
5349 }
5350 }
5351 +
5352 +
5353 +static inline void native_load_tr_desc(void)
5354 +{
5355 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
5356 +}
5357 +
5358 +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
5359 +{
5360 + asm volatile("lgdt %0"::"m" (*dtr));
5361 +}
5362 +
5363 +static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
5364 +{
5365 + asm volatile("lidt %0"::"m" (*dtr));
5366 +}
5367 +
5368 +static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
5369 +{
5370 + asm ("sgdt %0":"=m" (*dtr));
5371 +}
5372 +
5373 +static inline void native_store_idt(struct Xgt_desc_struct *dtr)
5374 +{
5375 + asm ("sidt %0":"=m" (*dtr));
5376 +}
5377 +
5378 +static inline unsigned long native_store_tr(void)
5379 +{
5380 + unsigned long tr;
5381 + asm ("str %0":"=r" (tr));
5382 + return tr;
5383 +}
5384 +
5385 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
5386 +{
5387 + unsigned int i;
5388 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
5389 +
5390 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5391 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
5392 +}
5393 +#else
5394 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
5395 +#define set_ldt xen_set_ldt
5396 +
5397 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
5398 +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
5399 +
5400 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
5401 +{
5402 + unsigned int i;
5403 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
5404 +
5405 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5406 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
5407 + *(u64 *)&t->tls_array[i]))
5408 + BUG();
5409 +}
5410 #endif
5411
5412 #ifndef CONFIG_X86_NO_IDT
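
The per-CPU GDT moves into a page-aligned struct gdt_page per-CPU variable, and the unrolled C(0); C(1); C(2) TLS update becomes a loop over GDT_ENTRY_TLS_ENTRIES (which is why the TLS_SIZE != 24 build check can go). A sketch of the native-side load, mirroring native_load_tls() above (illustration only; the example_ name is invented):

    #include <asm/desc.h>
    #include <asm/processor.h>

    /* Load the TLS descriptors for "cpu" the way native_load_tls() now does
     * it, reaching the GDT through the per-CPU gdt_page. */
    static void example_load_tls(struct thread_struct *t, unsigned int cpu)
    {
            struct desc_struct *gdt = get_cpu_gdt_table(cpu); /* per_cpu(gdt_page, cpu).gdt */
            unsigned int i;

            for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
                    gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
    }
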
5413 --- a/include/asm-x86/mach-xen/asm/desc_64.h
5414 +++ b/include/asm-x86/mach-xen/asm/desc_64.h
5415 @@ -127,16 +127,6 @@ static inline void set_ldt_desc(unsigned
5416 DESC_LDT, size * 8 - 1);
5417 }
5418
5419 -static inline void set_seg_base(unsigned cpu, int entry, void *base)
5420 -{
5421 - struct desc_struct *d = &cpu_gdt(cpu)[entry];
5422 - u32 addr = (u32)(u64)base;
5423 - BUG_ON((u64)base >> 32);
5424 - d->base0 = addr & 0xffff;
5425 - d->base1 = (addr >> 16) & 0xff;
5426 - d->base2 = (addr >> 24) & 0xff;
5427 -}
5428 -
5429 #define LDT_entry_a(info) \
5430 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
5431 /* Don't allow setting of the lm bit. It is useless anyways because
5432 @@ -165,25 +155,15 @@ static inline void set_seg_base(unsigned
5433 (info)->useable == 0 && \
5434 (info)->lm == 0)
5435
5436 -#if TLS_SIZE != 24
5437 -# error update this code.
5438 -#endif
5439 -
5440 static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
5441 {
5442 -#if 0
5443 + unsigned int i;
5444 u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
5445 - gdt[0] = t->tls_array[0];
5446 - gdt[1] = t->tls_array[1];
5447 - gdt[2] = t->tls_array[2];
5448 -#endif
5449 -#define C(i) \
5450 - if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
5451 - t->tls_array[i])) \
5452 - BUG();
5453
5454 - C(0); C(1); C(2);
5455 -#undef C
5456 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5457 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
5458 + t->tls_array[i]))
5459 + BUG();
5460 }
5461
5462 /*
5463 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5464 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5465 @@ -51,7 +51,7 @@ struct dma_mapping_ops {
5466 };
5467
5468 extern dma_addr_t bad_dma_address;
5469 -extern struct dma_mapping_ops* dma_ops;
5470 +extern const struct dma_mapping_ops* dma_ops;
5471 extern int iommu_merge;
5472
5473 #if 0
5474 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
5475 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
5476 @@ -19,10 +19,8 @@
5477 * the start of the fixmap.
5478 */
5479 extern unsigned long __FIXADDR_TOP;
5480 -#ifdef CONFIG_COMPAT_VDSO
5481 -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5482 -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5483 -#endif
5484 +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5485 +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5486
5487 #ifndef __ASSEMBLY__
5488 #include <linux/kernel.h>
5489 @@ -85,6 +83,9 @@ enum fixed_addresses {
5490 #ifdef CONFIG_PCI_MMCONFIG
5491 FIX_PCIE_MCFG,
5492 #endif
5493 +#ifdef CONFIG_PARAVIRT
5494 + FIX_PARAVIRT_BOOTMAP,
5495 +#endif
5496 FIX_SHARED_INFO,
5497 #define NR_FIX_ISAMAPS 256
5498 FIX_ISAMAP_END,
5499 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
5500 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
5501 @@ -15,7 +15,6 @@
5502 #include <asm/apicdef.h>
5503 #include <asm/page.h>
5504 #include <asm/vsyscall.h>
5505 -#include <asm/vsyscall32.h>
5506 #include <asm/acpi.h>
5507
5508 /*
5509 --- a/include/asm-x86/mach-xen/asm/highmem.h
5510 +++ b/include/asm-x86/mach-xen/asm/highmem.h
5511 @@ -67,12 +67,18 @@ extern void FASTCALL(kunmap_high(struct
5512
5513 void *kmap(struct page *page);
5514 void kunmap(struct page *page);
5515 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
5516 void *kmap_atomic(struct page *page, enum km_type type);
5517 void *kmap_atomic_pte(struct page *page, enum km_type type);
5518 void kunmap_atomic(void *kvaddr, enum km_type type);
5519 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
5520 struct page *kmap_atomic_to_page(void *ptr);
5521
5522 +#define kmap_atomic_pte(page, type) \
5523 + kmap_atomic_prot(page, type, \
5524 + test_bit(PG_pinned, &(page)->flags) \
5525 + ? PAGE_KERNEL_RO : kmap_prot)
5526 +
5527 #define flush_cache_kmaps() do { } while (0)
5528
5529 void clear_highpage(struct page *);
5530 --- a/include/asm-x86/mach-xen/asm/io_32.h
5531 +++ b/include/asm-x86/mach-xen/asm/io_32.h
5532 @@ -263,15 +263,18 @@ static inline void flush_write_buffers(v
5533
5534 #endif /* __KERNEL__ */
5535
5536 -#define __SLOW_DOWN_IO "outb %%al,$0x80;"
5537 +static inline void xen_io_delay(void)
5538 +{
5539 + asm volatile("outb %%al,$0x80" : : : "memory");
5540 +}
5541
5542 static inline void slow_down_io(void) {
5543 - __asm__ __volatile__(
5544 - __SLOW_DOWN_IO
5545 + xen_io_delay();
5546 #ifdef REALLY_SLOW_IO
5547 - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
5548 + xen_io_delay();
5549 + xen_io_delay();
5550 + xen_io_delay();
5551 #endif
5552 - : : );
5553 }
5554
5555 #ifdef CONFIG_X86_NUMAQ
5556 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
5557 +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
5558 @@ -11,6 +11,40 @@
5559 #define _ASM_IRQFLAGS_H
5560
5561 #ifndef __ASSEMBLY__
5562 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
5563 +
5564 +#define xen_restore_fl(f) \
5565 +do { \
5566 + vcpu_info_t *_vcpu; \
5567 + barrier(); \
5568 + _vcpu = current_vcpu_info(); \
5569 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
5570 + barrier(); /* unmask then check (avoid races) */\
5571 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5572 + force_evtchn_callback(); \
5573 + } \
5574 +} while (0)
5575 +
5576 +#define xen_irq_disable() \
5577 +do { \
5578 + current_vcpu_info()->evtchn_upcall_mask = 1; \
5579 + barrier(); \
5580 +} while (0)
5581 +
5582 +#define xen_irq_enable() \
5583 +do { \
5584 + vcpu_info_t *_vcpu; \
5585 + barrier(); \
5586 + _vcpu = current_vcpu_info(); \
5587 + _vcpu->evtchn_upcall_mask = 0; \
5588 + barrier(); /* unmask then check (avoid races) */ \
5589 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5590 + force_evtchn_callback(); \
5591 +} while (0)
5592 +
5593 +void xen_safe_halt(void);
5594 +
5595 +void xen_halt(void);
5596
5597 /*
5598 * The use of 'barrier' in the following reflects their use as local-lock
5599 @@ -20,48 +54,31 @@
5600 * includes these barriers, for example.
5601 */
5602
5603 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
5604 +#define __raw_local_save_flags() xen_save_fl()
5605
5606 -#define raw_local_irq_restore(x) \
5607 -do { \
5608 - vcpu_info_t *_vcpu; \
5609 - barrier(); \
5610 - _vcpu = current_vcpu_info(); \
5611 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
5612 - barrier(); /* unmask then check (avoid races) */ \
5613 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5614 - force_evtchn_callback(); \
5615 - } \
5616 -} while (0)
5617 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
5618
5619 -#define raw_local_irq_disable() \
5620 -do { \
5621 - current_vcpu_info()->evtchn_upcall_mask = 1; \
5622 - barrier(); \
5623 -} while (0)
5624 +#define raw_local_irq_disable() xen_irq_disable()
5625
5626 -#define raw_local_irq_enable() \
5627 -do { \
5628 - vcpu_info_t *_vcpu; \
5629 - barrier(); \
5630 - _vcpu = current_vcpu_info(); \
5631 - _vcpu->evtchn_upcall_mask = 0; \
5632 - barrier(); /* unmask then check (avoid races) */ \
5633 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5634 - force_evtchn_callback(); \
5635 -} while (0)
5636 +#define raw_local_irq_enable() xen_irq_enable()
5637
5638 /*
5639 * Used in the idle loop; sti takes one instruction cycle
5640 * to complete:
5641 */
5642 -void raw_safe_halt(void);
5643 +static inline void raw_safe_halt(void)
5644 +{
5645 + xen_safe_halt();
5646 +}
5647
5648 /*
5649 * Used when interrupts are already enabled or to
5650 * shutdown the processor:
5651 */
5652 -void halt(void);
5653 +static inline void halt(void)
5654 +{
5655 + xen_halt();
5656 +}
5657
5658 /*
5659 * For spinlocks, etc:
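
Under Xen the "interrupt flag" is the evtchn_upcall_mask byte in the shared vcpu_info structure; these hunks only rename the primitives (xen_save_fl(), xen_restore_fl(), xen_irq_disable(), xen_irq_enable()) and turn the raw_* macros into thin wrappers. The enable path they implement, restated as a function (illustration only; the example_ name is invented):

    /* Xen "enable interrupts": clear the mask, then re-check for events
     * that arrived while we were masked, since no hardware IF bit exists. */
    static inline void example_xen_irq_enable(void)
    {
            vcpu_info_t *vcpu;

            barrier();
            vcpu = current_vcpu_info();
            vcpu->evtchn_upcall_mask = 0;           /* unmask */
            barrier();                              /* unmask then check (avoid races) */
            if (unlikely(vcpu->evtchn_upcall_pending))
                    force_evtchn_callback();        /* deliver anything we missed */
    }
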
5660 --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
5661 +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h
5662 @@ -9,6 +9,7 @@
5663 */
5664 #ifndef _ASM_IRQFLAGS_H
5665 #define _ASM_IRQFLAGS_H
5666 +#include <asm/processor-flags.h>
5667
5668 #ifndef __ASSEMBLY__
5669 /*
5670 @@ -50,19 +51,19 @@ static inline void raw_local_irq_disable
5671 {
5672 unsigned long flags = __raw_local_save_flags();
5673
5674 - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
5675 + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
5676 }
5677
5678 static inline void raw_local_irq_enable(void)
5679 {
5680 unsigned long flags = __raw_local_save_flags();
5681
5682 - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
5683 + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
5684 }
5685
5686 static inline int raw_irqs_disabled_flags(unsigned long flags)
5687 {
5688 - return !(flags & (1<<9)) || (flags & (1 << 18));
5689 + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
5690 }
5691
5692 #else /* CONFIG_X86_VSMP */
5693 @@ -118,13 +119,21 @@ static inline int raw_irqs_disabled_flag
5694 * Used in the idle loop; sti takes one instruction cycle
5695 * to complete:
5696 */
5697 -void raw_safe_halt(void);
5698 +void xen_safe_halt(void);
5699 +static inline void raw_safe_halt(void)
5700 +{
5701 + xen_safe_halt();
5702 +}
5703
5704 /*
5705 * Used when interrupts are already enabled or to
5706 * shutdown the processor:
5707 */
5708 -void halt(void);
5709 +void xen_halt(void);
5710 +static inline void halt(void)
5711 +{
5712 + xen_halt();
5713 +}
5714
5715 #else /* __ASSEMBLY__: */
5716 # ifdef CONFIG_TRACE_IRQFLAGS
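
The VSMP variant had open-coded the IF and AC bits as (1 << 9) and (1 << 18); with <asm/processor-flags.h> included they get their symbolic names. Restated as a sketch (illustration only; the example_ name is invented):

    #include <asm/processor-flags.h>

    /* VSMP treats "interrupts disabled" as IF clear *or* AC set; the
     * symbolic EFLAGS names replace the magic shift counts. */
    static inline int example_irqs_disabled_flags(unsigned long flags)
    {
            return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
    }
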
5717 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
5718 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
5719 @@ -6,6 +6,20 @@
5720 #include <asm/pgalloc.h>
5721 #include <asm/tlbflush.h>
5722
5723 +void arch_exit_mmap(struct mm_struct *mm);
5724 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5725 +
5726 +void mm_pin(struct mm_struct *mm);
5727 +void mm_unpin(struct mm_struct *mm);
5728 +void mm_pin_all(void);
5729 +
5730 +static inline void xen_activate_mm(struct mm_struct *prev,
5731 + struct mm_struct *next)
5732 +{
5733 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5734 + mm_pin(next);
5735 +}
5736 +
5737 /*
5738 * Used for LDT copy/destruction.
5739 */
5740 @@ -37,10 +51,6 @@ static inline void __prepare_arch_switch
5741 : : "r" (0) );
5742 }
5743
5744 -extern void mm_pin(struct mm_struct *mm);
5745 -extern void mm_unpin(struct mm_struct *mm);
5746 -void mm_pin_all(void);
5747 -
5748 static inline void switch_mm(struct mm_struct *prev,
5749 struct mm_struct *next,
5750 struct task_struct *tsk)
5751 @@ -97,11 +107,10 @@ static inline void switch_mm(struct mm_s
5752 #define deactivate_mm(tsk, mm) \
5753 asm("movl %0,%%gs": :"r" (0));
5754
5755 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
5756 -{
5757 - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5758 - mm_pin(next);
5759 - switch_mm(prev, next, NULL);
5760 -}
5761 +#define activate_mm(prev, next) \
5762 + do { \
5763 + xen_activate_mm(prev, next); \
5764 + switch_mm((prev),(next),NULL); \
5765 + } while(0)
5766
5767 #endif
5768 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
5769 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
5770 @@ -9,6 +9,9 @@
5771 #include <asm/pgtable.h>
5772 #include <asm/tlbflush.h>
5773
5774 +void arch_exit_mmap(struct mm_struct *mm);
5775 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5776 +
5777 /*
5778 * possibly do the LDT unload here?
5779 */
5780 --- a/include/asm-x86/mach-xen/asm/page_64.h
5781 +++ b/include/asm-x86/mach-xen/asm/page_64.h
5782 @@ -7,6 +7,7 @@
5783 #include <linux/types.h>
5784 #include <asm/bug.h>
5785 #endif
5786 +#include <linux/const.h>
5787 #include <xen/interface/xen.h>
5788
5789 /*
5790 @@ -19,18 +20,14 @@
5791
5792 /* PAGE_SHIFT determines the page size */
5793 #define PAGE_SHIFT 12
5794 -#ifdef __ASSEMBLY__
5795 -#define PAGE_SIZE (0x1 << PAGE_SHIFT)
5796 -#else
5797 -#define PAGE_SIZE (1UL << PAGE_SHIFT)
5798 -#endif
5799 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
5800 #define PAGE_MASK (~(PAGE_SIZE-1))
5801
5802 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
5803 #define __PHYSICAL_MASK_SHIFT 46
5804 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
5805 +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
5806 #define __VIRTUAL_MASK_SHIFT 48
5807 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
5808 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
5809
5810 #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
5811
5812 @@ -55,10 +52,10 @@
5813 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
5814
5815 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
5816 -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
5817 +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
5818
5819 #define HPAGE_SHIFT PMD_SHIFT
5820 -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
5821 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
5822 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
5823 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
5824
5825 @@ -152,17 +149,23 @@ static inline pgd_t __pgd(unsigned long
5826
5827 #define __pgprot(x) ((pgprot_t) { (x) } )
5828
5829 -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
5830 -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5831 -#define __START_KERNEL_map 0xffffffff80000000UL
5832 -#define __PAGE_OFFSET 0xffff880000000000UL
5833 +#endif /* !__ASSEMBLY__ */
5834
5835 -#else
5836 #define __PHYSICAL_START CONFIG_PHYSICAL_START
5837 +#define __KERNEL_ALIGN 0x200000
5838 +
5839 +/*
5840 + * Make sure kernel is aligned to 2MB address. Catching it at compile
5841 + * time is better. Change your config file and compile the kernel
5842 + * for a 2MB aligned address (CONFIG_PHYSICAL_START)
5843 + */
5844 +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
5845 +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
5846 +#endif
5847 +
5848 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5849 -#define __START_KERNEL_map 0xffffffff80000000
5850 -#define __PAGE_OFFSET 0xffff880000000000
5851 -#endif /* !__ASSEMBLY__ */
5852 +#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
5853 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
5854
5855 #if CONFIG_XEN_COMPAT <= 0x030002
5856 #undef LOAD_OFFSET
5857 @@ -172,20 +175,20 @@ static inline pgd_t __pgd(unsigned long
5858 /* to align the pointer to the (next) page boundary */
5859 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
5860
5861 -#define KERNEL_TEXT_SIZE (40UL*1024*1024)
5862 -#define KERNEL_TEXT_START 0xffffffff80000000UL
5863 +#define KERNEL_TEXT_SIZE (40*1024*1024)
5864 +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
5865 +
5866 +#define PAGE_OFFSET __PAGE_OFFSET
5867
5868 -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
5869 +#ifndef __ASSEMBLY__
5870 +static inline unsigned long __phys_addr(unsigned long x)
5871 +{
5872 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
5873 +}
5874 +#endif
5875
5876 -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
5877 - Otherwise you risk miscompilation. */
5878 -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
5879 -/* __pa_symbol should be used for C visible symbols.
5880 - This seems to be the official gcc blessed way to do such arithmetic. */
5881 -#define __pa_symbol(x) \
5882 - ({unsigned long v; \
5883 - asm("" : "=r" (v) : "0" (x)); \
5884 - __pa(v); })
5885 +#define __pa(x) __phys_addr((unsigned long)(x))
5886 +#define __pa_symbol(x) __phys_addr((unsigned long)(x))
5887
5888 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
5889 #define __boot_va(x) __va(x)
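
_AC() from <linux/const.h> appends the UL suffix only in C (it expands to a bare constant under __ASSEMBLY__), which is what lets the duplicated #ifdef __ASSEMBLY__ definitions above collapse; __pa() and __pa_symbol() both become the __phys_addr() inline. A sketch (illustration only; the example_ names are invented):

    #include <linux/const.h>

    /* _AC(1, UL) is 1UL in C and plain 1 in assembly, so one definition
     * serves both users of the header. */
    #define EXAMPLE_PAGE_SIZE       (_AC(1, UL) << 12)

    #ifndef __ASSEMBLY__
    /* Kernel text lives above __START_KERNEL_map, everything else above
     * PAGE_OFFSET; subtract the matching base to get a physical address. */
    static inline unsigned long example_phys_addr(unsigned long x)
    {
            return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
    }
    #endif
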
5890 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
5891 +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
5892 @@ -1,7 +1,6 @@
5893 #ifndef _I386_PGALLOC_H
5894 #define _I386_PGALLOC_H
5895
5896 -#include <asm/fixmap.h>
5897 #include <linux/threads.h>
5898 #include <linux/mm.h> /* for struct page */
5899 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
5900 @@ -69,6 +68,4 @@ do { \
5901 #define pud_populate(mm, pmd, pte) BUG()
5902 #endif
5903
5904 -#define check_pgt_cache() do { } while (0)
5905 -
5906 #endif /* _I386_PGALLOC_H */
5907 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
5908 +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
5909 @@ -1,7 +1,6 @@
5910 #ifndef _X86_64_PGALLOC_H
5911 #define _X86_64_PGALLOC_H
5912
5913 -#include <asm/fixmap.h>
5914 #include <asm/pda.h>
5915 #include <linux/threads.h>
5916 #include <linux/mm.h>
5917 @@ -100,24 +99,16 @@ static inline void pgd_list_add(pgd_t *p
5918 struct page *page = virt_to_page(pgd);
5919
5920 spin_lock(&pgd_lock);
5921 - page->index = (pgoff_t)pgd_list;
5922 - if (pgd_list)
5923 - pgd_list->private = (unsigned long)&page->index;
5924 - pgd_list = page;
5925 - page->private = (unsigned long)&pgd_list;
5926 + list_add(&page->lru, &pgd_list);
5927 spin_unlock(&pgd_lock);
5928 }
5929
5930 static inline void pgd_list_del(pgd_t *pgd)
5931 {
5932 - struct page *next, **pprev, *page = virt_to_page(pgd);
5933 + struct page *page = virt_to_page(pgd);
5934
5935 spin_lock(&pgd_lock);
5936 - next = (struct page *)page->index;
5937 - pprev = (struct page **)page->private;
5938 - *pprev = next;
5939 - if (next)
5940 - next->private = (unsigned long)pprev;
5941 + list_del(&page->lru);
5942 spin_unlock(&pgd_lock);
5943 }
5944
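
The hand-rolled singly linked pgd list (threaded through page->index and page->private) becomes a standard list_head hung off page->lru, so walkers can use the normal list iterators. A hypothetical walker (illustration only; example_for_each_pgd is invented):

    #include <linux/list.h>
    #include <linux/mm.h>
    #include <linux/spinlock.h>

    extern spinlock_t pgd_lock;
    extern struct list_head pgd_list;   /* was "struct page *pgd_list" */

    /* Visit every pgd page under pgd_lock. */
    static void example_for_each_pgd(void (*fn)(pgd_t *pgd))
    {
            struct page *page;

            spin_lock(&pgd_lock);
            list_for_each_entry(page, &pgd_list, lru)
                    fn((pgd_t *)page_address(page));
            spin_unlock(&pgd_lock);
    }
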
5945 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
5946 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
5947 @@ -24,11 +24,11 @@
5948 #include <linux/slab.h>
5949 #include <linux/list.h>
5950 #include <linux/spinlock.h>
5951 +#include <linux/sched.h>
5952
5953 /* Is this pagetable pinned? */
5954 #define PG_pinned PG_arch_1
5955
5956 -struct mm_struct;
5957 struct vm_area_struct;
5958
5959 /*
5960 @@ -38,17 +38,16 @@ struct vm_area_struct;
5961 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5962 extern unsigned long empty_zero_page[1024];
5963 extern pgd_t *swapper_pg_dir;
5964 -extern struct kmem_cache *pgd_cache;
5965 extern struct kmem_cache *pmd_cache;
5966 extern spinlock_t pgd_lock;
5967 extern struct page *pgd_list;
5968 +void check_pgt_cache(void);
5969
5970 void pmd_ctor(void *, struct kmem_cache *, unsigned long);
5971 -void pgd_ctor(void *, struct kmem_cache *, unsigned long);
5972 -void pgd_dtor(void *, struct kmem_cache *, unsigned long);
5973 void pgtable_cache_init(void);
5974 void paging_init(void);
5975
5976 +
5977 /*
5978 * The Linux x86 paging architecture is 'compile-time dual-mode', it
5979 * implements both the traditional 2-level x86 page tables and the
5980 @@ -165,6 +164,7 @@ void paging_init(void);
5981
5982 extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
5983 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
5984 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
5985 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
5986 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
5987 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
5988 @@ -172,6 +172,7 @@ extern unsigned long long __PAGE_KERNEL,
5989 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
5990 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
5991 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
5992 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
5993 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
5994 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
5995 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
5996 @@ -275,7 +276,13 @@ static inline pte_t pte_mkhuge(pte_t pte
5997 */
5998 #define pte_update(mm, addr, ptep) do { } while (0)
5999 #define pte_update_defer(mm, addr, ptep) do { } while (0)
6000 -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0)
6001 +
6002 +/* local pte updates need not use xchg for locking */
6003 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
6004 +{
6005 + xen_set_pte(ptep, __pte(0));
6006 + return res;
6007 +}
6008
6009 /*
6010 * We only update the dirty/accessed state if we set
6011 @@ -286,17 +293,34 @@ static inline pte_t pte_mkhuge(pte_t pte
6012 */
6013 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
6014 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
6015 -do { \
6016 - if (dirty) \
6017 +({ \
6018 + int __changed = !pte_same(*(ptep), entry); \
6019 + if (__changed && (dirty)) \
6020 ptep_establish(vma, address, ptep, entry); \
6021 -} while (0)
6022 + __changed; \
6023 +})
6024
6025 -/*
6026 - * We don't actually have these, but we want to advertise them so that
6027 - * we can encompass the flush here.
6028 - */
6029 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6030 +#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \
6031 + int __ret = 0; \
6032 + if (pte_dirty(*(ptep))) \
6033 + __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \
6034 + &(ptep)->pte_low); \
6035 + if (__ret) \
6036 + pte_update((vma)->vm_mm, addr, ptep); \
6037 + __ret; \
6038 +})
6039 +
6040 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6041 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
6042 + int __ret = 0; \
6043 + if (pte_young(*(ptep))) \
6044 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
6045 + &(ptep)->pte_low); \
6046 + if (__ret) \
6047 + pte_update((vma)->vm_mm, addr, ptep); \
6048 + __ret; \
6049 +})
6050
6051 /*
6052 * Rules for using ptep_establish: the pte MUST be a user pte, and
6053 @@ -323,7 +347,7 @@ do { \
6054 int __dirty = pte_dirty(__pte); \
6055 __pte = pte_mkclean(__pte); \
6056 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
6057 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6058 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6059 else if (__dirty) \
6060 (ptep)->pte_low = __pte.pte_low; \
6061 __dirty; \
6062 @@ -336,7 +360,7 @@ do { \
6063 int __young = pte_young(__pte); \
6064 __pte = pte_mkold(__pte); \
6065 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
6066 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6067 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6068 else if (__young) \
6069 (ptep)->pte_low = __pte.pte_low; \
6070 __young; \
6071 @@ -349,7 +373,7 @@ static inline pte_t ptep_get_and_clear(s
6072 if (!pte_none(pte)
6073 && (mm != &init_mm
6074 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
6075 - pte = raw_ptep_get_and_clear(ptep, pte);
6076 + pte = xen_ptep_get_and_clear(ptep, pte);
6077 pte_update(mm, addr, ptep);
6078 }
6079 return pte;
6080 @@ -491,24 +515,10 @@ extern pte_t *lookup_address(unsigned lo
6081 #endif
6082
6083 #if defined(CONFIG_HIGHPTE)
6084 -#define pte_offset_map(dir, address) \
6085 -({ \
6086 - pte_t *__ptep; \
6087 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
6088 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \
6089 - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \
6090 - __ptep = __ptep + pte_index(address); \
6091 - __ptep; \
6092 -})
6093 -#define pte_offset_map_nested(dir, address) \
6094 -({ \
6095 - pte_t *__ptep; \
6096 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
6097 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \
6098 - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \
6099 - __ptep = __ptep + pte_index(address); \
6100 - __ptep; \
6101 -})
6102 +#define pte_offset_map(dir, address) \
6103 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
6104 +#define pte_offset_map_nested(dir, address) \
6105 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
6106 #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
6107 #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
6108 #else
6109 @@ -584,10 +594,6 @@ int xen_change_pte_range(struct mm_struc
6110 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
6111 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
6112
6113 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
6114 -#define GET_IOSPACE(pfn) 0
6115 -#define GET_PFN(pfn) (pfn)
6116 -
6117 #include <asm-generic/pgtable.h>
6118
6119 #endif /* _I386_PGTABLE_H */
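
ptep_set_access_flags() now evaluates to whether the PTE actually changed, and ptep_test_and_clear_dirty()/_young() become real atomic bit-clears followed by pte_update() instead of advertised-but-empty hooks. The point of the return value is that callers can skip the TLB flush when nothing changed; a sketch of the caller side (illustration only; example_fault_fixup is invented):

    #include <linux/mm.h>
    #include <asm/pgtable.h>
    #include <asm/tlbflush.h>

    /* Generic mm code only flushes when the pte really changed. */
    static void example_fault_fixup(struct vm_area_struct *vma,
                                    unsigned long address, pte_t *ptep, pte_t entry)
    {
            if (ptep_set_access_flags(vma, address, ptep, entry, 1 /* dirty */))
                    flush_tlb_page(vma, address);
    }
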
6120 --- a/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
6121 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
6122 @@ -1,7 +1,7 @@
6123 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
6124 #define _I386_PGTABLE_3LEVEL_DEFS_H
6125
6126 -#define HAVE_SHARED_KERNEL_PMD 0
6127 +#define SHARED_KERNEL_PMD 0
6128
6129 /*
6130 * PGDIR_SHIFT determines what a top-level page table entry can map
6131 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
6132 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
6133 @@ -52,32 +52,40 @@ static inline int pte_exec_kernel(pte_t
6134 * value and then use set_pte to update it. -ben
6135 */
6136
6137 -static inline void set_pte(pte_t *ptep, pte_t pte)
6138 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
6139 {
6140 ptep->pte_high = pte.pte_high;
6141 smp_wmb();
6142 ptep->pte_low = pte.pte_low;
6143 }
6144 -#define set_pte_atomic(pteptr,pteval) \
6145 - set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
6146
6147 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
6148 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
6149 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
6150 - set_pte((ptep), (pteval)); \
6151 -} while (0)
6152 -
6153 -#define set_pmd(pmdptr,pmdval) \
6154 - xen_l2_entry_update((pmdptr), (pmdval))
6155 -#define set_pud(pudptr,pudval) \
6156 - xen_l3_entry_update((pudptr), (pudval))
6157 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
6158 + pte_t *ptep , pte_t pte)
6159 +{
6160 + if ((mm != current->mm && mm != &init_mm) ||
6161 + HYPERVISOR_update_va_mapping(addr, pte, 0))
6162 + xen_set_pte(ptep, pte);
6163 +}
6164 +
6165 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
6166 +{
6167 + set_64bit((unsigned long long *)(ptep),__pte_val(pte));
6168 +}
6169 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
6170 +{
6171 + xen_l2_entry_update(pmdp, pmd);
6172 +}
6173 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
6174 +{
6175 + xen_l3_entry_update(pudp, pud);
6176 +}
6177
6178 /*
6179 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
6180 * entry, so clear the bottom half first and enforce ordering with a compiler
6181 * barrier.
6182 */
6183 -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6184 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6185 {
6186 if ((mm != current->mm && mm != &init_mm)
6187 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6188 @@ -87,7 +95,18 @@ static inline void pte_clear(struct mm_s
6189 }
6190 }
6191
6192 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6193 +static inline void xen_pmd_clear(pmd_t *pmd)
6194 +{
6195 + xen_l2_entry_update(pmd, __pmd(0));
6196 +}
6197 +
6198 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
6199 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
6200 +#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
6201 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
6202 +#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
6203 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
6204 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
6205
6206 /*
6207 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
6208 @@ -108,7 +127,8 @@ static inline void pud_clear (pud_t * pu
6209 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
6210 pmd_index(address))
6211
6212 -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
6213 +#ifdef CONFIG_SMP
6214 +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
6215 {
6216 uint64_t val = __pte_val(res);
6217 if (__cmpxchg64(ptep, val, 0) != val) {
6218 @@ -119,6 +139,9 @@ static inline pte_t raw_ptep_get_and_cle
6219 }
6220 return res;
6221 }
6222 +#else
6223 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
6224 +#endif
6225
6226 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6227 #define ptep_clear_flush(vma, addr, ptep) \
6228 @@ -165,13 +188,13 @@ extern unsigned long long __supported_pt
6229 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
6230 {
6231 return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
6232 - pgprot_val(pgprot)) & __supported_pte_mask);
6233 + pgprot_val(pgprot)) & __supported_pte_mask);
6234 }
6235
6236 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
6237 {
6238 return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
6239 - pgprot_val(pgprot)) & __supported_pte_mask);
6240 + pgprot_val(pgprot)) & __supported_pte_mask);
6241 }
6242
6243 /*
6244 @@ -191,6 +214,4 @@ static inline pmd_t pfn_pmd(unsigned lon
6245
6246 #define __pmd_free_tlb(tlb, x) do { } while (0)
6247
6248 -void vmalloc_sync_all(void);
6249 -
6250 #endif /* _I386_PGTABLE_3LEVEL_H */
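
The PAE setters are renamed to xen_set_pte() and friends and mapped back with #defines, matching the structure mainline uses for paravirt; the UP build also gets a cmpxchg-free xen_ptep_get_and_clear(). The write ordering in xen_set_pte() is the part worth calling out, restated below (illustration only; the example_ name is invented):

    /* A PAE pte is two 32-bit words and the present bit lives in the low
     * word, so the high word must be written (and ordered) first: nothing
     * may ever observe a present low word paired with a stale high word. */
    static inline void example_set_pte(pte_t *ptep, pte_t pte)
    {
            ptep->pte_high = pte.pte_high;  /* new high word while still not present */
            smp_wmb();                      /* order before making it present */
            ptep->pte_low = pte.pte_low;    /* present bit flips on last */
    }
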
6251 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
6252 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
6253 @@ -1,12 +1,14 @@
6254 #ifndef _X86_64_PGTABLE_H
6255 #define _X86_64_PGTABLE_H
6256
6257 +#include <linux/const.h>
6258 +#ifndef __ASSEMBLY__
6259 +
6260 /*
6261 * This file contains the functions and defines necessary to modify and use
6262 * the x86-64 page table tree.
6263 */
6264 #include <asm/processor.h>
6265 -#include <asm/fixmap.h>
6266 #include <asm/bitops.h>
6267 #include <linux/threads.h>
6268 #include <linux/sched.h>
6269 @@ -33,11 +35,9 @@ extern pte_t *lookup_address(unsigned lo
6270 #endif
6271
6272 extern pud_t level3_kernel_pgt[512];
6273 -extern pud_t level3_physmem_pgt[512];
6274 extern pud_t level3_ident_pgt[512];
6275 extern pmd_t level2_kernel_pgt[512];
6276 extern pgd_t init_level4_pgt[];
6277 -extern pgd_t boot_level4_pgt[];
6278 extern unsigned long __supported_pte_mask;
6279
6280 #define swapper_pg_dir init_level4_pgt
6281 @@ -52,6 +52,8 @@ extern void clear_kernel_mapping(unsigne
6282 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
6283 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
6284
6285 +#endif /* !__ASSEMBLY__ */
6286 +
6287 /*
6288 * PGDIR_SHIFT determines what a top-level page table entry can map
6289 */
6290 @@ -76,6 +78,8 @@ extern unsigned long empty_zero_page[PAG
6291 */
6292 #define PTRS_PER_PTE 512
6293
6294 +#ifndef __ASSEMBLY__
6295 +
6296 #define pte_ERROR(e) \
6297 printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
6298 &(e), __pte_val(e), pte_pfn(e))
6299 @@ -118,22 +122,23 @@ static inline void pgd_clear (pgd_t * pg
6300
6301 #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
6302
6303 -#define PMD_SIZE (1UL << PMD_SHIFT)
6304 +#endif /* !__ASSEMBLY__ */
6305 +
6306 +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
6307 #define PMD_MASK (~(PMD_SIZE-1))
6308 -#define PUD_SIZE (1UL << PUD_SHIFT)
6309 +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
6310 #define PUD_MASK (~(PUD_SIZE-1))
6311 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
6312 +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
6313 #define PGDIR_MASK (~(PGDIR_SIZE-1))
6314
6315 #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
6316 #define FIRST_USER_ADDRESS 0
6317
6318 -#ifndef __ASSEMBLY__
6319 -#define MAXMEM 0x3fffffffffffUL
6320 -#define VMALLOC_START 0xffffc20000000000UL
6321 -#define VMALLOC_END 0xffffe1ffffffffffUL
6322 -#define MODULES_VADDR 0xffffffff88000000UL
6323 -#define MODULES_END 0xfffffffffff00000UL
6324 +#define MAXMEM _AC(0x3fffffffffff, UL)
6325 +#define VMALLOC_START _AC(0xffffc20000000000, UL)
6326 +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
6327 +#define MODULES_VADDR _AC(0xffffffff88000000, UL)
6328 +#define MODULES_END _AC(0xfffffffffff00000, UL)
6329 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
6330
6331 #define _PAGE_BIT_PRESENT 0
6332 @@ -159,16 +164,18 @@ static inline void pgd_clear (pgd_t * pg
6333 #define _PAGE_GLOBAL 0x100 /* Global TLB entry */
6334
6335 #define _PAGE_PROTNONE 0x080 /* If not present */
6336 -#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
6337 +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
6338
6339 /* Mapped page is I/O or foreign and has no associated page struct. */
6340 #define _PAGE_IO 0x200
6341
6342 +#ifndef __ASSEMBLY__
6343 #if CONFIG_XEN_COMPAT <= 0x030002
6344 extern unsigned int __kernel_page_user;
6345 #else
6346 #define __kernel_page_user 0
6347 #endif
6348 +#endif
6349
6350 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
6351 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
6352 @@ -233,6 +240,8 @@ extern unsigned int __kernel_page_user;
6353 #define __S110 PAGE_SHARED_EXEC
6354 #define __S111 PAGE_SHARED_EXEC
6355
6356 +#ifndef __ASSEMBLY__
6357 +
6358 static inline unsigned long pgd_bad(pgd_t pgd)
6359 {
6360 return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6361 @@ -344,6 +353,20 @@ static inline pte_t pte_mkwrite(pte_t pt
6362 static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
6363 static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
6364
6365 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6366 +{
6367 + if (!pte_dirty(*ptep))
6368 + return 0;
6369 + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
6370 +}
6371 +
6372 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6373 +{
6374 + if (!pte_young(*ptep))
6375 + return 0;
6376 + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
6377 +}
6378 +
6379 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6380 {
6381 pte_t pte = *ptep;
6382 @@ -468,18 +491,12 @@ static inline pte_t pte_modify(pte_t pte
6383 * bit at the same time. */
6384 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
6385 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
6386 - do { \
6387 - if (dirty) \
6388 - ptep_establish(vma, address, ptep, entry); \
6389 - } while (0)
6390 -
6391 -
6392 -/*
6393 - * i386 says: We don't actually have these, but we want to advertise
6394 - * them so that we can encompass the flush here.
6395 - */
6396 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6397 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6398 +({ \
6399 + int __changed = !pte_same(*(ptep), entry); \
6400 + if (__changed && (dirty)) \
6401 + ptep_establish(vma, address, ptep, entry); \
6402 + __changed; \
6403 +})
6404
6405 #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
6406 #define ptep_clear_flush_dirty(vma, address, ptep) \
6407 @@ -488,7 +505,7 @@ static inline pte_t pte_modify(pte_t pte
6408 int __dirty = pte_dirty(__pte); \
6409 __pte = pte_mkclean(__pte); \
6410 if ((vma)->vm_mm->context.pinned) \
6411 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6412 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6413 else if (__dirty) \
6414 set_pte(ptep, __pte); \
6415 __dirty; \
6416 @@ -501,7 +518,7 @@ static inline pte_t pte_modify(pte_t pte
6417 int __young = pte_young(__pte); \
6418 __pte = pte_mkold(__pte); \
6419 if ((vma)->vm_mm->context.pinned) \
6420 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6421 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6422 else if (__young) \
6423 set_pte(ptep, __pte); \
6424 __young; \
6425 @@ -515,10 +532,7 @@ static inline pte_t pte_modify(pte_t pte
6426 #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
6427
6428 extern spinlock_t pgd_lock;
6429 -extern struct page *pgd_list;
6430 -void vmalloc_sync_all(void);
6431 -
6432 -#endif /* !__ASSEMBLY__ */
6433 +extern struct list_head pgd_list;
6434
6435 extern int kern_addr_valid(unsigned long addr);
6436
6437 @@ -557,10 +571,6 @@ int xen_change_pte_range(struct mm_struc
6438 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
6439 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
6440
6441 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
6442 -#define GET_IOSPACE(pfn) 0
6443 -#define GET_PFN(pfn) (pfn)
6444 -
6445 #define HAVE_ARCH_UNMAPPED_AREA
6446
6447 #define pgtable_cache_init() do { } while (0)
6448 @@ -574,11 +584,14 @@ int xen_change_pte_range(struct mm_struc
6449 #define kc_offset_to_vaddr(o) \
6450 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
6451
6452 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6453 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6454 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6455 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6456 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6457 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
6458 #define __HAVE_ARCH_PTE_SAME
6459 #include <asm-generic/pgtable.h>
6460 +#endif /* !__ASSEMBLY__ */
6461
6462 #endif /* _X86_64_PGTABLE_H */
6463 --- a/include/asm-x86/mach-xen/asm/processor_32.h
6464 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
6465 @@ -21,6 +21,7 @@
6466 #include <asm/percpu.h>
6467 #include <linux/cpumask.h>
6468 #include <linux/init.h>
6469 +#include <asm/processor-flags.h>
6470 #include <xen/interface/physdev.h>
6471
6472 /* flag for disabling the tsc */
6473 @@ -118,7 +119,8 @@ extern char ignore_fpu_irq;
6474
6475 void __init cpu_detect(struct cpuinfo_x86 *c);
6476
6477 -extern void identify_cpu(struct cpuinfo_x86 *);
6478 +extern void identify_boot_cpu(void);
6479 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
6480 extern void print_cpu_info(struct cpuinfo_x86 *);
6481 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
6482 extern unsigned short num_cache_leaves;
6483 @@ -129,29 +131,8 @@ extern void detect_ht(struct cpuinfo_x86
6484 static inline void detect_ht(struct cpuinfo_x86 *c) {}
6485 #endif
6486
6487 -/*
6488 - * EFLAGS bits
6489 - */
6490 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6491 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6492 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6493 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6494 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6495 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6496 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6497 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6498 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6499 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6500 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6501 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6502 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6503 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6504 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6505 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6506 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6507 -
6508 -static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6509 - unsigned int *ecx, unsigned int *edx)
6510 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6511 + unsigned int *ecx, unsigned int *edx)
6512 {
6513 /* ecx is often an input as well as an output. */
6514 __asm__(XEN_CPUID
6515 @@ -165,21 +146,6 @@ static inline fastcall void xen_cpuid(un
6516 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
6517
6518 /*
6519 - * Intel CPU features in CR4
6520 - */
6521 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6522 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6523 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6524 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6525 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6526 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6527 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6528 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6529 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6530 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6531 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6532 -
6533 -/*
6534 * Save the cr4 feature set we're using (ie
6535 * Pentium 4MB enable and PPro Global page
6536 * enable), so that any CPU's that boot up
6537 @@ -206,26 +172,6 @@ static inline void clear_in_cr4 (unsigne
6538 }
6539
6540 /*
6541 - * NSC/Cyrix CPU configuration register indexes
6542 - */
6543 -
6544 -#define CX86_PCR0 0x20
6545 -#define CX86_GCR 0xb8
6546 -#define CX86_CCR0 0xc0
6547 -#define CX86_CCR1 0xc1
6548 -#define CX86_CCR2 0xc2
6549 -#define CX86_CCR3 0xc3
6550 -#define CX86_CCR4 0xe8
6551 -#define CX86_CCR5 0xe9
6552 -#define CX86_CCR6 0xea
6553 -#define CX86_CCR7 0xeb
6554 -#define CX86_PCR1 0xf0
6555 -#define CX86_DIR0 0xfe
6556 -#define CX86_DIR1 0xff
6557 -#define CX86_ARR_BASE 0xc4
6558 -#define CX86_RCR_BASE 0xdc
6559 -
6560 -/*
6561 * NSC/Cyrix CPU indexed register access macros
6562 */
6563
6564 @@ -351,7 +297,8 @@ typedef struct {
6565 struct thread_struct;
6566
6567 #ifndef CONFIG_X86_NO_TSS
6568 -struct tss_struct {
6569 +/* This is the TSS defined by the hardware. */
6570 +struct i386_hw_tss {
6571 unsigned short back_link,__blh;
6572 unsigned long esp0;
6573 unsigned short ss0,__ss0h;
6574 @@ -375,6 +322,11 @@ struct tss_struct {
6575 unsigned short gs, __gsh;
6576 unsigned short ldt, __ldth;
6577 unsigned short trace, io_bitmap_base;
6578 +} __attribute__((packed));
6579 +
6580 +struct tss_struct {
6581 + struct i386_hw_tss x86_tss;
6582 +
6583 /*
6584 * The extra 1 is there because the CPU will access an
6585 * additional byte beyond the end of the IO permission
6586 @@ -428,10 +380,11 @@ struct thread_struct {
6587 };
6588
6589 #define INIT_THREAD { \
6590 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6591 .vm86_info = NULL, \
6592 .sysenter_cs = __KERNEL_CS, \
6593 .io_bitmap_ptr = NULL, \
6594 - .fs = __KERNEL_PDA, \
6595 + .fs = __KERNEL_PERCPU, \
6596 }
6597
6598 /*
6599 @@ -441,10 +394,12 @@ struct thread_struct {
6600 * be within the limit.
6601 */
6602 #define INIT_TSS { \
6603 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
6604 - .ss0 = __KERNEL_DS, \
6605 - .ss1 = __KERNEL_CS, \
6606 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6607 + .x86_tss = { \
6608 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6609 + .ss0 = __KERNEL_DS, \
6610 + .ss1 = __KERNEL_CS, \
6611 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6612 + }, \
6613 .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
6614 }
6615
6616 @@ -551,38 +506,33 @@ static inline void rep_nop(void)
6617
6618 #define cpu_relax() rep_nop()
6619
6620 -#define paravirt_enabled() 0
6621 -#define __cpuid xen_cpuid
6622 -
6623 #ifndef CONFIG_X86_NO_TSS
6624 -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6625 +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6626 {
6627 - tss->esp0 = thread->esp0;
6628 + tss->x86_tss.esp0 = thread->esp0;
6629 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6630 - if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6631 - tss->ss1 = thread->sysenter_cs;
6632 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
6633 + tss->x86_tss.ss1 = thread->sysenter_cs;
6634 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6635 }
6636 }
6637 -#define load_esp0(tss, thread) \
6638 - __load_esp0(tss, thread)
6639 #else
6640 -#define load_esp0(tss, thread) do { \
6641 +#define xen_load_esp0(tss, thread) do { \
6642 if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6643 BUG(); \
6644 } while (0)
6645 #endif
6646
6647
6648 -/*
6649 - * These special macros can be used to get or set a debugging register
6650 - */
6651 -#define get_debugreg(var, register) \
6652 - (var) = HYPERVISOR_get_debugreg(register)
6653 -#define set_debugreg(value, register) \
6654 - WARN_ON(HYPERVISOR_set_debugreg(register, value))
6655 +static inline unsigned long xen_get_debugreg(int regno)
6656 +{
6657 + return HYPERVISOR_get_debugreg(regno);
6658 +}
6659
6660 -#define set_iopl_mask xen_set_iopl_mask
6661 +static inline void xen_set_debugreg(int regno, unsigned long value)
6662 +{
6663 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
6664 +}
6665
6666 /*
6667 * Set IOPL bits in EFLAGS from given mask
6668 @@ -597,6 +547,21 @@ static inline void xen_set_iopl_mask(uns
6669 }
6670
6671
6672 +#define paravirt_enabled() 0
6673 +#define __cpuid xen_cpuid
6674 +
6675 +#define load_esp0 xen_load_esp0
6676 +
6677 +/*
6678 + * These special macros can be used to get or set a debugging register
6679 + */
6680 +#define get_debugreg(var, register) \
6681 + (var) = xen_get_debugreg(register)
6682 +#define set_debugreg(value, register) \
6683 + xen_set_debugreg(register, value)
6684 +
6685 +#define set_iopl_mask xen_set_iopl_mask
6686 +
6687 /*
6688 * Generic CPUID function
6689 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
6690 @@ -749,8 +714,14 @@ extern unsigned long boot_option_idle_ov
6691 extern void enable_sep_cpu(void);
6692 extern int sysenter_setup(void);
6693
6694 -extern int init_gdt(int cpu, struct task_struct *idle);
6695 +/* Defined in head.S */
6696 +extern struct Xgt_desc_struct early_gdt_descr;
6697 +
6698 extern void cpu_set_gdt(int);
6699 -extern void secondary_cpu_init(void);
6700 +extern void switch_to_new_gdt(void);
6701 +extern void cpu_init(void);
6702 +extern void init_gdt(int cpu);
6703 +
6704 +extern int force_mwait;
6705
6706 #endif /* __ASM_I386_PROCESSOR_H */
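
The hardware-defined portion of the TSS is split out as struct i386_hw_tss and embedded as tss->x86_tss, so INIT_TSS nests its initializer and code touching esp0/ss1 goes through the new member. A sketch of the esp0 update (illustration only; example_load_esp0 is invented, mirroring native_load_esp0 above):

    /* Update the kernel stack pointer in the TSS after the split: fields
     * the CPU defines now live in tss->x86_tss. */
    static inline void example_load_esp0(struct tss_struct *tss,
                                         struct thread_struct *thread)
    {
            tss->x86_tss.esp0 = thread->esp0;       /* was tss->esp0 */
    }
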
6707 --- a/include/asm-x86/mach-xen/asm/processor_64.h
6708 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
6709 @@ -20,6 +20,7 @@
6710 #include <asm/percpu.h>
6711 #include <linux/personality.h>
6712 #include <linux/cpumask.h>
6713 +#include <asm/processor-flags.h>
6714
6715 #define TF_MASK 0x00000100
6716 #define IF_MASK 0x00000200
6717 @@ -103,42 +104,6 @@ extern unsigned int init_intel_cacheinfo
6718 extern unsigned short num_cache_leaves;
6719
6720 /*
6721 - * EFLAGS bits
6722 - */
6723 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6724 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6725 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6726 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6727 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6728 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6729 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6730 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6731 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6732 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6733 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6734 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6735 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6736 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6737 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6738 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6739 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6740 -
6741 -/*
6742 - * Intel CPU features in CR4
6743 - */
6744 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6745 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6746 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6747 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6748 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6749 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6750 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6751 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6752 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6753 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6754 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6755 -
6756 -/*
6757 * Save the cr4 feature set we're using (ie
6758 * Pentium 4MB enable and PPro Global page
6759 * enable), so that any CPU's that boot up
6760 @@ -203,7 +168,7 @@ struct i387_fxsave_struct {
6761 u32 mxcsr;
6762 u32 mxcsr_mask;
6763 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
6764 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
6765 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
6766 u32 padding[24];
6767 } __attribute__ ((aligned (16)));
6768
6769 @@ -436,22 +401,6 @@ static inline void prefetchw(void *x)
6770 #define cpu_relax() rep_nop()
6771
6772 /*
6773 - * NSC/Cyrix CPU configuration register indexes
6774 - */
6775 -#define CX86_CCR0 0xc0
6776 -#define CX86_CCR1 0xc1
6777 -#define CX86_CCR2 0xc2
6778 -#define CX86_CCR3 0xc3
6779 -#define CX86_CCR4 0xe8
6780 -#define CX86_CCR5 0xe9
6781 -#define CX86_CCR6 0xea
6782 -#define CX86_CCR7 0xeb
6783 -#define CX86_DIR0 0xfe
6784 -#define CX86_DIR1 0xff
6785 -#define CX86_ARR_BASE 0xc4
6786 -#define CX86_RCR_BASE 0xdc
6787 -
6788 -/*
6789 * NSC/Cyrix CPU indexed register access macros
6790 */
6791
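
The two deleted tables above (X86_EFLAGS_* and X86_CR4_*) are not lost: 2.6.22 centralizes them in <asm/processor-flags.h>, which this hunk now includes. As a hedged illustration of how such single-bit masks are used, here is a small userspace C sketch; the two mask values are copied from the deleted lines, while irqs_enabled() and the sample value are made up for the example.

#include <stdio.h>

#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */

/* Report whether a saved flags word has interrupts enabled. */
static int irqs_enabled(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) != 0;
}

int main(void)
{
        unsigned long saved = X86_EFLAGS_IF | X86_EFLAGS_ZF; /* made-up sample value */

        printf("IF set: %d\n", irqs_enabled(saved));
        return 0;
}
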
6792 --- a/include/asm-x86/mach-xen/asm/segment_32.h
6793 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
6794 @@ -39,7 +39,7 @@
6795 * 25 - APM BIOS support
6796 *
6797 * 26 - ESPFIX small SS
6798 - * 27 - PDA [ per-cpu private data area ]
6799 + * 27 - per-cpu [ offset to per-cpu data area ]
6800 * 28 - unused
6801 * 29 - unused
6802 * 30 - unused
6803 @@ -74,8 +74,12 @@
6804 #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
6805 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
6806
6807 -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15)
6808 -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
6809 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
6810 +#ifdef CONFIG_SMP
6811 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
6812 +#else
6813 +#define __KERNEL_PERCPU 0
6814 +#endif
6815
6816 #define GDT_ENTRY_DOUBLEFAULT_TSS 31
6817
6818 --- a/include/asm-x86/mach-xen/asm/smp_32.h
6819 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
6820 @@ -8,19 +8,15 @@
6821 #include <linux/kernel.h>
6822 #include <linux/threads.h>
6823 #include <linux/cpumask.h>
6824 -#include <asm/pda.h>
6825 #endif
6826
6827 -#ifdef CONFIG_X86_LOCAL_APIC
6828 -#ifndef __ASSEMBLY__
6829 -#include <asm/fixmap.h>
6830 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
6831 #include <asm/bitops.h>
6832 #include <asm/mpspec.h>
6833 +#include <asm/apic.h>
6834 #ifdef CONFIG_X86_IO_APIC
6835 #include <asm/io_apic.h>
6836 #endif
6837 -#include <asm/apic.h>
6838 -#endif
6839 #endif
6840
6841 #define BAD_APICID 0xFFu
6842 @@ -52,9 +48,76 @@ extern void cpu_exit_clear(void);
6843 extern void cpu_uninit(void);
6844 #endif
6845
6846 -#ifndef CONFIG_PARAVIRT
6847 +#ifndef CONFIG_XEN
6848 +struct smp_ops
6849 +{
6850 + void (*smp_prepare_boot_cpu)(void);
6851 + void (*smp_prepare_cpus)(unsigned max_cpus);
6852 + int (*cpu_up)(unsigned cpu);
6853 + void (*smp_cpus_done)(unsigned max_cpus);
6854 +
6855 + void (*smp_send_stop)(void);
6856 + void (*smp_send_reschedule)(int cpu);
6857 + int (*smp_call_function_mask)(cpumask_t mask,
6858 + void (*func)(void *info), void *info,
6859 + int wait);
6860 +};
6861 +
6862 +extern struct smp_ops smp_ops;
6863 +
6864 +static inline void smp_prepare_boot_cpu(void)
6865 +{
6866 + smp_ops.smp_prepare_boot_cpu();
6867 +}
6868 +static inline void smp_prepare_cpus(unsigned int max_cpus)
6869 +{
6870 + smp_ops.smp_prepare_cpus(max_cpus);
6871 +}
6872 +static inline int __cpu_up(unsigned int cpu)
6873 +{
6874 + return smp_ops.cpu_up(cpu);
6875 +}
6876 +static inline void smp_cpus_done(unsigned int max_cpus)
6877 +{
6878 + smp_ops.smp_cpus_done(max_cpus);
6879 +}
6880 +
6881 +static inline void smp_send_stop(void)
6882 +{
6883 + smp_ops.smp_send_stop();
6884 +}
6885 +static inline void smp_send_reschedule(int cpu)
6886 +{
6887 + smp_ops.smp_send_reschedule(cpu);
6888 +}
6889 +static inline int smp_call_function_mask(cpumask_t mask,
6890 + void (*func) (void *info), void *info,
6891 + int wait)
6892 +{
6893 + return smp_ops.smp_call_function_mask(mask, func, info, wait);
6894 +}
6895 +
6896 +void native_smp_prepare_boot_cpu(void);
6897 +void native_smp_prepare_cpus(unsigned int max_cpus);
6898 +int native_cpu_up(unsigned int cpunum);
6899 +void native_smp_cpus_done(unsigned int max_cpus);
6900 +
6901 #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
6902 do { } while (0)
6903 +
6904 +#else
6905 +
6906 +
6907 +void xen_smp_send_stop(void);
6908 +void xen_smp_send_reschedule(int cpu);
6909 +int xen_smp_call_function_mask(cpumask_t mask,
6910 + void (*func) (void *info), void *info,
6911 + int wait);
6912 +
6913 +#define smp_send_stop xen_smp_send_stop
6914 +#define smp_send_reschedule xen_smp_send_reschedule
6915 +#define smp_call_function_mask xen_smp_call_function_mask
6916 +
6917 #endif
6918
6919 /*
6920 @@ -62,7 +125,8 @@ do { } while (0)
6921 * from the initial startup. We map APIC_BASE very early in page_setup(),
6922 * so this is correct in the x86 case.
6923 */
6924 -#define raw_smp_processor_id() (read_pda(cpu_number))
6925 +DECLARE_PER_CPU(int, cpu_number);
6926 +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
6927
6928 extern cpumask_t cpu_possible_map;
6929 #define cpu_callin_map cpu_possible_map
6930 @@ -73,20 +137,6 @@ static inline int num_booting_cpus(void)
6931 return cpus_weight(cpu_possible_map);
6932 }
6933
6934 -#ifdef CONFIG_X86_LOCAL_APIC
6935 -
6936 -#ifdef APIC_DEFINITION
6937 -extern int hard_smp_processor_id(void);
6938 -#else
6939 -#include <mach_apicdef.h>
6940 -static inline int hard_smp_processor_id(void)
6941 -{
6942 - /* we don't want to mark this access volatile - bad code generation */
6943 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6944 -}
6945 -#endif
6946 -#endif
6947 -
6948 #define safe_smp_processor_id() smp_processor_id()
6949 extern int __cpu_disable(void);
6950 extern void __cpu_die(unsigned int cpu);
6951 @@ -102,10 +152,31 @@ extern unsigned int num_processors;
6952
6953 #define NO_PROC_ID 0xFF /* No processor magic marker */
6954
6955 -#endif
6956 +#endif /* CONFIG_SMP */
6957
6958 #ifndef __ASSEMBLY__
6959
6960 +#ifdef CONFIG_X86_LOCAL_APIC
6961 +
6962 +#ifdef APIC_DEFINITION
6963 +extern int hard_smp_processor_id(void);
6964 +#else
6965 +#include <mach_apicdef.h>
6966 +static inline int hard_smp_processor_id(void)
6967 +{
6968 + /* we don't want to mark this access volatile - bad code generation */
6969 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6970 +}
6971 +#endif /* APIC_DEFINITION */
6972 +
6973 +#else /* CONFIG_X86_LOCAL_APIC */
6974 +
6975 +#ifndef CONFIG_SMP
6976 +#define hard_smp_processor_id() 0
6977 +#endif
6978 +
6979 +#endif /* CONFIG_X86_LOCAL_APIC */
6980 +
6981 extern u8 apicid_2_node[];
6982
6983 #ifdef CONFIG_X86_LOCAL_APIC
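
The smp_32.h changes introduce the 2.6.22 smp_ops indirection for the non-Xen build: a table of function pointers with native_* backends behind inline wrappers, while the Xen build binds xen_* functions directly. The following self-contained userspace C sketch shows the ops-table pattern itself; struct demo_smp_ops, the printf bodies, and alt_send_reschedule() are illustrative stand-ins, not kernel code.

#include <stdio.h>

/* Ops table: callers go through function pointers, so an alternative
 * backend can replace the native implementations at setup time. */
struct demo_smp_ops {
        void (*send_reschedule)(int cpu);
        void (*send_stop)(void);
};

static void native_send_reschedule(int cpu)
{
        printf("native: reschedule IPI to CPU %d\n", cpu);
}

static void native_send_stop(void)
{
        printf("native: stop IPI to all CPUs\n");
}

static struct demo_smp_ops smp_ops = {
        .send_reschedule = native_send_reschedule,
        .send_stop       = native_send_stop,
};

/* Inline wrappers mirror the header: callers never name the backend. */
static inline void smp_send_reschedule(int cpu) { smp_ops.send_reschedule(cpu); }
static inline void smp_send_stop(void)          { smp_ops.send_stop(); }

static void alt_send_reschedule(int cpu)
{
        printf("alt backend: hypercall-style reschedule of CPU %d\n", cpu);
}

int main(void)
{
        smp_send_reschedule(1);

        /* A paravirtualized backend would override the table like this. */
        smp_ops.send_reschedule = alt_send_reschedule;
        smp_send_reschedule(1);

        smp_send_stop();
        return 0;
}
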
6984 --- a/include/asm-x86/mach-xen/asm/smp_64.h
6985 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
6986 @@ -11,12 +11,11 @@
6987 extern int disable_apic;
6988
6989 #ifdef CONFIG_X86_LOCAL_APIC
6990 -#include <asm/fixmap.h>
6991 #include <asm/mpspec.h>
6992 +#include <asm/apic.h>
6993 #ifdef CONFIG_X86_IO_APIC
6994 #include <asm/io_apic.h>
6995 #endif
6996 -#include <asm/apic.h>
6997 #include <asm/thread_info.h>
6998 #endif
6999
7000 @@ -41,7 +40,6 @@ extern void lock_ipi_call_lock(void);
7001 extern void unlock_ipi_call_lock(void);
7002 extern int smp_num_siblings;
7003 extern void smp_send_reschedule(int cpu);
7004 -void smp_stop_cpu(void);
7005
7006 extern cpumask_t cpu_sibling_map[NR_CPUS];
7007 extern cpumask_t cpu_core_map[NR_CPUS];
7008 @@ -62,14 +60,6 @@ static inline int num_booting_cpus(void)
7009
7010 #define raw_smp_processor_id() read_pda(cpunumber)
7011
7012 -#ifdef CONFIG_X86_LOCAL_APIC
7013 -static inline int hard_smp_processor_id(void)
7014 -{
7015 - /* we don't want to mark this access volatile - bad code generation */
7016 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
7017 -}
7018 -#endif
7019 -
7020 extern int __cpu_disable(void);
7021 extern void __cpu_die(unsigned int cpu);
7022 extern void prefill_possible_map(void);
7023 @@ -78,6 +68,14 @@ extern unsigned __cpuinitdata disabled_c
7024
7025 #define NO_PROC_ID 0xFF /* No processor magic marker */
7026
7027 +#endif /* CONFIG_SMP */
7028 +
7029 +#ifdef CONFIG_X86_LOCAL_APIC
7030 +static inline int hard_smp_processor_id(void)
7031 +{
7032 + /* we don't want to mark this access volatile - bad code generation */
7033 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
7034 +}
7035 #endif
7036
7037 /*
7038 --- a/include/asm-x86/mach-xen/asm/system_32.h
7039 +++ b/include/asm-x86/mach-xen/asm/system_32.h
7040 @@ -4,7 +4,7 @@
7041 #include <linux/kernel.h>
7042 #include <asm/segment.h>
7043 #include <asm/cpufeature.h>
7044 -#include <linux/bitops.h> /* for LOCK_PREFIX */
7045 +#include <asm/cmpxchg.h>
7046 #include <asm/synch_bitops.h>
7047 #include <asm/hypervisor.h>
7048
7049 @@ -90,308 +90,102 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
7050 #define savesegment(seg, value) \
7051 asm volatile("mov %%" #seg ",%0":"=rm" (value))
7052
7053 -#define read_cr0() ({ \
7054 - unsigned int __dummy; \
7055 - __asm__ __volatile__( \
7056 - "movl %%cr0,%0\n\t" \
7057 - :"=r" (__dummy)); \
7058 - __dummy; \
7059 -})
7060 -#define write_cr0(x) \
7061 - __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
7062 -
7063 -#define read_cr2() (current_vcpu_info()->arch.cr2)
7064 -#define write_cr2(x) \
7065 - __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
7066 -
7067 -#define read_cr3() ({ \
7068 - unsigned int __dummy; \
7069 - __asm__ ( \
7070 - "movl %%cr3,%0\n\t" \
7071 - :"=r" (__dummy)); \
7072 - __dummy = xen_cr3_to_pfn(__dummy); \
7073 - mfn_to_pfn(__dummy) << PAGE_SHIFT; \
7074 -})
7075 -#define write_cr3(x) ({ \
7076 - unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
7077 - __dummy = xen_pfn_to_cr3(__dummy); \
7078 - __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
7079 -})
7080 -#define read_cr4() ({ \
7081 - unsigned int __dummy; \
7082 - __asm__( \
7083 - "movl %%cr4,%0\n\t" \
7084 - :"=r" (__dummy)); \
7085 - __dummy; \
7086 -})
7087 -#define read_cr4_safe() ({ \
7088 - unsigned int __dummy; \
7089 - /* This could fault if %cr4 does not exist */ \
7090 - __asm__("1: movl %%cr4, %0 \n" \
7091 - "2: \n" \
7092 - ".section __ex_table,\"a\" \n" \
7093 - ".long 1b,2b \n" \
7094 - ".previous \n" \
7095 - : "=r" (__dummy): "0" (0)); \
7096 - __dummy; \
7097 -})
7098 -
7099 -#define write_cr4(x) \
7100 - __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
7101 -
7102 -#define wbinvd() \
7103 - __asm__ __volatile__ ("wbinvd": : :"memory")
7104 -
7105 -/* Clear the 'TS' bit */
7106 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
7107 -
7108 -/* Set the 'TS' bit */
7109 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
7110 -
7111 -#endif /* __KERNEL__ */
7112 -
7113 -static inline unsigned long get_limit(unsigned long segment)
7114 +static inline void xen_clts(void)
7115 {
7116 - unsigned long __limit;
7117 - __asm__("lsll %1,%0"
7118 - :"=r" (__limit):"r" (segment));
7119 - return __limit+1;
7120 + HYPERVISOR_fpu_taskswitch(0);
7121 }
7122
7123 -#define nop() __asm__ __volatile__ ("nop")
7124 -
7125 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
7126 -
7127 -#define tas(ptr) (xchg((ptr),1))
7128 -
7129 -struct __xchg_dummy { unsigned long a[100]; };
7130 -#define __xg(x) ((struct __xchg_dummy *)(x))
7131 +static inline unsigned long xen_read_cr0(void)
7132 +{
7133 + unsigned long val;
7134 + asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
7135 + return val;
7136 +}
7137
7138 +static inline void xen_write_cr0(unsigned long val)
7139 +{
7140 + asm volatile("movl %0,%%cr0": :"r" (val));
7141 +}
7142
7143 -#ifdef CONFIG_X86_CMPXCHG64
7144 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
7145
7146 -/*
7147 - * The semantics of XCHGCMP8B are a bit strange, this is why
7148 - * there is a loop and the loading of %%eax and %%edx has to
7149 - * be inside. This inlines well in most cases, the cached
7150 - * cost is around ~38 cycles. (in the future we might want
7151 - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
7152 - * might have an implicit FPU-save as a cost, so it's not
7153 - * clear which path to go.)
7154 - *
7155 - * cmpxchg8b must be used with the lock prefix here to allow
7156 - * the instruction to be executed atomically, see page 3-102
7157 - * of the instruction set reference 24319102.pdf. We need
7158 - * the reader side to see the coherent 64bit value.
7159 - */
7160 -static inline void __set_64bit (unsigned long long * ptr,
7161 - unsigned int low, unsigned int high)
7162 +static inline void xen_write_cr2(unsigned long val)
7163 {
7164 - __asm__ __volatile__ (
7165 - "\n1:\t"
7166 - "movl (%0), %%eax\n\t"
7167 - "movl 4(%0), %%edx\n\t"
7168 - "lock cmpxchg8b (%0)\n\t"
7169 - "jnz 1b"
7170 - : /* no outputs */
7171 - : "D"(ptr),
7172 - "b"(low),
7173 - "c"(high)
7174 - : "ax","dx","memory");
7175 + asm volatile("movl %0,%%cr2": :"r" (val));
7176 }
7177
7178 -static inline void __set_64bit_constant (unsigned long long *ptr,
7179 - unsigned long long value)
7180 +static inline unsigned long xen_read_cr3(void)
7181 {
7182 - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
7183 + unsigned long val;
7184 + asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
7185 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
7186 }
7187 -#define ll_low(x) *(((unsigned int*)&(x))+0)
7188 -#define ll_high(x) *(((unsigned int*)&(x))+1)
7189
7190 -static inline void __set_64bit_var (unsigned long long *ptr,
7191 - unsigned long long value)
7192 +static inline void xen_write_cr3(unsigned long val)
7193 {
7194 - __set_64bit(ptr,ll_low(value), ll_high(value));
7195 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
7196 + asm volatile("movl %0,%%cr3": :"r" (val));
7197 }
7198
7199 -#define set_64bit(ptr,value) \
7200 -(__builtin_constant_p(value) ? \
7201 - __set_64bit_constant(ptr, value) : \
7202 - __set_64bit_var(ptr, value) )
7203 -
7204 -#define _set_64bit(ptr,value) \
7205 -(__builtin_constant_p(value) ? \
7206 - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
7207 - __set_64bit(ptr, ll_low(value), ll_high(value)) )
7208 -
7209 -#endif
7210 -
7211 -/*
7212 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
7213 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
7214 - * but generally the primitive is invalid, *ptr is output argument. --ANK
7215 - */
7216 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
7217 +static inline unsigned long xen_read_cr4(void)
7218 {
7219 - switch (size) {
7220 - case 1:
7221 - __asm__ __volatile__("xchgb %b0,%1"
7222 - :"=q" (x)
7223 - :"m" (*__xg(ptr)), "0" (x)
7224 - :"memory");
7225 - break;
7226 - case 2:
7227 - __asm__ __volatile__("xchgw %w0,%1"
7228 - :"=r" (x)
7229 - :"m" (*__xg(ptr)), "0" (x)
7230 - :"memory");
7231 - break;
7232 - case 4:
7233 - __asm__ __volatile__("xchgl %0,%1"
7234 - :"=r" (x)
7235 - :"m" (*__xg(ptr)), "0" (x)
7236 - :"memory");
7237 - break;
7238 - }
7239 - return x;
7240 + unsigned long val;
7241 + asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
7242 + return val;
7243 }
7244
7245 -/*
7246 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
7247 - * store NEW in MEM. Return the initial value in MEM. Success is
7248 - * indicated by comparing RETURN with OLD.
7249 - */
7250 -
7251 -#ifdef CONFIG_X86_CMPXCHG
7252 -#define __HAVE_ARCH_CMPXCHG 1
7253 -#define cmpxchg(ptr,o,n)\
7254 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
7255 - (unsigned long)(n),sizeof(*(ptr))))
7256 -#define sync_cmpxchg(ptr,o,n)\
7257 - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
7258 - (unsigned long)(n),sizeof(*(ptr))))
7259 -#endif
7260 +static inline unsigned long xen_read_cr4_safe(void)
7261 +{
7262 + unsigned long val;
7263 + /* This could fault if %cr4 does not exist */
7264 + asm("1: movl %%cr4, %0 \n"
7265 + "2: \n"
7266 + ".section __ex_table,\"a\" \n"
7267 + ".long 1b,2b \n"
7268 + ".previous \n"
7269 + : "=r" (val): "0" (0));
7270 + return val;
7271 +}
7272
7273 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
7274 - unsigned long new, int size)
7275 +static inline void xen_write_cr4(unsigned long val)
7276 {
7277 - unsigned long prev;
7278 - switch (size) {
7279 - case 1:
7280 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
7281 - : "=a"(prev)
7282 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7283 - : "memory");
7284 - return prev;
7285 - case 2:
7286 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
7287 - : "=a"(prev)
7288 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7289 - : "memory");
7290 - return prev;
7291 - case 4:
7292 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
7293 - : "=a"(prev)
7294 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7295 - : "memory");
7296 - return prev;
7297 - }
7298 - return old;
7299 + asm volatile("movl %0,%%cr4": :"r" (val));
7300 }
7301
7302 -/*
7303 - * Always use locked operations when touching memory shared with a
7304 - * hypervisor, since the system may be SMP even if the guest kernel
7305 - * isn't.
7306 - */
7307 -static inline unsigned long __sync_cmpxchg(volatile void *ptr,
7308 - unsigned long old,
7309 - unsigned long new, int size)
7310 -{
7311 - unsigned long prev;
7312 - switch (size) {
7313 - case 1:
7314 - __asm__ __volatile__("lock; cmpxchgb %b1,%2"
7315 - : "=a"(prev)
7316 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7317 - : "memory");
7318 - return prev;
7319 - case 2:
7320 - __asm__ __volatile__("lock; cmpxchgw %w1,%2"
7321 - : "=a"(prev)
7322 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7323 - : "memory");
7324 - return prev;
7325 - case 4:
7326 - __asm__ __volatile__("lock; cmpxchgl %1,%2"
7327 - : "=a"(prev)
7328 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7329 - : "memory");
7330 - return prev;
7331 - }
7332 - return old;
7333 +static inline void xen_wbinvd(void)
7334 +{
7335 + asm volatile("wbinvd": : :"memory");
7336 }
7337
7338 -#ifndef CONFIG_X86_CMPXCHG
7339 -/*
7340 - * Building a kernel capable running on 80386. It may be necessary to
7341 - * simulate the cmpxchg on the 80386 CPU. For that purpose we define
7342 - * a function for each of the sizes we support.
7343 - */
7344 +#define read_cr0() (xen_read_cr0())
7345 +#define write_cr0(x) (xen_write_cr0(x))
7346 +#define read_cr2() (xen_read_cr2())
7347 +#define write_cr2(x) (xen_write_cr2(x))
7348 +#define read_cr3() (xen_read_cr3())
7349 +#define write_cr3(x) (xen_write_cr3(x))
7350 +#define read_cr4() (xen_read_cr4())
7351 +#define read_cr4_safe() (xen_read_cr4_safe())
7352 +#define write_cr4(x) (xen_write_cr4(x))
7353 +#define wbinvd() (xen_wbinvd())
7354
7355 -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
7356 -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
7357 -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
7358 -
7359 -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
7360 - unsigned long new, int size)
7361 -{
7362 - switch (size) {
7363 - case 1:
7364 - return cmpxchg_386_u8(ptr, old, new);
7365 - case 2:
7366 - return cmpxchg_386_u16(ptr, old, new);
7367 - case 4:
7368 - return cmpxchg_386_u32(ptr, old, new);
7369 - }
7370 - return old;
7371 -}
7372 -
7373 -#define cmpxchg(ptr,o,n) \
7374 -({ \
7375 - __typeof__(*(ptr)) __ret; \
7376 - if (likely(boot_cpu_data.x86 > 3)) \
7377 - __ret = __cmpxchg((ptr), (unsigned long)(o), \
7378 - (unsigned long)(n), sizeof(*(ptr))); \
7379 - else \
7380 - __ret = cmpxchg_386((ptr), (unsigned long)(o), \
7381 - (unsigned long)(n), sizeof(*(ptr))); \
7382 - __ret; \
7383 -})
7384 -#endif
7385 +/* Clear the 'TS' bit */
7386 +#define clts() (xen_clts())
7387
7388 -#ifdef CONFIG_X86_CMPXCHG64
7389 +/* Set the 'TS' bit */
7390 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
7391
7392 -static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
7393 - unsigned long long new)
7394 +#endif /* __KERNEL__ */
7395 +
7396 +static inline unsigned long get_limit(unsigned long segment)
7397 {
7398 - unsigned long long prev;
7399 - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
7400 - : "=A"(prev)
7401 - : "b"((unsigned long)new),
7402 - "c"((unsigned long)(new >> 32)),
7403 - "m"(*__xg(ptr)),
7404 - "0"(old)
7405 - : "memory");
7406 - return prev;
7407 -}
7408 -
7409 -#define cmpxchg64(ptr,o,n)\
7410 - ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
7411 - (unsigned long long)(n)))
7412 + unsigned long __limit;
7413 + __asm__("lsll %1,%0"
7414 + :"=r" (__limit):"r" (segment));
7415 + return __limit+1;
7416 +}
7417 +
7418 +#define nop() __asm__ __volatile__ ("nop")
7419
7420 -#endif
7421 -
7422 /*
7423 * Force strict CPU ordering.
7424 * And yes, this is required on UP too when we're talking
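
The large deletion above removes the open-coded xchg/cmpxchg helpers from system_32.h; in 2.6.22 they live in <asm/cmpxchg.h>, which the hunk now includes instead of <linux/bitops.h>. As a hedged reminder of the pattern callers build on top of cmpxchg(), here is a compare-and-swap retry loop in userspace C; the GCC builtin __sync_val_compare_and_swap stands in for the kernel macro, and cas_add() is a made-up example.

#include <stdio.h>

static unsigned long counter;

/* Atomically add 'delta' to *ptr using a compare-and-swap retry loop.
 * The loop re-reads the old value and retries whenever the location
 * changed between the read and the swap attempt. */
static unsigned long cas_add(unsigned long *ptr, unsigned long delta)
{
        unsigned long old, new;

        do {
                old = *ptr;
                new = old + delta;
        } while (__sync_val_compare_and_swap(ptr, old, new) != old);

        return new;
}

int main(void)
{
        printf("counter = %lu\n", cas_add(&counter, 5));
        printf("counter = %lu\n", cas_add(&counter, 3));
        return 0;
}
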
7425 --- a/include/asm-x86/mach-xen/asm/system_64.h
7426 +++ b/include/asm-x86/mach-xen/asm/system_64.h
7427 @@ -3,7 +3,7 @@
7428
7429 #include <linux/kernel.h>
7430 #include <asm/segment.h>
7431 -#include <asm/alternative.h>
7432 +#include <asm/cmpxchg.h>
7433
7434 #include <asm/synch_bitops.h>
7435 #include <asm/hypervisor.h>
7436 @@ -43,7 +43,7 @@
7437 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
7438 [ti_flags] "i" (offsetof(struct thread_info, flags)),\
7439 [tif_fork] "i" (TIF_FORK), \
7440 - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
7441 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
7442 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
7443 : "memory", "cc" __EXTRA_CLOBBER)
7444
7445 @@ -92,6 +92,12 @@ static inline void write_cr0(unsigned lo
7446 machine_to_phys(__dummy); \
7447 })
7448
7449 +static inline void write_cr3(unsigned long val)
7450 +{
7451 + val = phys_to_machine(val);
7452 + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
7453 +}
7454 +
7455 static inline unsigned long read_cr4(void)
7456 {
7457 unsigned long cr4;
7458 @@ -101,7 +107,7 @@ static inline unsigned long read_cr4(voi
7459
7460 static inline void write_cr4(unsigned long val)
7461 {
7462 - asm volatile("movq %0,%%cr4" :: "r" (val));
7463 + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
7464 }
7465
7466 #define stts() (HYPERVISOR_fpu_taskswitch(1))
7467 @@ -122,100 +128,6 @@ static inline void sched_cacheflush(void
7468
7469 #define nop() __asm__ __volatile__ ("nop")
7470
7471 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
7472 -
7473 -#define tas(ptr) (xchg((ptr),1))
7474 -
7475 -#define __xg(x) ((volatile long *)(x))
7476 -
7477 -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
7478 -{
7479 - *ptr = val;
7480 -}
7481 -
7482 -#define _set_64bit set_64bit
7483 -
7484 -/*
7485 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
7486 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
7487 - * but generally the primitive is invalid, *ptr is output argument. --ANK
7488 - */
7489 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
7490 -{
7491 - switch (size) {
7492 - case 1:
7493 - __asm__ __volatile__("xchgb %b0,%1"
7494 - :"=q" (x)
7495 - :"m" (*__xg(ptr)), "0" (x)
7496 - :"memory");
7497 - break;
7498 - case 2:
7499 - __asm__ __volatile__("xchgw %w0,%1"
7500 - :"=r" (x)
7501 - :"m" (*__xg(ptr)), "0" (x)
7502 - :"memory");
7503 - break;
7504 - case 4:
7505 - __asm__ __volatile__("xchgl %k0,%1"
7506 - :"=r" (x)
7507 - :"m" (*__xg(ptr)), "0" (x)
7508 - :"memory");
7509 - break;
7510 - case 8:
7511 - __asm__ __volatile__("xchgq %0,%1"
7512 - :"=r" (x)
7513 - :"m" (*__xg(ptr)), "0" (x)
7514 - :"memory");
7515 - break;
7516 - }
7517 - return x;
7518 -}
7519 -
7520 -/*
7521 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
7522 - * store NEW in MEM. Return the initial value in MEM. Success is
7523 - * indicated by comparing RETURN with OLD.
7524 - */
7525 -
7526 -#define __HAVE_ARCH_CMPXCHG 1
7527 -
7528 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
7529 - unsigned long new, int size)
7530 -{
7531 - unsigned long prev;
7532 - switch (size) {
7533 - case 1:
7534 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
7535 - : "=a"(prev)
7536 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7537 - : "memory");
7538 - return prev;
7539 - case 2:
7540 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
7541 - : "=a"(prev)
7542 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7543 - : "memory");
7544 - return prev;
7545 - case 4:
7546 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
7547 - : "=a"(prev)
7548 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7549 - : "memory");
7550 - return prev;
7551 - case 8:
7552 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
7553 - : "=a"(prev)
7554 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7555 - : "memory");
7556 - return prev;
7557 - }
7558 - return old;
7559 -}
7560 -
7561 -#define cmpxchg(ptr,o,n)\
7562 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
7563 - (unsigned long)(n),sizeof(*(ptr))))
7564 -
7565 #ifdef CONFIG_SMP
7566 #define smp_mb() mb()
7567 #define smp_rmb() rmb()
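
On the 64-bit side the hunk adds a "memory" clobber to write_cr4() and to the new Xen-aware write_cr3(), so the compiler treats the control-register write as a barrier and cannot cache or reorder memory accesses across it. A minimal userspace illustration of a "memory" clobber acting as a compiler barrier follows; the barrier() name echoes the kernel helper, but the program itself is only a sketch.

#include <stdio.h>

static int shared_flag;

/* An empty asm statement with a "memory" clobber: the compiler must
 * assume memory changed, so it cannot cache or move loads and stores
 * across this point. */
static inline void barrier(void)
{
        asm volatile("" ::: "memory");
}

int main(void)
{
        shared_flag = 1;
        barrier();              /* the store above cannot be sunk below this line */
        printf("flag = %d\n", shared_flag);
        return 0;
}
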
7568 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
7569 +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
7570 @@ -29,8 +29,13 @@
7571 * and page-granular flushes are available only on i486 and up.
7572 */
7573
7574 +#define TLB_FLUSH_ALL 0xffffffff
7575 +
7576 +
7577 #ifndef CONFIG_SMP
7578
7579 +#include <linux/sched.h>
7580 +
7581 #define flush_tlb() __flush_tlb()
7582 #define flush_tlb_all() __flush_tlb_all()
7583 #define local_flush_tlb() __flush_tlb()
7584 @@ -55,7 +60,7 @@ static inline void flush_tlb_range(struc
7585 __flush_tlb();
7586 }
7587
7588 -#else
7589 +#else /* SMP */
7590
7591 #include <asm/smp.h>
7592
7593 @@ -84,9 +89,7 @@ struct tlb_state
7594 char __cacheline_padding[L1_CACHE_BYTES-8];
7595 };
7596 DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
7597 -
7598 -
7599 -#endif
7600 +#endif /* SMP */
7601
7602 #define flush_tlb_kernel_range(start, end) flush_tlb_all()
7603
7604 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
7605 +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
7606 @@ -2,7 +2,9 @@
7607 #define _X8664_TLBFLUSH_H
7608
7609 #include <linux/mm.h>
7610 +#include <linux/sched.h>
7611 #include <asm/processor.h>
7612 +#include <asm/system.h>
7613
7614 #define __flush_tlb() xen_tlb_flush()
7615
7616 --- a/include/linux/pci.h
7617 +++ b/include/linux/pci.h
7618 @@ -239,7 +239,7 @@ struct pci_dev {
7619 int rom_attr_enabled; /* has display of the rom attribute been enabled? */
7620 struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
7621 struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
7622 -#ifdef CONFIG_PCI_MSI
7623 +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
7624 struct list_head msi_list;
7625 #endif
7626 struct pci_vpd *vpd;
7627 --- a/lib/swiotlb-xen.c
7628 +++ b/lib/swiotlb-xen.c
7629 @@ -723,7 +723,6 @@ swiotlb_dma_supported (struct device *hw
7630 return (mask >= ((1UL << dma_bits) - 1));
7631 }
7632
7633 -EXPORT_SYMBOL(swiotlb_init);
7634 EXPORT_SYMBOL(swiotlb_map_single);
7635 EXPORT_SYMBOL(swiotlb_unmap_single);
7636 EXPORT_SYMBOL(swiotlb_map_sg);
7637 --- a/net/core/dev.c
7638 +++ b/net/core/dev.c
7639 @@ -1744,12 +1744,17 @@ static struct netdev_queue *dev_pick_tx(
7640 inline int skb_checksum_setup(struct sk_buff *skb)
7641 {
7642 if (skb->proto_csum_blank) {
7643 + struct iphdr *iph;
7644 + unsigned char *th;
7645 +
7646 if (skb->protocol != htons(ETH_P_IP))
7647 goto out;
7648 - skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
7649 - if (skb->h.raw >= skb->tail)
7650 + iph = ip_hdr(skb);
7651 + th = skb_network_header(skb) + 4 * iph->ihl;
7652 + if (th >= skb_tail_pointer(skb))
7653 goto out;
7654 - switch (skb->nh.iph->protocol) {
7655 + skb->csum_start = th - skb->head;
7656 + switch (iph->protocol) {
7657 case IPPROTO_TCP:
7658 skb->csum_offset = offsetof(struct tcphdr, check);
7659 break;
7660 @@ -1760,10 +1765,10 @@ inline int skb_checksum_setup(struct sk_
7661 if (net_ratelimit())
7662 printk(KERN_ERR "Attempting to checksum a non-"
7663 "TCP/UDP packet, dropping a protocol"
7664 - " %d packet", skb->nh.iph->protocol);
7665 + " %d packet", iph->protocol);
7666 goto out;
7667 }
7668 - if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
7669 + if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
7670 goto out;
7671 skb->ip_summed = CHECKSUM_PARTIAL;
7672 skb->proto_csum_blank = 0;
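
The dev.c hunk rewrites skb_checksum_setup() using the 2.6.22 accessors (ip_hdr(), skb_network_header(), skb_tail_pointer()) and records csum_start, but the arithmetic is unchanged: the transport header begins 4 * ihl bytes after the network header, and the checksum sits at a protocol-specific offset within it. The sketch below shows that arithmetic with simplified stand-in structs; demo_iphdr and demo_tcphdr are illustrative layouts, not the kernel definitions.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the on-wire headers (field layout illustrative only). */
struct demo_iphdr {
        unsigned char  ihl;       /* header length in 32-bit words */
        unsigned char  protocol;  /* 6 = TCP, 17 = UDP */
};

struct demo_tcphdr {
        unsigned short source;
        unsigned short dest;
        unsigned int   seq;
        unsigned int   ack_seq;
        unsigned short flags;
        unsigned short window;
        unsigned short check;     /* checksum field to be filled in later */
        unsigned short urg_ptr;
};

int main(void)
{
        struct demo_iphdr iph = { .ihl = 5, .protocol = 6 };

        /* Transport header starts 4 * ihl bytes after the network header. */
        size_t transport_off = 4 * (size_t)iph.ihl;

        /* For TCP, the checksum lives at offsetof(tcphdr, check). */
        size_t csum_off = offsetof(struct demo_tcphdr, check);

        printf("transport header offset: %zu bytes\n", transport_off);
        printf("checksum offset within TCP header: %zu bytes\n", csum_off);
        return 0;
}
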
7673 --- a/scripts/Makefile.xen.awk
7674 +++ b/scripts/Makefile.xen.awk
7675 @@ -13,7 +13,7 @@ BEGIN {
7676 next
7677 }
7678
7679 -/:[[:space:]]*%\.[cS][[:space:]]/ {
7680 +/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
7681 line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
7682 line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
7683 print line