]>
Commit | Line | Data |
---|---|---|
cc90b958 BS |
1 | Subject: xen3 xen-arch |
2 | From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd) | |
3 | Patch-mainline: obsolete | |
4 | Acked-by: jbeulich@novell.com | |
5 | ||
6 | List of files having Xen derivatives (perhaps created during the merging | |
7 | of newer kernel versions), for xen-port-patches.py to pick up (i.e. this | |
8 | must be retained here until the XenSource tree has these in the right | |
9 | places): | |
10 | +++ linux/arch/x86/kernel/acpi/sleep-xen.c | |
11 | +++ linux/arch/x86/kernel/cpu/common_64-xen.c | |
12 | +++ linux/arch/x86/kernel/e820-xen.c | |
13 | +++ linux/arch/x86/kernel/head-xen.c | |
14 | +++ linux/arch/x86/kernel/head32-xen.c | |
15 | +++ linux/arch/x86/kernel/ioport-xen.c | |
16 | +++ linux/arch/x86/kernel/ipi-xen.c | |
17 | +++ linux/arch/x86/kernel/ldt-xen.c | |
18 | +++ linux/arch/x86/kernel/mpparse-xen.c | |
19 | +++ linux/arch/x86/kernel/pci-nommu-xen.c | |
20 | +++ linux/arch/x86/kernel/process-xen.c | |
21 | +++ linux/arch/x86/kernel/setup-xen.c | |
22 | +++ linux/arch/x86/kernel/setup_percpu-xen.c | |
23 | +++ linux/arch/x86/kernel/smp-xen.c | |
24 | +++ linux/arch/x86/mm/fault-xen.c | |
25 | +++ linux/arch/x86/mm/ioremap-xen.c | |
26 | +++ linux/arch/x86/mm/pageattr-xen.c | |
27 | +++ linux/arch/x86/mm/pat-xen.c | |
28 | +++ linux/arch/x86/mm/pgtable-xen.c | |
29 | +++ linux/arch/x86/vdso/vdso32-setup-xen.c | |
30 | +++ linux/drivers/char/mem-xen.c | |
31 | +++ linux/include/asm-x86/mach-xen/asm/desc.h | |
32 | +++ linux/include/asm-x86/mach-xen/asm/dma-mapping.h | |
33 | +++ linux/include/asm-x86/mach-xen/asm/fixmap.h | |
34 | +++ linux/include/asm-x86/mach-xen/asm/io.h | |
35 | +++ linux/include/asm-x86/mach-xen/asm/irq_vectors.h | |
36 | +++ linux/include/asm-x86/mach-xen/asm/irqflags.h | |
37 | +++ linux/include/asm-x86/mach-xen/asm/mmu_context.h | |
38 | +++ linux/include/asm-x86/mach-xen/asm/page.h | |
39 | +++ linux/include/asm-x86/mach-xen/asm/pci.h | |
40 | +++ linux/include/asm-x86/mach-xen/asm/pgalloc.h | |
41 | +++ linux/include/asm-x86/mach-xen/asm/pgtable.h | |
42 | +++ linux/include/asm-x86/mach-xen/asm/processor.h | |
43 | +++ linux/include/asm-x86/mach-xen/asm/segment.h | |
44 | +++ linux/include/asm-x86/mach-xen/asm/smp.h | |
45 | +++ linux/include/asm-x86/mach-xen/asm/spinlock.h | |
46 | +++ linux/include/asm-x86/mach-xen/asm/swiotlb.h | |
47 | +++ linux/include/asm-x86/mach-xen/asm/system.h | |
48 | +++ linux/include/asm-x86/mach-xen/asm/tlbflush.h | |
49 | +++ linux/include/asm-x86/mach-xen/asm/xor.h | |
50 | ||
51 | List of files folded into their native counterparts (and hence removed | |
52 | from this patch for xen-port-patches.py to not needlessly pick them up; | |
53 | for reference, prefixed with the version the removal occurred): | |
54 | 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level.h | |
55 | 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level-defs.h | |
56 | 2.6.19/include/asm-x86/mach-xen/asm/ptrace.h | |
57 | 2.6.23/arch/x86/kernel/vsyscall-note_32-xen.S | |
58 | 2.6.23/include/asm-x86/mach-xen/asm/ptrace_64.h | |
59 | 2.6.24/arch/x86/kernel/early_printk_32-xen.c | |
60 | 2.6.24/include/asm-x86/mach-xen/asm/arch_hooks_64.h | |
61 | 2.6.24/include/asm-x86/mach-xen/asm/bootsetup_64.h | |
62 | 2.6.24/include/asm-x86/mach-xen/asm/mmu_32.h | |
63 | 2.6.24/include/asm-x86/mach-xen/asm/mmu_64.h | |
64 | 2.6.24/include/asm-x86/mach-xen/asm/nmi_64.h | |
65 | 2.6.24/include/asm-x86/mach-xen/asm/setup.h | |
66 | 2.6.24/include/asm-x86/mach-xen/asm/time_64.h (added in 2.6.20) | |
67 | 2.6.25/arch/x86/ia32/syscall32-xen.c | |
68 | 2.6.25/arch/x86/ia32/syscall32_syscall-xen.S | |
69 | 2.6.25/arch/x86/ia32/vsyscall-int80.S | |
70 | 2.6.25/arch/x86/kernel/acpi/boot-xen.c | |
71 | 2.6.25/include/asm-x86/mach-xen/asm/msr.h | |
72 | 2.6.25/include/asm-x86/mach-xen/asm/page_32.h | |
73 | 2.6.25/include/asm-x86/mach-xen/asm/spinlock_32.h | |
74 | 2.6.25/include/asm-x86/mach-xen/asm/timer.h (added in 2.6.24) | |
75 | 2.6.25/include/asm-x86/mach-xen/asm/timer_64.h | |
76 | 2.6.26/arch/x86/kernel/pci-dma_32-xen.c | |
77 | 2.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c | |
78 | 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_32.h | |
79 | 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_64.h | |
80 | 2.6.26/include/asm-x86/mach-xen/asm/nmi.h (added in 2.6.24) | |
81 | 2.6.26/include/asm-x86/mach-xen/asm/scatterlist.h (added in 2.6.24) | |
82 | 2.6.26/include/asm-x86/mach-xen/asm/scatterlist_32.h | |
83 | 2.6.26/include/xen/xencomm.h | |
84 | 2.6.27/arch/x86/kernel/e820_32-xen.c | |
85 | 2.6.27/include/asm-x86/mach-xen/asm/e820.h (added in 2.6.24) | |
86 | 2.6.27/include/asm-x86/mach-xen/asm/e820_64.h | |
87 | 2.6.27/include/asm-x86/mach-xen/asm/hw_irq.h (added in 2.6.24) | |
88 | 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_32.h | |
89 | 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_64.h | |
90 | 2.6.27/include/asm-x86/mach-xen/asm/irq.h (added in 2.6.24) | |
91 | 2.6.27/include/asm-x86/mach-xen/asm/irq_64.h | |
92 | ||
93 | Index: head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c | |
94 | =================================================================== | |
95 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
96 | +++ head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c 2008-10-01 15:43:24.000000000 +0200 | |
97 | @@ -0,0 +1,209 @@ | |
98 | +/* | |
99 | + * processor_extcntl_xen.c - interface to notify Xen | |
100 | + * | |
101 | + * Copyright (C) 2008, Intel corporation | |
102 | + * | |
103 | + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
104 | + * | |
105 | + * This program is free software; you can redistribute it and/or modify | |
106 | + * it under the terms of the GNU General Public License as published by | |
107 | + * the Free Software Foundation; either version 2 of the License, or (at | |
108 | + * your option) any later version. | |
109 | + * | |
110 | + * This program is distributed in the hope that it will be useful, but | |
111 | + * WITHOUT ANY WARRANTY; without even the implied warranty of | |
112 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
113 | + * General Public License for more details. | |
114 | + * | |
115 | + * You should have received a copy of the GNU General Public License along | |
116 | + * with this program; if not, write to the Free Software Foundation, Inc., | |
117 | + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. | |
118 | + * | |
119 | + */ | |
120 | + | |
121 | +#include <linux/kernel.h> | |
122 | +#include <linux/init.h> | |
123 | +#include <linux/types.h> | |
124 | +#include <linux/acpi.h> | |
125 | +#include <linux/pm.h> | |
126 | +#include <linux/cpu.h> | |
127 | + | |
128 | +#include <linux/cpufreq.h> | |
129 | +#include <acpi/processor.h> | |
130 | +#include <asm/hypercall.h> | |
131 | + | |
132 | +static int xen_cx_notifier(struct acpi_processor *pr, int action) | |
133 | +{ | |
134 | + int ret, count = 0, i; | |
135 | + xen_platform_op_t op = { | |
136 | + .cmd = XENPF_set_processor_pminfo, | |
137 | + .interface_version = XENPF_INTERFACE_VERSION, | |
138 | + .u.set_pminfo.id = pr->acpi_id, | |
139 | + .u.set_pminfo.type = XEN_PM_CX, | |
140 | + }; | |
141 | + struct xen_processor_cx *data, *buf; | |
142 | + struct acpi_processor_cx *cx; | |
143 | + | |
144 | + if (action == PROCESSOR_PM_CHANGE) | |
145 | + return -EINVAL; | |
146 | + | |
147 | + /* Convert to Xen defined structure and hypercall */ | |
148 | + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), | |
149 | + GFP_KERNEL); | |
150 | + if (!buf) | |
151 | + return -ENOMEM; | |
152 | + | |
153 | + data = buf; | |
154 | + for (i = 1; i <= pr->power.count; i++) { | |
155 | + cx = &pr->power.states[i]; | |
156 | + /* Skip invalid cstate entry */ | |
157 | + if (!cx->valid) | |
158 | + continue; | |
159 | + | |
160 | + data->type = cx->type; | |
161 | + data->latency = cx->latency; | |
162 | + data->power = cx->power; | |
163 | + data->reg.space_id = cx->reg.space_id; | |
164 | + data->reg.bit_width = cx->reg.bit_width; | |
165 | + data->reg.bit_offset = cx->reg.bit_offset; | |
166 | + data->reg.access_size = cx->reg.reserved; | |
167 | + data->reg.address = cx->reg.address; | |
168 | + | |
169 | + /* Get dependency relationships */ | |
170 | + if (cx->csd_count) { | |
171 | + printk("Wow! _CSD is found. Not support for now!\n"); | |
172 | + kfree(buf); | |
173 | + return -EINVAL; | |
174 | + } else { | |
175 | + data->dpcnt = 0; | |
176 | + set_xen_guest_handle(data->dp, NULL); | |
177 | + } | |
178 | + | |
179 | + data++; | |
180 | + count++; | |
181 | + } | |
182 | + | |
183 | + if (!count) { | |
184 | + printk("No available Cx info for cpu %d\n", pr->acpi_id); | |
185 | + kfree(buf); | |
186 | + return -EINVAL; | |
187 | + } | |
188 | + | |
189 | + op.u.set_pminfo.power.count = count; | |
190 | + op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control; | |
191 | + op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check; | |
192 | + op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst; | |
193 | + op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done; | |
194 | + | |
195 | + set_xen_guest_handle(op.u.set_pminfo.power.states, buf); | |
196 | + ret = HYPERVISOR_platform_op(&op); | |
197 | + kfree(buf); | |
198 | + return ret; | |
199 | +} | |
200 | + | |
201 | +static int xen_px_notifier(struct acpi_processor *pr, int action) | |
202 | +{ | |
203 | + int ret = -EINVAL; | |
204 | + xen_platform_op_t op = { | |
205 | + .cmd = XENPF_set_processor_pminfo, | |
206 | + .interface_version = XENPF_INTERFACE_VERSION, | |
207 | + .u.set_pminfo.id = pr->acpi_id, | |
208 | + .u.set_pminfo.type = XEN_PM_PX, | |
209 | + }; | |
210 | + struct xen_processor_performance *perf; | |
211 | + struct xen_processor_px *states = NULL; | |
212 | + struct acpi_processor_performance *px; | |
213 | + struct acpi_psd_package *pdomain; | |
214 | + | |
215 | + if (!pr) | |
216 | + return -EINVAL; | |
217 | + | |
218 | + perf = &op.u.set_pminfo.perf; | |
219 | + px = pr->performance; | |
220 | + | |
221 | + switch(action) { | |
222 | + case PROCESSOR_PM_CHANGE: | |
223 | + /* ppc dynamic handle */ | |
224 | + perf->flags = XEN_PX_PPC; | |
225 | + perf->platform_limit = pr->performance_platform_limit; | |
226 | + | |
227 | + ret = HYPERVISOR_platform_op(&op); | |
228 | + break; | |
229 | + | |
230 | + case PROCESSOR_PM_INIT: | |
231 | + /* px normal init */ | |
232 | + perf->flags = XEN_PX_PPC | | |
233 | + XEN_PX_PCT | | |
234 | + XEN_PX_PSS | | |
235 | + XEN_PX_PSD; | |
236 | + | |
237 | + /* ppc */ | |
238 | + perf->platform_limit = pr->performance_platform_limit; | |
239 | + | |
240 | + /* pct */ | |
241 | + xen_convert_pct_reg(&perf->control_register, &px->control_register); | |
242 | + xen_convert_pct_reg(&perf->status_register, &px->status_register); | |
243 | + | |
244 | + /* pss */ | |
245 | + perf->state_count = px->state_count; | |
246 | + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL); | |
247 | + if (!states) | |
248 | + return -ENOMEM; | |
249 | + xen_convert_pss_states(states, px->states, px->state_count); | |
250 | + set_xen_guest_handle(perf->states, states); | |
251 | + | |
252 | + /* psd */ | |
253 | + pdomain = &px->domain_info; | |
254 | + xen_convert_psd_pack(&perf->domain_info, pdomain); | |
255 | + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) | |
256 | + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; | |
257 | + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) | |
258 | + perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; | |
259 | + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) | |
260 | + perf->shared_type = CPUFREQ_SHARED_TYPE_HW; | |
261 | + else { | |
262 | + ret = -ENODEV; | |
263 | + kfree(states); | |
264 | + break; | |
265 | + } | |
266 | + | |
267 | + ret = HYPERVISOR_platform_op(&op); | |
268 | + kfree(states); | |
269 | + break; | |
270 | + | |
271 | + default: | |
272 | + break; | |
273 | + } | |
274 | + | |
275 | + return ret; | |
276 | +} | |
277 | + | |
278 | +static int xen_tx_notifier(struct acpi_processor *pr, int action) | |
279 | +{ | |
280 | + return -EINVAL; | |
281 | +} | |
282 | +static int xen_hotplug_notifier(struct acpi_processor *pr, int event) | |
283 | +{ | |
284 | + return -EINVAL; | |
285 | +} | |
286 | + | |
287 | +static struct processor_extcntl_ops xen_extcntl_ops = { | |
288 | + .hotplug = xen_hotplug_notifier, | |
289 | +}; | |
290 | + | |
291 | +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops) | |
292 | +{ | |
293 | + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; | |
294 | + | |
295 | + if (!pmbits) | |
296 | + return; | |
297 | + if (pmbits & XEN_PROCESSOR_PM_CX) | |
298 | + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; | |
299 | + if (pmbits & XEN_PROCESSOR_PM_PX) | |
300 | + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; | |
301 | + if (pmbits & XEN_PROCESSOR_PM_TX) | |
302 | + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; | |
303 | + | |
304 | + *ops = &xen_extcntl_ops; | |
305 | +} | |
306 | +EXPORT_SYMBOL(arch_acpi_processor_init_extcntl); | |
307 | Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c | |
308 | =================================================================== | |
309 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
310 | +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200 | |
311 | @@ -0,0 +1,113 @@ | |
312 | +/* | |
313 | + * sleep.c - x86-specific ACPI sleep support. | |
314 | + * | |
315 | + * Copyright (C) 2001-2003 Patrick Mochel | |
316 | + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> | |
317 | + */ | |
318 | + | |
319 | +#include <linux/acpi.h> | |
320 | +#include <linux/bootmem.h> | |
321 | +#include <linux/dmi.h> | |
322 | +#include <linux/cpumask.h> | |
323 | + | |
324 | +#include <asm/smp.h> | |
325 | + | |
326 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
327 | +/* address in low memory of the wakeup routine. */ | |
328 | +unsigned long acpi_wakeup_address = 0; | |
329 | +unsigned long acpi_video_flags; | |
330 | +extern char wakeup_start, wakeup_end; | |
331 | + | |
332 | +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); | |
333 | +#endif | |
334 | + | |
335 | +/** | |
336 | + * acpi_save_state_mem - save kernel state | |
337 | + * | |
338 | + * Create an identity mapped page table and copy the wakeup routine to | |
339 | + * low memory. | |
340 | + */ | |
341 | +int acpi_save_state_mem(void) | |
342 | +{ | |
343 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
344 | + if (!acpi_wakeup_address) | |
345 | + return 1; | |
346 | + memcpy((void *)acpi_wakeup_address, &wakeup_start, | |
347 | + &wakeup_end - &wakeup_start); | |
348 | + acpi_copy_wakeup_routine(acpi_wakeup_address); | |
349 | +#endif | |
350 | + return 0; | |
351 | +} | |
352 | + | |
353 | +/* | |
354 | + * acpi_restore_state - undo effects of acpi_save_state_mem | |
355 | + */ | |
356 | +void acpi_restore_state_mem(void) | |
357 | +{ | |
358 | +} | |
359 | + | |
360 | +/** | |
361 | + * acpi_reserve_bootmem - do _very_ early ACPI initialisation | |
362 | + * | |
363 | + * We allocate a page from the first 1MB of memory for the wakeup | |
364 | + * routine for when we come back from a sleep state. The | |
365 | + * runtime allocator allows specification of <16MB pages, but not | |
366 | + * <1MB pages. | |
367 | + */ | |
368 | +void __init acpi_reserve_bootmem(void) | |
369 | +{ | |
370 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
371 | + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { | |
372 | + printk(KERN_ERR | |
373 | + "ACPI: Wakeup code way too big, S3 disabled.\n"); | |
374 | + return; | |
375 | + } | |
376 | + | |
377 | + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); | |
378 | + if (!acpi_wakeup_address) | |
379 | + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); | |
380 | +#endif | |
381 | +} | |
382 | + | |
383 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
384 | +static int __init acpi_sleep_setup(char *str) | |
385 | +{ | |
386 | + while ((str != NULL) && (*str != '\0')) { | |
387 | + if (strncmp(str, "s3_bios", 7) == 0) | |
388 | + acpi_video_flags = 1; | |
389 | + if (strncmp(str, "s3_mode", 7) == 0) | |
390 | + acpi_video_flags |= 2; | |
391 | + str = strchr(str, ','); | |
392 | + if (str != NULL) | |
393 | + str += strspn(str, ", \t"); | |
394 | + } | |
395 | + return 1; | |
396 | +} | |
397 | + | |
398 | +__setup("acpi_sleep=", acpi_sleep_setup); | |
399 | + | |
400 | +static __init int reset_videomode_after_s3(struct dmi_system_id *d) | |
401 | +{ | |
402 | + acpi_video_flags |= 2; | |
403 | + return 0; | |
404 | +} | |
405 | + | |
406 | +static __initdata struct dmi_system_id acpisleep_dmi_table[] = { | |
407 | + { /* Reset video mode after returning from ACPI S3 sleep */ | |
408 | + .callback = reset_videomode_after_s3, | |
409 | + .ident = "Toshiba Satellite 4030cdt", | |
410 | + .matches = { | |
411 | + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), | |
412 | + }, | |
413 | + }, | |
414 | + {} | |
415 | +}; | |
416 | + | |
417 | +static int __init acpisleep_dmi_init(void) | |
418 | +{ | |
419 | + dmi_check_system(acpisleep_dmi_table); | |
420 | + return 0; | |
421 | +} | |
422 | + | |
423 | +core_initcall(acpisleep_dmi_init); | |
424 | +#endif /* CONFIG_ACPI_PV_SLEEP */ | |
425 | Index: head-2008-11-25/arch/x86/kernel/apic_32-xen.c | |
426 | =================================================================== | |
427 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
428 | +++ head-2008-11-25/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200 | |
429 | @@ -0,0 +1,155 @@ | |
430 | +/* | |
431 | + * Local APIC handling, local APIC timers | |
432 | + * | |
433 | + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | |
434 | + * | |
435 | + * Fixes | |
436 | + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | |
437 | + * thanks to Eric Gilmore | |
438 | + * and Rolf G. Tews | |
439 | + * for testing these extensively. | |
440 | + * Maciej W. Rozycki : Various updates and fixes. | |
441 | + * Mikael Pettersson : Power Management for UP-APIC. | |
442 | + * Pavel Machek and | |
443 | + * Mikael Pettersson : PM converted to driver model. | |
444 | + */ | |
445 | + | |
446 | +#include <linux/init.h> | |
447 | + | |
448 | +#include <linux/mm.h> | |
449 | +#include <linux/delay.h> | |
450 | +#include <linux/bootmem.h> | |
451 | +#include <linux/smp_lock.h> | |
452 | +#include <linux/interrupt.h> | |
453 | +#include <linux/mc146818rtc.h> | |
454 | +#include <linux/kernel_stat.h> | |
455 | +#include <linux/sysdev.h> | |
456 | +#include <linux/cpu.h> | |
457 | +#include <linux/module.h> | |
458 | + | |
459 | +#include <asm/atomic.h> | |
460 | +#include <asm/smp.h> | |
461 | +#include <asm/mtrr.h> | |
462 | +#include <asm/mpspec.h> | |
463 | +#include <asm/desc.h> | |
464 | +#include <asm/arch_hooks.h> | |
465 | +#include <asm/hpet.h> | |
466 | +#include <asm/i8253.h> | |
467 | +#include <asm/nmi.h> | |
468 | + | |
469 | +#include <mach_apic.h> | |
470 | +#include <mach_apicdef.h> | |
471 | +#include <mach_ipi.h> | |
472 | + | |
473 | +#include "io_ports.h" | |
474 | + | |
475 | +#ifndef CONFIG_XEN | |
476 | +/* | |
477 | + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as | |
478 | + * IPIs in place of local APIC timers | |
479 | + */ | |
480 | +static cpumask_t timer_bcast_ipi; | |
481 | +#endif | |
482 | + | |
483 | +/* | |
484 | + * Knob to control our willingness to enable the local APIC. | |
485 | + */ | |
486 | +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ | |
487 | + | |
488 | +/* | |
489 | + * Debug level | |
490 | + */ | |
491 | +int apic_verbosity; | |
492 | + | |
493 | +#ifndef CONFIG_XEN | |
494 | +static int modern_apic(void) | |
495 | +{ | |
496 | + unsigned int lvr, version; | |
497 | + /* AMD systems use old APIC versions, so check the CPU */ | |
498 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | |
499 | + boot_cpu_data.x86 >= 0xf) | |
500 | + return 1; | |
501 | + lvr = apic_read(APIC_LVR); | |
502 | + version = GET_APIC_VERSION(lvr); | |
503 | + return version >= 0x14; | |
504 | +} | |
505 | +#endif /* !CONFIG_XEN */ | |
506 | + | |
507 | +/* | |
508 | + * 'what should we do if we get a hw irq event on an illegal vector'. | |
509 | + * each architecture has to answer this themselves. | |
510 | + */ | |
511 | +void ack_bad_irq(unsigned int irq) | |
512 | +{ | |
513 | + printk("unexpected IRQ trap at vector %02x\n", irq); | |
514 | + /* | |
515 | + * Currently unexpected vectors happen only on SMP and APIC. | |
516 | + * We _must_ ack these because every local APIC has only N | |
517 | + * irq slots per priority level, and a 'hanging, unacked' IRQ | |
518 | + * holds up an irq slot - in excessive cases (when multiple | |
519 | + * unexpected vectors occur) that might lock up the APIC | |
520 | + * completely. | |
521 | + * But only ack when the APIC is enabled -AK | |
522 | + */ | |
523 | + if (cpu_has_apic) | |
524 | + ack_APIC_irq(); | |
525 | +} | |
526 | + | |
527 | +int get_physical_broadcast(void) | |
528 | +{ | |
529 | + return 0xff; | |
530 | +} | |
531 | + | |
532 | +#ifndef CONFIG_XEN | |
533 | +#ifndef CONFIG_SMP | |
534 | +static void up_apic_timer_interrupt_call(struct pt_regs *regs) | |
535 | +{ | |
536 | + int cpu = smp_processor_id(); | |
537 | + | |
538 | + /* | |
539 | + * the NMI deadlock-detector uses this. | |
540 | + */ | |
541 | + per_cpu(irq_stat, cpu).apic_timer_irqs++; | |
542 | + | |
543 | + smp_local_timer_interrupt(regs); | |
544 | +} | |
545 | +#endif | |
546 | + | |
547 | +void smp_send_timer_broadcast_ipi(struct pt_regs *regs) | |
548 | +{ | |
549 | + cpumask_t mask; | |
550 | + | |
551 | + cpus_and(mask, cpu_online_map, timer_bcast_ipi); | |
552 | + if (!cpus_empty(mask)) { | |
553 | +#ifdef CONFIG_SMP | |
554 | + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); | |
555 | +#else | |
556 | + /* | |
557 | + * We can directly call the apic timer interrupt handler | |
558 | + * in UP case. Minus all irq related functions | |
559 | + */ | |
560 | + up_apic_timer_interrupt_call(regs); | |
561 | +#endif | |
562 | + } | |
563 | +} | |
564 | +#endif | |
565 | + | |
566 | +int setup_profiling_timer(unsigned int multiplier) | |
567 | +{ | |
568 | + return -EINVAL; | |
569 | +} | |
570 | + | |
571 | +/* | |
572 | + * This initializes the IO-APIC and APIC hardware if this is | |
573 | + * a UP kernel. | |
574 | + */ | |
575 | +int __init APIC_init_uniprocessor (void) | |
576 | +{ | |
577 | +#ifdef CONFIG_X86_IO_APIC | |
578 | + if (smp_found_config) | |
579 | + if (!skip_ioapic_setup && nr_ioapics) | |
580 | + setup_IO_APIC(); | |
581 | +#endif | |
582 | + | |
583 | + return 0; | |
584 | +} | |
585 | Index: head-2008-11-25/arch/x86/kernel/cpu/common-xen.c | |
586 | =================================================================== | |
587 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
588 | +++ head-2008-11-25/arch/x86/kernel/cpu/common-xen.c 2007-12-10 08:47:31.000000000 +0100 | |
589 | @@ -0,0 +1,743 @@ | |
590 | +#include <linux/init.h> | |
591 | +#include <linux/string.h> | |
592 | +#include <linux/delay.h> | |
593 | +#include <linux/smp.h> | |
594 | +#include <linux/module.h> | |
595 | +#include <linux/percpu.h> | |
596 | +#include <linux/bootmem.h> | |
597 | +#include <asm/semaphore.h> | |
598 | +#include <asm/processor.h> | |
599 | +#include <asm/i387.h> | |
600 | +#include <asm/msr.h> | |
601 | +#include <asm/io.h> | |
602 | +#include <asm/mmu_context.h> | |
603 | +#include <asm/mtrr.h> | |
604 | +#include <asm/mce.h> | |
605 | +#ifdef CONFIG_X86_LOCAL_APIC | |
606 | +#include <asm/mpspec.h> | |
607 | +#include <asm/apic.h> | |
608 | +#include <mach_apic.h> | |
609 | +#else | |
610 | +#ifdef CONFIG_XEN | |
611 | +#define phys_pkg_id(a,b) a | |
612 | +#endif | |
613 | +#endif | |
614 | +#include <asm/hypervisor.h> | |
615 | + | |
616 | +#include "cpu.h" | |
617 | + | |
618 | +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); | |
619 | +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); | |
620 | + | |
621 | +#ifndef CONFIG_XEN | |
622 | +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); | |
623 | +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); | |
624 | +#endif | |
625 | + | |
626 | +static int cachesize_override __cpuinitdata = -1; | |
627 | +static int disable_x86_fxsr __cpuinitdata; | |
628 | +static int disable_x86_serial_nr __cpuinitdata = 1; | |
629 | +static int disable_x86_sep __cpuinitdata; | |
630 | + | |
631 | +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; | |
632 | + | |
633 | +extern int disable_pse; | |
634 | + | |
635 | +static void default_init(struct cpuinfo_x86 * c) | |
636 | +{ | |
637 | + /* Not much we can do here... */ | |
638 | + /* Check if at least it has cpuid */ | |
639 | + if (c->cpuid_level == -1) { | |
640 | + /* No cpuid. It must be an ancient CPU */ | |
641 | + if (c->x86 == 4) | |
642 | + strcpy(c->x86_model_id, "486"); | |
643 | + else if (c->x86 == 3) | |
644 | + strcpy(c->x86_model_id, "386"); | |
645 | + } | |
646 | +} | |
647 | + | |
648 | +static struct cpu_dev default_cpu = { | |
649 | + .c_init = default_init, | |
650 | + .c_vendor = "Unknown", | |
651 | +}; | |
652 | +static struct cpu_dev * this_cpu = &default_cpu; | |
653 | + | |
654 | +static int __init cachesize_setup(char *str) | |
655 | +{ | |
656 | + get_option (&str, &cachesize_override); | |
657 | + return 1; | |
658 | +} | |
659 | +__setup("cachesize=", cachesize_setup); | |
660 | + | |
661 | +int __cpuinit get_model_name(struct cpuinfo_x86 *c) | |
662 | +{ | |
663 | + unsigned int *v; | |
664 | + char *p, *q; | |
665 | + | |
666 | + if (cpuid_eax(0x80000000) < 0x80000004) | |
667 | + return 0; | |
668 | + | |
669 | + v = (unsigned int *) c->x86_model_id; | |
670 | + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | |
671 | + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | |
672 | + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | |
673 | + c->x86_model_id[48] = 0; | |
674 | + | |
675 | + /* Intel chips right-justify this string for some dumb reason; | |
676 | + undo that brain damage */ | |
677 | + p = q = &c->x86_model_id[0]; | |
678 | + while ( *p == ' ' ) | |
679 | + p++; | |
680 | + if ( p != q ) { | |
681 | + while ( *p ) | |
682 | + *q++ = *p++; | |
683 | + while ( q <= &c->x86_model_id[48] ) | |
684 | + *q++ = '\0'; /* Zero-pad the rest */ | |
685 | + } | |
686 | + | |
687 | + return 1; | |
688 | +} | |
689 | + | |
690 | + | |
691 | +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | |
692 | +{ | |
693 | + unsigned int n, dummy, ecx, edx, l2size; | |
694 | + | |
695 | + n = cpuid_eax(0x80000000); | |
696 | + | |
697 | + if (n >= 0x80000005) { | |
698 | + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); | |
699 | + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | |
700 | + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | |
701 | + c->x86_cache_size=(ecx>>24)+(edx>>24); | |
702 | + } | |
703 | + | |
704 | + if (n < 0x80000006) /* Some chips just has a large L1. */ | |
705 | + return; | |
706 | + | |
707 | + ecx = cpuid_ecx(0x80000006); | |
708 | + l2size = ecx >> 16; | |
709 | + | |
710 | + /* do processor-specific cache resizing */ | |
711 | + if (this_cpu->c_size_cache) | |
712 | + l2size = this_cpu->c_size_cache(c,l2size); | |
713 | + | |
714 | + /* Allow user to override all this if necessary. */ | |
715 | + if (cachesize_override != -1) | |
716 | + l2size = cachesize_override; | |
717 | + | |
718 | + if ( l2size == 0 ) | |
719 | + return; /* Again, no L2 cache is possible */ | |
720 | + | |
721 | + c->x86_cache_size = l2size; | |
722 | + | |
723 | + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | |
724 | + l2size, ecx & 0xFF); | |
725 | +} | |
726 | + | |
727 | +/* Naming convention should be: <Name> [(<Codename>)] */ | |
728 | +/* This table only is used unless init_<vendor>() below doesn't set it; */ | |
729 | +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ | |
730 | + | |
731 | +/* Look up CPU names by table lookup. */ | |
732 | +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) | |
733 | +{ | |
734 | + struct cpu_model_info *info; | |
735 | + | |
736 | + if ( c->x86_model >= 16 ) | |
737 | + return NULL; /* Range check */ | |
738 | + | |
739 | + if (!this_cpu) | |
740 | + return NULL; | |
741 | + | |
742 | + info = this_cpu->c_models; | |
743 | + | |
744 | + while (info && info->family) { | |
745 | + if (info->family == c->x86) | |
746 | + return info->model_names[c->x86_model]; | |
747 | + info++; | |
748 | + } | |
749 | + return NULL; /* Not found */ | |
750 | +} | |
751 | + | |
752 | + | |
753 | +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) | |
754 | +{ | |
755 | + char *v = c->x86_vendor_id; | |
756 | + int i; | |
757 | + static int printed; | |
758 | + | |
759 | + for (i = 0; i < X86_VENDOR_NUM; i++) { | |
760 | + if (cpu_devs[i]) { | |
761 | + if (!strcmp(v,cpu_devs[i]->c_ident[0]) || | |
762 | + (cpu_devs[i]->c_ident[1] && | |
763 | + !strcmp(v,cpu_devs[i]->c_ident[1]))) { | |
764 | + c->x86_vendor = i; | |
765 | + if (!early) | |
766 | + this_cpu = cpu_devs[i]; | |
767 | + return; | |
768 | + } | |
769 | + } | |
770 | + } | |
771 | + if (!printed) { | |
772 | + printed++; | |
773 | + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); | |
774 | + printk(KERN_ERR "CPU: Your system may be unstable.\n"); | |
775 | + } | |
776 | + c->x86_vendor = X86_VENDOR_UNKNOWN; | |
777 | + this_cpu = &default_cpu; | |
778 | +} | |
779 | + | |
780 | + | |
781 | +static int __init x86_fxsr_setup(char * s) | |
782 | +{ | |
783 | + disable_x86_fxsr = 1; | |
784 | + return 1; | |
785 | +} | |
786 | +__setup("nofxsr", x86_fxsr_setup); | |
787 | + | |
788 | + | |
789 | +static int __init x86_sep_setup(char * s) | |
790 | +{ | |
791 | + disable_x86_sep = 1; | |
792 | + return 1; | |
793 | +} | |
794 | +__setup("nosep", x86_sep_setup); | |
795 | + | |
796 | + | |
797 | +/* Standard macro to see if a specific flag is changeable */ | |
798 | +static inline int flag_is_changeable_p(u32 flag) | |
799 | +{ | |
800 | + u32 f1, f2; | |
801 | + | |
802 | + asm("pushfl\n\t" | |
803 | + "pushfl\n\t" | |
804 | + "popl %0\n\t" | |
805 | + "movl %0,%1\n\t" | |
806 | + "xorl %2,%0\n\t" | |
807 | + "pushl %0\n\t" | |
808 | + "popfl\n\t" | |
809 | + "pushfl\n\t" | |
810 | + "popl %0\n\t" | |
811 | + "popfl\n\t" | |
812 | + : "=&r" (f1), "=&r" (f2) | |
813 | + : "ir" (flag)); | |
814 | + | |
815 | + return ((f1^f2) & flag) != 0; | |
816 | +} | |
817 | + | |
818 | + | |
819 | +/* Probe for the CPUID instruction */ | |
820 | +static int __cpuinit have_cpuid_p(void) | |
821 | +{ | |
822 | + return flag_is_changeable_p(X86_EFLAGS_ID); | |
823 | +} | |
824 | + | |
825 | +/* Do minimum CPU detection early. | |
826 | + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. | |
827 | + The others are not touched to avoid unwanted side effects. | |
828 | + | |
829 | + WARNING: this function is only called on the BP. Don't add code here | |
830 | + that is supposed to run on all CPUs. */ | |
831 | +static void __init early_cpu_detect(void) | |
832 | +{ | |
833 | + struct cpuinfo_x86 *c = &boot_cpu_data; | |
834 | + | |
835 | + c->x86_cache_alignment = 32; | |
836 | + | |
837 | + if (!have_cpuid_p()) | |
838 | + return; | |
839 | + | |
840 | + /* Get vendor name */ | |
841 | + cpuid(0x00000000, &c->cpuid_level, | |
842 | + (int *)&c->x86_vendor_id[0], | |
843 | + (int *)&c->x86_vendor_id[8], | |
844 | + (int *)&c->x86_vendor_id[4]); | |
845 | + | |
846 | + get_cpu_vendor(c, 1); | |
847 | + | |
848 | + c->x86 = 4; | |
849 | + if (c->cpuid_level >= 0x00000001) { | |
850 | + u32 junk, tfms, cap0, misc; | |
851 | + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); | |
852 | + c->x86 = (tfms >> 8) & 15; | |
853 | + c->x86_model = (tfms >> 4) & 15; | |
854 | + if (c->x86 == 0xf) | |
855 | + c->x86 += (tfms >> 20) & 0xff; | |
856 | + if (c->x86 >= 0x6) | |
857 | + c->x86_model += ((tfms >> 16) & 0xF) << 4; | |
858 | + c->x86_mask = tfms & 15; | |
859 | + if (cap0 & (1<<19)) | |
860 | + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; | |
861 | + } | |
862 | +} | |
863 | + | |
864 | +void __cpuinit generic_identify(struct cpuinfo_x86 * c) | |
865 | +{ | |
866 | + u32 tfms, xlvl; | |
867 | + int ebx; | |
868 | + | |
869 | + if (have_cpuid_p()) { | |
870 | + /* Get vendor name */ | |
871 | + cpuid(0x00000000, &c->cpuid_level, | |
872 | + (int *)&c->x86_vendor_id[0], | |
873 | + (int *)&c->x86_vendor_id[8], | |
874 | + (int *)&c->x86_vendor_id[4]); | |
875 | + | |
876 | + get_cpu_vendor(c, 0); | |
877 | + /* Initialize the standard set of capabilities */ | |
878 | + /* Note that the vendor-specific code below might override */ | |
879 | + | |
880 | + /* Intel-defined flags: level 0x00000001 */ | |
881 | + if ( c->cpuid_level >= 0x00000001 ) { | |
882 | + u32 capability, excap; | |
883 | + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); | |
884 | + c->x86_capability[0] = capability; | |
885 | + c->x86_capability[4] = excap; | |
886 | + c->x86 = (tfms >> 8) & 15; | |
887 | + c->x86_model = (tfms >> 4) & 15; | |
888 | + if (c->x86 == 0xf) | |
889 | + c->x86 += (tfms >> 20) & 0xff; | |
890 | + if (c->x86 >= 0x6) | |
891 | + c->x86_model += ((tfms >> 16) & 0xF) << 4; | |
892 | + c->x86_mask = tfms & 15; | |
893 | +#ifdef CONFIG_X86_HT | |
894 | + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); | |
895 | +#else | |
896 | + c->apicid = (ebx >> 24) & 0xFF; | |
897 | +#endif | |
898 | + } else { | |
899 | + /* Have CPUID level 0 only - unheard of */ | |
900 | + c->x86 = 4; | |
901 | + } | |
902 | + | |
903 | + /* AMD-defined flags: level 0x80000001 */ | |
904 | + xlvl = cpuid_eax(0x80000000); | |
905 | + if ( (xlvl & 0xffff0000) == 0x80000000 ) { | |
906 | + if ( xlvl >= 0x80000001 ) { | |
907 | + c->x86_capability[1] = cpuid_edx(0x80000001); | |
908 | + c->x86_capability[6] = cpuid_ecx(0x80000001); | |
909 | + } | |
910 | + if ( xlvl >= 0x80000004 ) | |
911 | + get_model_name(c); /* Default name */ | |
912 | + } | |
913 | + } | |
914 | + | |
915 | + early_intel_workaround(c); | |
916 | + | |
917 | +#ifdef CONFIG_X86_HT | |
918 | + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; | |
919 | +#endif | |
920 | +} | |
921 | + | |
922 | +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) | |
923 | +{ | |
924 | + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { | |
925 | + /* Disable processor serial number */ | |
926 | + unsigned long lo,hi; | |
927 | + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); | |
928 | + lo |= 0x200000; | |
929 | + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); | |
930 | + printk(KERN_NOTICE "CPU serial number disabled.\n"); | |
931 | + clear_bit(X86_FEATURE_PN, c->x86_capability); | |
932 | + | |
933 | + /* Disabling the serial number may affect the cpuid level */ | |
934 | + c->cpuid_level = cpuid_eax(0); | |
935 | + } | |
936 | +} | |
937 | + | |
938 | +static int __init x86_serial_nr_setup(char *s) | |
939 | +{ | |
940 | + disable_x86_serial_nr = 0; | |
941 | + return 1; | |
942 | +} | |
943 | +__setup("serialnumber", x86_serial_nr_setup); | |
944 | + | |
945 | + | |
946 | + | |
947 | +/* | |
948 | + * This does the hard work of actually picking apart the CPU stuff... | |
949 | + */ | |
950 | +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |
951 | +{ | |
952 | + int i; | |
953 | + | |
954 | + c->loops_per_jiffy = loops_per_jiffy; | |
955 | + c->x86_cache_size = -1; | |
956 | + c->x86_vendor = X86_VENDOR_UNKNOWN; | |
957 | + c->cpuid_level = -1; /* CPUID not detected */ | |
958 | + c->x86_model = c->x86_mask = 0; /* So far unknown... */ | |
959 | + c->x86_vendor_id[0] = '\0'; /* Unset */ | |
960 | + c->x86_model_id[0] = '\0'; /* Unset */ | |
961 | + c->x86_max_cores = 1; | |
962 | + memset(&c->x86_capability, 0, sizeof c->x86_capability); | |
963 | + | |
964 | + if (!have_cpuid_p()) { | |
965 | + /* First of all, decide if this is a 486 or higher */ | |
966 | + /* It's a 486 if we can modify the AC flag */ | |
967 | + if ( flag_is_changeable_p(X86_EFLAGS_AC) ) | |
968 | + c->x86 = 4; | |
969 | + else | |
970 | + c->x86 = 3; | |
971 | + } | |
972 | + | |
973 | + generic_identify(c); | |
974 | + | |
975 | + printk(KERN_DEBUG "CPU: After generic identify, caps:"); | |
976 | + for (i = 0; i < NCAPINTS; i++) | |
977 | + printk(" %08lx", c->x86_capability[i]); | |
978 | + printk("\n"); | |
979 | + | |
980 | + if (this_cpu->c_identify) { | |
981 | + this_cpu->c_identify(c); | |
982 | + | |
983 | + printk(KERN_DEBUG "CPU: After vendor identify, caps:"); | |
984 | + for (i = 0; i < NCAPINTS; i++) | |
985 | + printk(" %08lx", c->x86_capability[i]); | |
986 | + printk("\n"); | |
987 | + } | |
988 | + | |
989 | + /* | |
990 | + * Vendor-specific initialization. In this section we | |
991 | + * canonicalize the feature flags, meaning if there are | |
992 | + * features a certain CPU supports which CPUID doesn't | |
993 | + * tell us, CPUID claiming incorrect flags, or other bugs, | |
994 | + * we handle them here. | |
995 | + * | |
996 | + * At the end of this section, c->x86_capability better | |
997 | + * indicate the features this CPU genuinely supports! | |
998 | + */ | |
999 | + if (this_cpu->c_init) | |
1000 | + this_cpu->c_init(c); | |
1001 | + | |
1002 | + /* Disable the PN if appropriate */ | |
1003 | + squash_the_stupid_serial_number(c); | |
1004 | + | |
1005 | + /* | |
1006 | + * The vendor-specific functions might have changed features. Now | |
1007 | + * we do "generic changes." | |
1008 | + */ | |
1009 | + | |
1010 | + /* TSC disabled? */ | |
1011 | + if ( tsc_disable ) | |
1012 | + clear_bit(X86_FEATURE_TSC, c->x86_capability); | |
1013 | + | |
1014 | + /* FXSR disabled? */ | |
1015 | + if (disable_x86_fxsr) { | |
1016 | + clear_bit(X86_FEATURE_FXSR, c->x86_capability); | |
1017 | + clear_bit(X86_FEATURE_XMM, c->x86_capability); | |
1018 | + } | |
1019 | + | |
1020 | + /* SEP disabled? */ | |
1021 | + if (disable_x86_sep) | |
1022 | + clear_bit(X86_FEATURE_SEP, c->x86_capability); | |
1023 | + | |
1024 | + if (disable_pse) | |
1025 | + clear_bit(X86_FEATURE_PSE, c->x86_capability); | |
1026 | + | |
1027 | + /* If the model name is still unset, do table lookup. */ | |
1028 | + if ( !c->x86_model_id[0] ) { | |
1029 | + char *p; | |
1030 | + p = table_lookup_model(c); | |
1031 | + if ( p ) | |
1032 | + strcpy(c->x86_model_id, p); | |
1033 | + else | |
1034 | + /* Last resort... */ | |
1035 | + sprintf(c->x86_model_id, "%02x/%02x", | |
1036 | + c->x86, c->x86_model); | |
1037 | + } | |
1038 | + | |
1039 | + /* Now the feature flags better reflect actual CPU features! */ | |
1040 | + | |
1041 | + printk(KERN_DEBUG "CPU: After all inits, caps:"); | |
1042 | + for (i = 0; i < NCAPINTS; i++) | |
1043 | + printk(" %08lx", c->x86_capability[i]); | |
1044 | + printk("\n"); | |
1045 | + | |
1046 | + /* | |
1047 | + * On SMP, boot_cpu_data holds the common feature set between | |
1048 | + * all CPUs; so make sure that we indicate which features are | |
1049 | + * common between the CPUs. The first time this routine gets | |
1050 | + * executed, c == &boot_cpu_data. | |
1051 | + */ | |
1052 | + if ( c != &boot_cpu_data ) { | |
1053 | + /* AND the already accumulated flags with these */ | |
1054 | + for ( i = 0 ; i < NCAPINTS ; i++ ) | |
1055 | + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | |
1056 | + } | |
1057 | + | |
1058 | + /* Init Machine Check Exception if available. */ | |
1059 | + mcheck_init(c); | |
1060 | + | |
1061 | + if (c == &boot_cpu_data) | |
1062 | + sysenter_setup(); | |
1063 | + enable_sep_cpu(); | |
1064 | + | |
1065 | + if (c == &boot_cpu_data) | |
1066 | + mtrr_bp_init(); | |
1067 | + else | |
1068 | + mtrr_ap_init(); | |
1069 | +} | |
1070 | + | |
1071 | +#ifdef CONFIG_X86_HT | |
1072 | +void __cpuinit detect_ht(struct cpuinfo_x86 *c) | |
1073 | +{ | |
1074 | + u32 eax, ebx, ecx, edx; | |
1075 | + int index_msb, core_bits; | |
1076 | + | |
1077 | + cpuid(1, &eax, &ebx, &ecx, &edx); | |
1078 | + | |
1079 | + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) | |
1080 | + return; | |
1081 | + | |
1082 | + smp_num_siblings = (ebx & 0xff0000) >> 16; | |
1083 | + | |
1084 | + if (smp_num_siblings == 1) { | |
1085 | + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | |
1086 | + } else if (smp_num_siblings > 1 ) { | |
1087 | + | |
1088 | + if (smp_num_siblings > NR_CPUS) { | |
1089 | + printk(KERN_WARNING "CPU: Unsupported number of the " | |
1090 | + "siblings %d", smp_num_siblings); | |
1091 | + smp_num_siblings = 1; | |
1092 | + return; | |
1093 | + } | |
1094 | + | |
1095 | + index_msb = get_count_order(smp_num_siblings); | |
1096 | + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); | |
1097 | + | |
1098 | + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | |
1099 | + c->phys_proc_id); | |
1100 | + | |
1101 | + smp_num_siblings = smp_num_siblings / c->x86_max_cores; | |
1102 | + | |
1103 | + index_msb = get_count_order(smp_num_siblings) ; | |
1104 | + | |
1105 | + core_bits = get_count_order(c->x86_max_cores); | |
1106 | + | |
1107 | + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & | |
1108 | + ((1 << core_bits) - 1); | |
1109 | + | |
1110 | + if (c->x86_max_cores > 1) | |
1111 | + printk(KERN_INFO "CPU: Processor Core ID: %d\n", | |
1112 | + c->cpu_core_id); | |
1113 | + } | |
1114 | +} | |
1115 | +#endif | |
1116 | + | |
1117 | +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |
1118 | +{ | |
1119 | + char *vendor = NULL; | |
1120 | + | |
1121 | + if (c->x86_vendor < X86_VENDOR_NUM) | |
1122 | + vendor = this_cpu->c_vendor; | |
1123 | + else if (c->cpuid_level >= 0) | |
1124 | + vendor = c->x86_vendor_id; | |
1125 | + | |
1126 | + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) | |
1127 | + printk("%s ", vendor); | |
1128 | + | |
1129 | + if (!c->x86_model_id[0]) | |
1130 | + printk("%d86", c->x86); | |
1131 | + else | |
1132 | + printk("%s", c->x86_model_id); | |
1133 | + | |
1134 | + if (c->x86_mask || c->cpuid_level >= 0) | |
1135 | + printk(" stepping %02x\n", c->x86_mask); | |
1136 | + else | |
1137 | + printk("\n"); | |
1138 | +} | |
1139 | + | |
1140 | +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | |
1141 | + | |
1142 | +/* This is hacky. :) | |
1143 | + * We're emulating future behavior. | |
1144 | + * In the future, the cpu-specific init functions will be called implicitly | |
1145 | + * via the magic of initcalls. | |
1146 | + * They will insert themselves into the cpu_devs structure. | |
1147 | + * Then, when cpu_init() is called, we can just iterate over that array. | |
1148 | + */ | |
1149 | + | |
1150 | +extern int intel_cpu_init(void); | |
1151 | +extern int cyrix_init_cpu(void); | |
1152 | +extern int nsc_init_cpu(void); | |
1153 | +extern int amd_init_cpu(void); | |
1154 | +extern int centaur_init_cpu(void); | |
1155 | +extern int transmeta_init_cpu(void); | |
1156 | +extern int rise_init_cpu(void); | |
1157 | +extern int nexgen_init_cpu(void); | |
1158 | +extern int umc_init_cpu(void); | |
1159 | + | |
1160 | +void __init early_cpu_init(void) | |
1161 | +{ | |
1162 | + intel_cpu_init(); | |
1163 | + cyrix_init_cpu(); | |
1164 | + nsc_init_cpu(); | |
1165 | + amd_init_cpu(); | |
1166 | + centaur_init_cpu(); | |
1167 | + transmeta_init_cpu(); | |
1168 | + rise_init_cpu(); | |
1169 | + nexgen_init_cpu(); | |
1170 | + umc_init_cpu(); | |
1171 | + early_cpu_detect(); | |
1172 | + | |
1173 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
1174 | + /* pse is not compatible with on-the-fly unmapping, | |
1175 | + * disable it even if the cpus claim to support it. | |
1176 | + */ | |
1177 | + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); | |
1178 | + disable_pse = 1; | |
1179 | +#endif | |
1180 | +} | |
1181 | + | |
1182 | +static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr) | |
1183 | +{ | |
1184 | + unsigned long frames[16]; | |
1185 | + unsigned long va; | |
1186 | + int f; | |
1187 | + | |
1188 | + for (va = gdt_descr->address, f = 0; | |
1189 | + va < gdt_descr->address + gdt_descr->size; | |
1190 | + va += PAGE_SIZE, f++) { | |
1191 | + frames[f] = virt_to_mfn(va); | |
1192 | + make_lowmem_page_readonly( | |
1193 | + (void *)va, XENFEAT_writable_descriptor_tables); | |
1194 | + } | |
1195 | + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8)) | |
1196 | + BUG(); | |
1197 | +} | |
1198 | + | |
1199 | +/* | |
1200 | + * cpu_init() initializes state that is per-CPU. Some data is already | |
1201 | + * initialized (naturally) in the bootstrap process, such as the GDT | |
1202 | + * and IDT. We reload them nevertheless, this function acts as a | |
1203 | + * 'CPU state barrier', nothing should get across. | |
1204 | + */ | |
1205 | +void __cpuinit cpu_init(void) | |
1206 | +{ | |
1207 | + int cpu = smp_processor_id(); | |
1208 | +#ifndef CONFIG_X86_NO_TSS | |
1209 | + struct tss_struct * t = &per_cpu(init_tss, cpu); | |
1210 | +#endif | |
1211 | + struct thread_struct *thread = ¤t->thread; | |
1212 | + struct desc_struct *gdt; | |
1213 | + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); | |
1214 | + | |
1215 | + if (cpu_test_and_set(cpu, cpu_initialized)) { | |
1216 | + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); | |
1217 | + for (;;) local_irq_enable(); | |
1218 | + } | |
1219 | + printk(KERN_INFO "Initializing CPU#%d\n", cpu); | |
1220 | + | |
1221 | + if (cpu_has_vme || cpu_has_de) | |
1222 | + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | |
1223 | + if (tsc_disable && cpu_has_tsc) { | |
1224 | + printk(KERN_NOTICE "Disabling TSC...\n"); | |
1225 | + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ | |
1226 | + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); | |
1227 | + set_in_cr4(X86_CR4_TSD); | |
1228 | + } | |
1229 | + | |
1230 | +#ifndef CONFIG_XEN | |
1231 | + /* The CPU hotplug case */ | |
1232 | + if (cpu_gdt_descr->address) { | |
1233 | + gdt = (struct desc_struct *)cpu_gdt_descr->address; | |
1234 | + memset(gdt, 0, PAGE_SIZE); | |
1235 | + goto old_gdt; | |
1236 | + } | |
1237 | + /* | |
1238 | + * This is a horrible hack to allocate the GDT. The problem | |
1239 | + * is that cpu_init() is called really early for the boot CPU | |
1240 | + * (and hence needs bootmem) but much later for the secondary | |
1241 | + * CPUs, when bootmem will have gone away | |
1242 | + */ | |
1243 | + if (NODE_DATA(0)->bdata->node_bootmem_map) { | |
1244 | + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); | |
1245 | + /* alloc_bootmem_pages panics on failure, so no check */ | |
1246 | + memset(gdt, 0, PAGE_SIZE); | |
1247 | + } else { | |
1248 | + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); | |
1249 | + if (unlikely(!gdt)) { | |
1250 | + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); | |
1251 | + for (;;) | |
1252 | + local_irq_enable(); | |
1253 | + } | |
1254 | + } | |
1255 | +old_gdt: | |
1256 | + /* | |
1257 | + * Initialize the per-CPU GDT with the boot GDT, | |
1258 | + * and set up the GDT descriptor: | |
1259 | + */ | |
1260 | + memcpy(gdt, cpu_gdt_table, GDT_SIZE); | |
1261 | + | |
1262 | + /* Set up GDT entry for 16bit stack */ | |
1263 | + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= | |
1264 | + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | | |
1265 | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | | |
1266 | + (CPU_16BIT_STACK_SIZE - 1); | |
1267 | + | |
1268 | + cpu_gdt_descr->size = GDT_SIZE - 1; | |
1269 | + cpu_gdt_descr->address = (unsigned long)gdt; | |
1270 | +#else | |
1271 | + if (cpu == 0 && cpu_gdt_descr->address == 0) { | |
1272 | + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); | |
1273 | + /* alloc_bootmem_pages panics on failure, so no check */ | |
1274 | + memset(gdt, 0, PAGE_SIZE); | |
1275 | + | |
1276 | + memcpy(gdt, cpu_gdt_table, GDT_SIZE); | |
1277 | + | |
1278 | + cpu_gdt_descr->size = GDT_SIZE; | |
1279 | + cpu_gdt_descr->address = (unsigned long)gdt; | |
1280 | + } | |
1281 | +#endif | |
1282 | + | |
1283 | + cpu_gdt_init(cpu_gdt_descr); | |
1284 | + | |
1285 | + /* | |
1286 | + * Set up and load the per-CPU TSS and LDT | |
1287 | + */ | |
1288 | + atomic_inc(&init_mm.mm_count); | |
1289 | + current->active_mm = &init_mm; | |
1290 | + if (current->mm) | |
1291 | + BUG(); | |
1292 | + enter_lazy_tlb(&init_mm, current); | |
1293 | + | |
1294 | + load_esp0(t, thread); | |
1295 | + | |
1296 | + load_LDT(&init_mm.context); | |
1297 | + | |
1298 | +#ifdef CONFIG_DOUBLEFAULT | |
1299 | + /* Set up doublefault TSS pointer in the GDT */ | |
1300 | + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); | |
1301 | +#endif | |
1302 | + | |
1303 | + /* Clear %fs and %gs. */ | |
1304 | + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); | |
1305 | + | |
1306 | + /* Clear all 6 debug registers: */ | |
1307 | + set_debugreg(0, 0); | |
1308 | + set_debugreg(0, 1); | |
1309 | + set_debugreg(0, 2); | |
1310 | + set_debugreg(0, 3); | |
1311 | + set_debugreg(0, 6); | |
1312 | + set_debugreg(0, 7); | |
1313 | + | |
1314 | + /* | |
1315 | + * Force FPU initialization: | |
1316 | + */ | |
1317 | + current_thread_info()->status = 0; | |
1318 | + clear_used_math(); | |
1319 | + mxcsr_feature_mask_init(); | |
1320 | +} | |
1321 | + | |
1322 | +#ifdef CONFIG_HOTPLUG_CPU | |
1323 | +void __cpuinit cpu_uninit(void) | |
1324 | +{ | |
1325 | + int cpu = raw_smp_processor_id(); | |
1326 | + cpu_clear(cpu, cpu_initialized); | |
1327 | + | |
1328 | + /* lazy TLB state */ | |
1329 | + per_cpu(cpu_tlbstate, cpu).state = 0; | |
1330 | + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; | |
1331 | +} | |
1332 | +#endif | |
1333 | Index: head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c | |
1334 | =================================================================== | |
1335 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
1336 | +++ head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100 | |
1337 | @@ -0,0 +1,198 @@ | |
1338 | +#include <linux/init.h> | |
1339 | +#include <linux/proc_fs.h> | |
1340 | +#include <linux/ctype.h> | |
1341 | +#include <linux/module.h> | |
1342 | +#include <linux/seq_file.h> | |
1343 | +#include <asm/uaccess.h> | |
1344 | +#include <linux/mutex.h> | |
1345 | + | |
1346 | +#include <asm/mtrr.h> | |
1347 | +#include "mtrr.h" | |
1348 | + | |
1349 | +static DEFINE_MUTEX(mtrr_mutex); | |
1350 | + | |
1351 | +void generic_get_mtrr(unsigned int reg, unsigned long *base, | |
1352 | + unsigned int *size, mtrr_type * type) | |
1353 | +{ | |
1354 | + struct xen_platform_op op; | |
1355 | + | |
1356 | + op.cmd = XENPF_read_memtype; | |
1357 | + op.u.read_memtype.reg = reg; | |
1358 | + if (unlikely(HYPERVISOR_platform_op(&op))) | |
1359 | + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); | |
1360 | + | |
1361 | + *size = op.u.read_memtype.nr_mfns; | |
1362 | + *base = op.u.read_memtype.mfn; | |
1363 | + *type = op.u.read_memtype.type; | |
1364 | +} | |
1365 | + | |
1366 | +struct mtrr_ops generic_mtrr_ops = { | |
1367 | + .use_intel_if = 1, | |
1368 | + .get = generic_get_mtrr, | |
1369 | +}; | |
1370 | + | |
1371 | +struct mtrr_ops *mtrr_if = &generic_mtrr_ops; | |
1372 | +unsigned int num_var_ranges; | |
1373 | +unsigned int *usage_table; | |
1374 | + | |
1375 | +static void __init set_num_var_ranges(void) | |
1376 | +{ | |
1377 | + struct xen_platform_op op; | |
1378 | + | |
1379 | + for (num_var_ranges = 0; ; num_var_ranges++) { | |
1380 | + op.cmd = XENPF_read_memtype; | |
1381 | + op.u.read_memtype.reg = num_var_ranges; | |
1382 | + if (HYPERVISOR_platform_op(&op) != 0) | |
1383 | + break; | |
1384 | + } | |
1385 | +} | |
1386 | + | |
1387 | +static void __init init_table(void) | |
1388 | +{ | |
1389 | + int i, max; | |
1390 | + | |
1391 | + max = num_var_ranges; | |
1392 | + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) | |
1393 | + == NULL) { | |
1394 | + printk(KERN_ERR "mtrr: could not allocate\n"); | |
1395 | + return; | |
1396 | + } | |
1397 | + for (i = 0; i < max; i++) | |
1398 | + usage_table[i] = 0; | |
1399 | +} | |
1400 | + | |
1401 | +int mtrr_add_page(unsigned long base, unsigned long size, | |
1402 | + unsigned int type, char increment) | |
1403 | +{ | |
1404 | + int error; | |
1405 | + struct xen_platform_op op; | |
1406 | + | |
1407 | + mutex_lock(&mtrr_mutex); | |
1408 | + | |
1409 | + op.cmd = XENPF_add_memtype; | |
1410 | + op.u.add_memtype.mfn = base; | |
1411 | + op.u.add_memtype.nr_mfns = size; | |
1412 | + op.u.add_memtype.type = type; | |
1413 | + error = HYPERVISOR_platform_op(&op); | |
1414 | + if (error) { | |
1415 | + mutex_unlock(&mtrr_mutex); | |
1416 | + BUG_ON(error > 0); | |
1417 | + return error; | |
1418 | + } | |
1419 | + | |
1420 | + if (increment) | |
1421 | + ++usage_table[op.u.add_memtype.reg]; | |
1422 | + | |
1423 | + mutex_unlock(&mtrr_mutex); | |
1424 | + | |
1425 | + return op.u.add_memtype.reg; | |
1426 | +} | |
1427 | + | |
1428 | +static int mtrr_check(unsigned long base, unsigned long size) | |
1429 | +{ | |
1430 | + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { | |
1431 | + printk(KERN_WARNING | |
1432 | + "mtrr: size and base must be multiples of 4 kiB\n"); | |
1433 | + printk(KERN_DEBUG | |
1434 | + "mtrr: size: 0x%lx base: 0x%lx\n", size, base); | |
1435 | + dump_stack(); | |
1436 | + return -1; | |
1437 | + } | |
1438 | + return 0; | |
1439 | +} | |
1440 | + | |
1441 | +int | |
1442 | +mtrr_add(unsigned long base, unsigned long size, unsigned int type, | |
1443 | + char increment) | |
1444 | +{ | |
1445 | + if (mtrr_check(base, size)) | |
1446 | + return -EINVAL; | |
1447 | + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, | |
1448 | + increment); | |
1449 | +} | |
1450 | + | |
1451 | +int mtrr_del_page(int reg, unsigned long base, unsigned long size) | |
1452 | +{ | |
1453 | + unsigned i; | |
1454 | + mtrr_type ltype; | |
1455 | + unsigned long lbase; | |
1456 | + unsigned int lsize; | |
1457 | + int error = -EINVAL; | |
1458 | + struct xen_platform_op op; | |
1459 | + | |
1460 | + mutex_lock(&mtrr_mutex); | |
1461 | + | |
1462 | + if (reg < 0) { | |
1463 | + /* Search for existing MTRR */ | |
1464 | + for (i = 0; i < num_var_ranges; ++i) { | |
1465 | + mtrr_if->get(i, &lbase, &lsize, <ype); | |
1466 | + if (lbase == base && lsize == size) { | |
1467 | + reg = i; | |
1468 | + break; | |
1469 | + } | |
1470 | + } | |
1471 | + if (reg < 0) { | |
1472 | + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, | |
1473 | + size); | |
1474 | + goto out; | |
1475 | + } | |
1476 | + } | |
1477 | + if (usage_table[reg] < 1) { | |
1478 | + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); | |
1479 | + goto out; | |
1480 | + } | |
1481 | + if (--usage_table[reg] < 1) { | |
1482 | + op.cmd = XENPF_del_memtype; | |
1483 | + op.u.del_memtype.handle = 0; | |
1484 | + op.u.del_memtype.reg = reg; | |
1485 | + error = HYPERVISOR_platform_op(&op); | |
1486 | + if (error) { | |
1487 | + BUG_ON(error > 0); | |
1488 | + goto out; | |
1489 | + } | |
1490 | + } | |
1491 | + error = reg; | |
1492 | + out: | |
1493 | + mutex_unlock(&mtrr_mutex); | |
1494 | + return error; | |
1495 | +} | |
1496 | + | |
1497 | +int | |
1498 | +mtrr_del(int reg, unsigned long base, unsigned long size) | |
1499 | +{ | |
1500 | + if (mtrr_check(base, size)) | |
1501 | + return -EINVAL; | |
1502 | + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); | |
1503 | +} | |
1504 | + | |
1505 | +EXPORT_SYMBOL(mtrr_add); | |
1506 | +EXPORT_SYMBOL(mtrr_del); | |
1507 | + | |
1508 | +void __init mtrr_bp_init(void) | |
1509 | +{ | |
1510 | +} | |
1511 | + | |
1512 | +void mtrr_ap_init(void) | |
1513 | +{ | |
1514 | +} | |
1515 | + | |
1516 | +static int __init mtrr_init(void) | |
1517 | +{ | |
1518 | + struct cpuinfo_x86 *c = &boot_cpu_data; | |
1519 | + | |
1520 | + if (!is_initial_xendomain()) | |
1521 | + return -ENODEV; | |
1522 | + | |
1523 | + if ((!cpu_has(c, X86_FEATURE_MTRR)) && | |
1524 | + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && | |
1525 | + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && | |
1526 | + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) | |
1527 | + return -ENODEV; | |
1528 | + | |
1529 | + set_num_var_ranges(); | |
1530 | + init_table(); | |
1531 | + | |
1532 | + return 0; | |
1533 | +} | |
1534 | + | |
1535 | +subsys_initcall(mtrr_init); | |
1536 | Index: head-2008-11-25/arch/x86/kernel/entry_32-xen.S | |
1537 | =================================================================== | |
1538 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
1539 | +++ head-2008-11-25/arch/x86/kernel/entry_32-xen.S 2007-12-10 08:47:31.000000000 +0100 | |
1540 | @@ -0,0 +1,1238 @@ | |
1541 | +/* | |
1542 | + * linux/arch/i386/entry.S | |
1543 | + * | |
1544 | + * Copyright (C) 1991, 1992 Linus Torvalds | |
1545 | + */ | |
1546 | + | |
1547 | +/* | |
1548 | + * entry.S contains the system-call and fault low-level handling routines. | |
1549 | + * This also contains the timer-interrupt handler, as well as all interrupts | |
1550 | + * and faults that can result in a task-switch. | |
1551 | + * | |
1552 | + * NOTE: This code handles signal-recognition, which happens every time | |
1553 | + * after a timer-interrupt and after each system call. | |
1554 | + * | |
1555 | + * I changed all the .align's to 4 (16 byte alignment), as that's faster | |
1556 | + * on a 486. | |
1557 | + * | |
1558 | + * Stack layout in 'ret_from_system_call': | |
1559 | + * ptrace needs to have all regs on the stack. | |
1560 | + * if the order here is changed, it needs to be | |
1561 | + * updated in fork.c:copy_process, signal.c:do_signal, | |
1562 | + * ptrace.c and ptrace.h | |
1563 | + * | |
1564 | + * 0(%esp) - %ebx | |
1565 | + * 4(%esp) - %ecx | |
1566 | + * 8(%esp) - %edx | |
1567 | + * C(%esp) - %esi | |
1568 | + * 10(%esp) - %edi | |
1569 | + * 14(%esp) - %ebp | |
1570 | + * 18(%esp) - %eax | |
1571 | + * 1C(%esp) - %ds | |
1572 | + * 20(%esp) - %es | |
1573 | + * 24(%esp) - orig_eax | |
1574 | + * 28(%esp) - %eip | |
1575 | + * 2C(%esp) - %cs | |
1576 | + * 30(%esp) - %eflags | |
1577 | + * 34(%esp) - %oldesp | |
1578 | + * 38(%esp) - %oldss | |
1579 | + * | |
1580 | + * "current" is in register %ebx during any slow entries. | |
1581 | + */ | |
1582 | + | |
1583 | +#include <linux/linkage.h> | |
1584 | +#include <asm/thread_info.h> | |
1585 | +#include <asm/irqflags.h> | |
1586 | +#include <asm/errno.h> | |
1587 | +#include <asm/segment.h> | |
1588 | +#include <asm/smp.h> | |
1589 | +#include <asm/page.h> | |
1590 | +#include <asm/desc.h> | |
1591 | +#include <asm/dwarf2.h> | |
1592 | +#include "irq_vectors.h" | |
1593 | +#include <xen/interface/xen.h> | |
1594 | + | |
1595 | +#define nr_syscalls ((syscall_table_size)/4) | |
1596 | + | |
1597 | +EBX = 0x00 | |
1598 | +ECX = 0x04 | |
1599 | +EDX = 0x08 | |
1600 | +ESI = 0x0C | |
1601 | +EDI = 0x10 | |
1602 | +EBP = 0x14 | |
1603 | +EAX = 0x18 | |
1604 | +DS = 0x1C | |
1605 | +ES = 0x20 | |
1606 | +ORIG_EAX = 0x24 | |
1607 | +EIP = 0x28 | |
1608 | +CS = 0x2C | |
1609 | +EFLAGS = 0x30 | |
1610 | +OLDESP = 0x34 | |
1611 | +OLDSS = 0x38 | |
1612 | + | |
1613 | +CF_MASK = 0x00000001 | |
1614 | +TF_MASK = 0x00000100 | |
1615 | +IF_MASK = 0x00000200 | |
1616 | +DF_MASK = 0x00000400 | |
1617 | +NT_MASK = 0x00004000 | |
1618 | +VM_MASK = 0x00020000 | |
1619 | +/* Pseudo-eflags. */ | |
1620 | +NMI_MASK = 0x80000000 | |
1621 | + | |
1622 | +#ifndef CONFIG_XEN | |
1623 | +#define DISABLE_INTERRUPTS cli | |
1624 | +#define ENABLE_INTERRUPTS sti | |
1625 | +#else | |
1626 | +/* Offsets into shared_info_t. */ | |
1627 | +#define evtchn_upcall_pending /* 0 */ | |
1628 | +#define evtchn_upcall_mask 1 | |
1629 | + | |
1630 | +#define sizeof_vcpu_shift 6 | |
1631 | + | |
1632 | +#ifdef CONFIG_SMP | |
1633 | +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ | |
1634 | + shl $sizeof_vcpu_shift,%esi ; \ | |
1635 | + addl HYPERVISOR_shared_info,%esi | |
1636 | +#else | |
1637 | +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi | |
1638 | +#endif | |
1639 | + | |
1640 | +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) | |
1641 | +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) | |
1642 | +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ | |
1643 | + __DISABLE_INTERRUPTS | |
1644 | +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ | |
1645 | + __ENABLE_INTERRUPTS | |
1646 | +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) | |
1647 | +#endif | |
1648 | + | |
1649 | +#ifdef CONFIG_PREEMPT | |
1650 | +#define preempt_stop cli; TRACE_IRQS_OFF | |
1651 | +#else | |
1652 | +#define preempt_stop | |
1653 | +#define resume_kernel restore_nocheck | |
1654 | +#endif | |
1655 | + | |
1656 | +.macro TRACE_IRQS_IRET | |
1657 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
1658 | + testl $IF_MASK,EFLAGS(%esp) # interrupts off? | |
1659 | + jz 1f | |
1660 | + TRACE_IRQS_ON | |
1661 | +1: | |
1662 | +#endif | |
1663 | +.endm | |
1664 | + | |
1665 | +#ifdef CONFIG_VM86 | |
1666 | +#define resume_userspace_sig check_userspace | |
1667 | +#else | |
1668 | +#define resume_userspace_sig resume_userspace | |
1669 | +#endif | |
1670 | + | |
1671 | +#define SAVE_ALL \ | |
1672 | + cld; \ | |
1673 | + pushl %es; \ | |
1674 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1675 | + /*CFI_REL_OFFSET es, 0;*/\ | |
1676 | + pushl %ds; \ | |
1677 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1678 | + /*CFI_REL_OFFSET ds, 0;*/\ | |
1679 | + pushl %eax; \ | |
1680 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1681 | + CFI_REL_OFFSET eax, 0;\ | |
1682 | + pushl %ebp; \ | |
1683 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1684 | + CFI_REL_OFFSET ebp, 0;\ | |
1685 | + pushl %edi; \ | |
1686 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1687 | + CFI_REL_OFFSET edi, 0;\ | |
1688 | + pushl %esi; \ | |
1689 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1690 | + CFI_REL_OFFSET esi, 0;\ | |
1691 | + pushl %edx; \ | |
1692 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1693 | + CFI_REL_OFFSET edx, 0;\ | |
1694 | + pushl %ecx; \ | |
1695 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1696 | + CFI_REL_OFFSET ecx, 0;\ | |
1697 | + pushl %ebx; \ | |
1698 | + CFI_ADJUST_CFA_OFFSET 4;\ | |
1699 | + CFI_REL_OFFSET ebx, 0;\ | |
1700 | + movl $(__USER_DS), %edx; \ | |
1701 | + movl %edx, %ds; \ | |
1702 | + movl %edx, %es; | |
1703 | + | |
1704 | +#define RESTORE_INT_REGS \ | |
1705 | + popl %ebx; \ | |
1706 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1707 | + CFI_RESTORE ebx;\ | |
1708 | + popl %ecx; \ | |
1709 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1710 | + CFI_RESTORE ecx;\ | |
1711 | + popl %edx; \ | |
1712 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1713 | + CFI_RESTORE edx;\ | |
1714 | + popl %esi; \ | |
1715 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1716 | + CFI_RESTORE esi;\ | |
1717 | + popl %edi; \ | |
1718 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1719 | + CFI_RESTORE edi;\ | |
1720 | + popl %ebp; \ | |
1721 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1722 | + CFI_RESTORE ebp;\ | |
1723 | + popl %eax; \ | |
1724 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1725 | + CFI_RESTORE eax | |
1726 | + | |
1727 | +#define RESTORE_REGS \ | |
1728 | + RESTORE_INT_REGS; \ | |
1729 | +1: popl %ds; \ | |
1730 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1731 | + /*CFI_RESTORE ds;*/\ | |
1732 | +2: popl %es; \ | |
1733 | + CFI_ADJUST_CFA_OFFSET -4;\ | |
1734 | + /*CFI_RESTORE es;*/\ | |
1735 | +.section .fixup,"ax"; \ | |
1736 | +3: movl $0,(%esp); \ | |
1737 | + jmp 1b; \ | |
1738 | +4: movl $0,(%esp); \ | |
1739 | + jmp 2b; \ | |
1740 | +.previous; \ | |
1741 | +.section __ex_table,"a";\ | |
1742 | + .align 4; \ | |
1743 | + .long 1b,3b; \ | |
1744 | + .long 2b,4b; \ | |
1745 | +.previous | |
1746 | + | |
1747 | +#define RING0_INT_FRAME \ | |
1748 | + CFI_STARTPROC simple;\ | |
1749 | + CFI_DEF_CFA esp, 3*4;\ | |
1750 | + /*CFI_OFFSET cs, -2*4;*/\ | |
1751 | + CFI_OFFSET eip, -3*4 | |
1752 | + | |
1753 | +#define RING0_EC_FRAME \ | |
1754 | + CFI_STARTPROC simple;\ | |
1755 | + CFI_DEF_CFA esp, 4*4;\ | |
1756 | + /*CFI_OFFSET cs, -2*4;*/\ | |
1757 | + CFI_OFFSET eip, -3*4 | |
1758 | + | |
1759 | +#define RING0_PTREGS_FRAME \ | |
1760 | + CFI_STARTPROC simple;\ | |
1761 | + CFI_DEF_CFA esp, OLDESP-EBX;\ | |
1762 | + /*CFI_OFFSET cs, CS-OLDESP;*/\ | |
1763 | + CFI_OFFSET eip, EIP-OLDESP;\ | |
1764 | + /*CFI_OFFSET es, ES-OLDESP;*/\ | |
1765 | + /*CFI_OFFSET ds, DS-OLDESP;*/\ | |
1766 | + CFI_OFFSET eax, EAX-OLDESP;\ | |
1767 | + CFI_OFFSET ebp, EBP-OLDESP;\ | |
1768 | + CFI_OFFSET edi, EDI-OLDESP;\ | |
1769 | + CFI_OFFSET esi, ESI-OLDESP;\ | |
1770 | + CFI_OFFSET edx, EDX-OLDESP;\ | |
1771 | + CFI_OFFSET ecx, ECX-OLDESP;\ | |
1772 | + CFI_OFFSET ebx, EBX-OLDESP | |
1773 | + | |
1774 | +ENTRY(ret_from_fork) | |
1775 | + CFI_STARTPROC | |
1776 | + pushl %eax | |
1777 | + CFI_ADJUST_CFA_OFFSET 4 | |
1778 | + call schedule_tail | |
1779 | + GET_THREAD_INFO(%ebp) | |
1780 | + popl %eax | |
1781 | + CFI_ADJUST_CFA_OFFSET -4 | |
1782 | + pushl $0x0202 # Reset kernel eflags | |
1783 | + CFI_ADJUST_CFA_OFFSET 4 | |
1784 | + popfl | |
1785 | + CFI_ADJUST_CFA_OFFSET -4 | |
1786 | + jmp syscall_exit | |
1787 | + CFI_ENDPROC | |
1788 | + | |
1789 | +/* | |
1790 | + * Return to user mode is not as complex as all this looks, | |
1791 | + * but we want the default path for a system call return to | |
1792 | + * go as quickly as possible which is why some of this is | |
1793 | + * less clear than it otherwise should be. | |
1794 | + */ | |
1795 | + | |
1796 | + # userspace resumption stub bypassing syscall exit tracing | |
1797 | + ALIGN | |
1798 | + RING0_PTREGS_FRAME | |
1799 | +ret_from_exception: | |
1800 | + preempt_stop | |
1801 | +ret_from_intr: | |
1802 | + GET_THREAD_INFO(%ebp) | |
1803 | +check_userspace: | |
1804 | + movl EFLAGS(%esp), %eax # mix EFLAGS and CS | |
1805 | + movb CS(%esp), %al | |
1806 | + testl $(VM_MASK | 2), %eax | |
1807 | + jz resume_kernel | |
1808 | +ENTRY(resume_userspace) | |
1809 | + DISABLE_INTERRUPTS # make sure we don't miss an interrupt | |
1810 | + # setting need_resched or sigpending | |
1811 | + # between sampling and the iret | |
1812 | + movl TI_flags(%ebp), %ecx | |
1813 | + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on | |
1814 | + # int/exception return? | |
1815 | + jne work_pending | |
1816 | + jmp restore_all | |
1817 | + | |
1818 | +#ifdef CONFIG_PREEMPT | |
1819 | +ENTRY(resume_kernel) | |
1820 | + cli | |
1821 | + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? | |
1822 | + jnz restore_nocheck | |
1823 | +need_resched: | |
1824 | + movl TI_flags(%ebp), %ecx # need_resched set ? | |
1825 | + testb $_TIF_NEED_RESCHED, %cl | |
1826 | + jz restore_all | |
1827 | + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? | |
1828 | + jz restore_all | |
1829 | + call preempt_schedule_irq | |
1830 | + jmp need_resched | |
1831 | +#endif | |
1832 | + CFI_ENDPROC | |
1833 | + | |
1834 | +/* SYSENTER_RETURN points to after the "sysenter" instruction in | |
1835 | + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ | |
1836 | + | |
1837 | + # sysenter call handler stub | |
1838 | +ENTRY(sysenter_entry) | |
1839 | + CFI_STARTPROC simple | |
1840 | + CFI_DEF_CFA esp, 0 | |
1841 | + CFI_REGISTER esp, ebp | |
1842 | + movl SYSENTER_stack_esp0(%esp),%esp | |
1843 | +sysenter_past_esp: | |
1844 | + /* | |
1845 | + * No need to follow this irqs on/off section: the syscall | |
1846 | + * disabled irqs and here we enable it straight after entry: | |
1847 | + */ | |
1848 | + sti | |
1849 | + pushl $(__USER_DS) | |
1850 | + CFI_ADJUST_CFA_OFFSET 4 | |
1851 | + /*CFI_REL_OFFSET ss, 0*/ | |
1852 | + pushl %ebp | |
1853 | + CFI_ADJUST_CFA_OFFSET 4 | |
1854 | + CFI_REL_OFFSET esp, 0 | |
1855 | + pushfl | |
1856 | + CFI_ADJUST_CFA_OFFSET 4 | |
1857 | + pushl $(__USER_CS) | |
1858 | + CFI_ADJUST_CFA_OFFSET 4 | |
1859 | + /*CFI_REL_OFFSET cs, 0*/ | |
1860 | + /* | |
1861 | + * Push current_thread_info()->sysenter_return to the stack. | |
1862 | + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | |
1863 | + * pushed above; +8 corresponds to copy_thread's esp0 setting. | |
1864 | + */ | |
1865 | + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) | |
1866 | + CFI_ADJUST_CFA_OFFSET 4 | |
1867 | + CFI_REL_OFFSET eip, 0 | |
1868 | + | |
1869 | +/* | |
1870 | + * Load the potential sixth argument from user stack. | |
1871 | + * Careful about security. | |
1872 | + */ | |
1873 | + cmpl $__PAGE_OFFSET-3,%ebp | |
1874 | + jae syscall_fault | |
1875 | +1: movl (%ebp),%ebp | |
1876 | +.section __ex_table,"a" | |
1877 | + .align 4 | |
1878 | + .long 1b,syscall_fault | |
1879 | +.previous | |
1880 | + | |
1881 | + pushl %eax | |
1882 | + CFI_ADJUST_CFA_OFFSET 4 | |
1883 | + SAVE_ALL | |
1884 | + GET_THREAD_INFO(%ebp) | |
1885 | + | |
1886 | + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | |
1887 | + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | |
1888 | + jnz syscall_trace_entry | |
1889 | + cmpl $(nr_syscalls), %eax | |
1890 | + jae syscall_badsys | |
1891 | + call *sys_call_table(,%eax,4) | |
1892 | + movl %eax,EAX(%esp) | |
1893 | + DISABLE_INTERRUPTS | |
1894 | + TRACE_IRQS_OFF | |
1895 | + movl TI_flags(%ebp), %ecx | |
1896 | + testw $_TIF_ALLWORK_MASK, %cx | |
1897 | + jne syscall_exit_work | |
1898 | +/* if something modifies registers it must also disable sysexit */ | |
1899 | + movl EIP(%esp), %edx | |
1900 | + movl OLDESP(%esp), %ecx | |
1901 | + xorl %ebp,%ebp | |
1902 | +#ifdef CONFIG_XEN | |
1903 | + TRACE_IRQS_ON | |
1904 | + __ENABLE_INTERRUPTS | |
1905 | +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ | |
1906 | + __TEST_PENDING | |
1907 | + jnz 14f # process more events if necessary... | |
1908 | + movl ESI(%esp), %esi | |
1909 | + sysexit | |
1910 | +14: __DISABLE_INTERRUPTS | |
1911 | + TRACE_IRQS_OFF | |
1912 | +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ | |
1913 | + push %esp | |
1914 | + call evtchn_do_upcall | |
1915 | + add $4,%esp | |
1916 | + jmp ret_from_intr | |
1917 | +#else | |
1918 | + TRACE_IRQS_ON | |
1919 | + sti | |
1920 | + sysexit | |
1921 | +#endif /* !CONFIG_XEN */ | |
1922 | + CFI_ENDPROC | |
1923 | + | |
1924 | + # pv sysenter call handler stub | |
1925 | +ENTRY(sysenter_entry_pv) | |
1926 | + RING0_INT_FRAME | |
1927 | + movl $__USER_DS,16(%esp) | |
1928 | + movl %ebp,12(%esp) | |
1929 | + movl $__USER_CS,4(%esp) | |
1930 | + addl $4,%esp | |
1931 | + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ | |
1932 | + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) | |
1933 | +/* | |
1934 | + * Load the potential sixth argument from user stack. | |
1935 | + * Careful about security. | |
1936 | + */ | |
1937 | + cmpl $__PAGE_OFFSET-3,%ebp | |
1938 | + jae syscall_fault | |
1939 | +1: movl (%ebp),%ebp | |
1940 | +.section __ex_table,"a" | |
1941 | + .align 4 | |
1942 | + .long 1b,syscall_fault | |
1943 | +.previous | |
1944 | + /* fall through */ | |
1945 | + CFI_ENDPROC | |
1946 | +ENDPROC(sysenter_entry_pv) | |
1947 | + | |
1948 | + # system call handler stub | |
1949 | +ENTRY(system_call) | |
1950 | + RING0_INT_FRAME # can't unwind into user space anyway | |
1951 | + pushl %eax # save orig_eax | |
1952 | + CFI_ADJUST_CFA_OFFSET 4 | |
1953 | + SAVE_ALL | |
1954 | + GET_THREAD_INFO(%ebp) | |
1955 | + testl $TF_MASK,EFLAGS(%esp) | |
1956 | + jz no_singlestep | |
1957 | + orl $_TIF_SINGLESTEP,TI_flags(%ebp) | |
1958 | +no_singlestep: | |
1959 | + # system call tracing in operation / emulation | |
1960 | + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | |
1961 | + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | |
1962 | + jnz syscall_trace_entry | |
1963 | + cmpl $(nr_syscalls), %eax | |
1964 | + jae syscall_badsys | |
1965 | +syscall_call: | |
1966 | + call *sys_call_table(,%eax,4) | |
1967 | + movl %eax,EAX(%esp) # store the return value | |
1968 | +syscall_exit: | |
1969 | + DISABLE_INTERRUPTS # make sure we don't miss an interrupt | |
1970 | + # setting need_resched or sigpending | |
1971 | + # between sampling and the iret | |
1972 | + TRACE_IRQS_OFF | |
1973 | + movl TI_flags(%ebp), %ecx | |
1974 | + testw $_TIF_ALLWORK_MASK, %cx # current->work | |
1975 | + jne syscall_exit_work | |
1976 | + | |
1977 | +restore_all: | |
1978 | +#ifndef CONFIG_XEN | |
1979 | + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS | |
1980 | + # Warning: OLDSS(%esp) contains the wrong/random values if we | |
1981 | + # are returning to the kernel. | |
1982 | + # See comments in process.c:copy_thread() for details. | |
1983 | + movb OLDSS(%esp), %ah | |
1984 | + movb CS(%esp), %al | |
1985 | + andl $(VM_MASK | (4 << 8) | 3), %eax | |
1986 | + cmpl $((4 << 8) | 3), %eax | |
1987 | + CFI_REMEMBER_STATE | |
1988 | + je ldt_ss # returning to user-space with LDT SS | |
1989 | +restore_nocheck: | |
1990 | +#else | |
1991 | +restore_nocheck: | |
1992 | + movl EFLAGS(%esp), %eax | |
1993 | + testl $(VM_MASK|NMI_MASK), %eax | |
1994 | + CFI_REMEMBER_STATE | |
1995 | + jnz hypervisor_iret | |
1996 | + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF | |
1997 | + GET_VCPU_INFO | |
1998 | + andb evtchn_upcall_mask(%esi),%al | |
1999 | + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask | |
2000 | + CFI_REMEMBER_STATE | |
2001 | + jnz restore_all_enable_events # != 0 => enable event delivery | |
2002 | +#endif | |
2003 | + TRACE_IRQS_IRET | |
2004 | +restore_nocheck_notrace: | |
2005 | + RESTORE_REGS | |
2006 | + addl $4, %esp | |
2007 | + CFI_ADJUST_CFA_OFFSET -4 | |
2008 | +1: iret | |
2009 | +.section .fixup,"ax" | |
2010 | +iret_exc: | |
2011 | +#ifndef CONFIG_XEN | |
2012 | + TRACE_IRQS_ON | |
2013 | + sti | |
2014 | +#endif | |
2015 | + pushl $0 # no error code | |
2016 | + pushl $do_iret_error | |
2017 | + jmp error_code | |
2018 | +.previous | |
2019 | +.section __ex_table,"a" | |
2020 | + .align 4 | |
2021 | + .long 1b,iret_exc | |
2022 | +.previous | |
2023 | + | |
2024 | + CFI_RESTORE_STATE | |
2025 | +#ifndef CONFIG_XEN | |
2026 | +ldt_ss: | |
2027 | + larl OLDSS(%esp), %eax | |
2028 | + jnz restore_nocheck | |
2029 | + testl $0x00400000, %eax # returning to 32bit stack? | |
2030 | + jnz restore_nocheck # allright, normal return | |
2031 | + /* If returning to userspace with 16bit stack, | |
2032 | + * try to fix the higher word of ESP, as the CPU | |
2033 | + * won't restore it. | |
2034 | + * This is an "official" bug of all the x86-compatible | |
2035 | + * CPUs, which we can try to work around to make | |
2036 | + * dosemu and wine happy. */ | |
2037 | + subl $8, %esp # reserve space for switch16 pointer | |
2038 | + CFI_ADJUST_CFA_OFFSET 8 | |
2039 | + cli | |
2040 | + TRACE_IRQS_OFF | |
2041 | + movl %esp, %eax | |
2042 | + /* Set up the 16bit stack frame with switch32 pointer on top, | |
2043 | + * and a switch16 pointer on top of the current frame. */ | |
2044 | + call setup_x86_bogus_stack | |
2045 | + CFI_ADJUST_CFA_OFFSET -8 # frame has moved | |
2046 | + TRACE_IRQS_IRET | |
2047 | + RESTORE_REGS | |
2048 | + lss 20+4(%esp), %esp # switch to 16bit stack | |
2049 | +1: iret | |
2050 | +.section __ex_table,"a" | |
2051 | + .align 4 | |
2052 | + .long 1b,iret_exc | |
2053 | +.previous | |
2054 | +#else | |
2055 | + ALIGN | |
2056 | +restore_all_enable_events: | |
2057 | + TRACE_IRQS_ON | |
2058 | + __ENABLE_INTERRUPTS | |
2059 | +scrit: /**** START OF CRITICAL REGION ****/ | |
2060 | + __TEST_PENDING | |
2061 | + jnz 14f # process more events if necessary... | |
2062 | + RESTORE_REGS | |
2063 | + addl $4, %esp | |
2064 | + CFI_ADJUST_CFA_OFFSET -4 | |
2065 | +1: iret | |
2066 | +.section __ex_table,"a" | |
2067 | + .align 4 | |
2068 | + .long 1b,iret_exc | |
2069 | +.previous | |
2070 | +14: __DISABLE_INTERRUPTS | |
2071 | + TRACE_IRQS_OFF | |
2072 | + jmp 11f | |
2073 | +ecrit: /**** END OF CRITICAL REGION ****/ | |
2074 | + | |
2075 | + CFI_RESTORE_STATE | |
2076 | +hypervisor_iret: | |
2077 | + andl $~NMI_MASK, EFLAGS(%esp) | |
2078 | + RESTORE_REGS | |
2079 | + addl $4, %esp | |
2080 | + CFI_ADJUST_CFA_OFFSET -4 | |
2081 | + jmp hypercall_page + (__HYPERVISOR_iret * 32) | |
2082 | +#endif | |
2083 | + CFI_ENDPROC | |
2084 | + | |
2085 | + # perform work that needs to be done immediately before resumption | |
2086 | + ALIGN | |
2087 | + RING0_PTREGS_FRAME # can't unwind into user space anyway | |
2088 | +work_pending: | |
2089 | + testb $_TIF_NEED_RESCHED, %cl | |
2090 | + jz work_notifysig | |
2091 | +work_resched: | |
2092 | + call schedule | |
2093 | + DISABLE_INTERRUPTS # make sure we don't miss an interrupt | |
2094 | + # setting need_resched or sigpending | |
2095 | + # between sampling and the iret | |
2096 | + TRACE_IRQS_OFF | |
2097 | + movl TI_flags(%ebp), %ecx | |
2098 | + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other | |
2099 | + # than syscall tracing? | |
2100 | + jz restore_all | |
2101 | + testb $_TIF_NEED_RESCHED, %cl | |
2102 | + jnz work_resched | |
2103 | + | |
2104 | +work_notifysig: # deal with pending signals and | |
2105 | + # notify-resume requests | |
2106 | + testl $VM_MASK, EFLAGS(%esp) | |
2107 | + movl %esp, %eax | |
2108 | + jne work_notifysig_v86 # returning to kernel-space or | |
2109 | + # vm86-space | |
2110 | + xorl %edx, %edx | |
2111 | + call do_notify_resume | |
2112 | + jmp resume_userspace_sig | |
2113 | + | |
2114 | + ALIGN | |
2115 | +work_notifysig_v86: | |
2116 | +#ifdef CONFIG_VM86 | |
2117 | + pushl %ecx # save ti_flags for do_notify_resume | |
2118 | + CFI_ADJUST_CFA_OFFSET 4 | |
2119 | + call save_v86_state # %eax contains pt_regs pointer | |
2120 | + popl %ecx | |
2121 | + CFI_ADJUST_CFA_OFFSET -4 | |
2122 | + movl %eax, %esp | |
2123 | + xorl %edx, %edx | |
2124 | + call do_notify_resume | |
2125 | + jmp resume_userspace_sig | |
2126 | +#endif | |
2127 | + | |
2128 | + # perform syscall exit tracing | |
2129 | + ALIGN | |
2130 | +syscall_trace_entry: | |
2131 | + movl $-ENOSYS,EAX(%esp) | |
2132 | + movl %esp, %eax | |
2133 | + xorl %edx,%edx | |
2134 | + call do_syscall_trace | |
2135 | + cmpl $0, %eax | |
2136 | + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, | |
2137 | + # so must skip actual syscall | |
2138 | + movl ORIG_EAX(%esp), %eax | |
2139 | + cmpl $(nr_syscalls), %eax | |
2140 | + jnae syscall_call | |
2141 | + jmp syscall_exit | |
2142 | + | |
2143 | + # perform syscall exit tracing | |
2144 | + ALIGN | |
2145 | +syscall_exit_work: | |
2146 | + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl | |
2147 | + jz work_pending | |
2148 | + TRACE_IRQS_ON | |
2149 | + ENABLE_INTERRUPTS # could let do_syscall_trace() call | |
2150 | + # schedule() instead | |
2151 | + movl %esp, %eax | |
2152 | + movl $1, %edx | |
2153 | + call do_syscall_trace | |
2154 | + jmp resume_userspace | |
2155 | + CFI_ENDPROC | |
2156 | + | |
2157 | + RING0_INT_FRAME # can't unwind into user space anyway | |
2158 | +syscall_fault: | |
2159 | + pushl %eax # save orig_eax | |
2160 | + CFI_ADJUST_CFA_OFFSET 4 | |
2161 | + SAVE_ALL | |
2162 | + GET_THREAD_INFO(%ebp) | |
2163 | + movl $-EFAULT,EAX(%esp) | |
2164 | + jmp resume_userspace | |
2165 | + | |
2166 | +syscall_badsys: | |
2167 | + movl $-ENOSYS,EAX(%esp) | |
2168 | + jmp resume_userspace | |
2169 | + CFI_ENDPROC | |
2170 | + | |
2171 | +#ifndef CONFIG_XEN | |
2172 | +#define FIXUP_ESPFIX_STACK \ | |
2173 | + movl %esp, %eax; \ | |
2174 | + /* switch to 32bit stack using the pointer on top of 16bit stack */ \ | |
2175 | + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ | |
2176 | + /* copy data from 16bit stack to 32bit stack */ \ | |
2177 | + call fixup_x86_bogus_stack; \ | |
2178 | + /* put ESP to the proper location */ \ | |
2179 | + movl %eax, %esp; | |
2180 | +#define UNWIND_ESPFIX_STACK \ | |
2181 | + pushl %eax; \ | |
2182 | + CFI_ADJUST_CFA_OFFSET 4; \ | |
2183 | + movl %ss, %eax; \ | |
2184 | + /* see if on 16bit stack */ \ | |
2185 | + cmpw $__ESPFIX_SS, %ax; \ | |
2186 | + je 28f; \ | |
2187 | +27: popl %eax; \ | |
2188 | + CFI_ADJUST_CFA_OFFSET -4; \ | |
2189 | +.section .fixup,"ax"; \ | |
2190 | +28: movl $__KERNEL_DS, %eax; \ | |
2191 | + movl %eax, %ds; \ | |
2192 | + movl %eax, %es; \ | |
2193 | + /* switch to 32bit stack */ \ | |
2194 | + FIXUP_ESPFIX_STACK; \ | |
2195 | + jmp 27b; \ | |
2196 | +.previous | |
2197 | + | |
2198 | +/* | |
2199 | + * Build the entry stubs and pointer table with | |
2200 | + * some assembler magic. | |
2201 | + */ | |
2202 | +.data | |
2203 | +ENTRY(interrupt) | |
2204 | +.text | |
2205 | + | |
2206 | +vector=0 | |
2207 | +ENTRY(irq_entries_start) | |
2208 | + RING0_INT_FRAME | |
2209 | +.rept NR_IRQS | |
2210 | + ALIGN | |
2211 | + .if vector | |
2212 | + CFI_ADJUST_CFA_OFFSET -4 | |
2213 | + .endif | |
2214 | +1: pushl $~(vector) | |
2215 | + CFI_ADJUST_CFA_OFFSET 4 | |
2216 | + jmp common_interrupt | |
2217 | +.data | |
2218 | + .long 1b | |
2219 | +.text | |
2220 | +vector=vector+1 | |
2221 | +.endr | |
2222 | + | |
2223 | +/* | |
2224 | + * the CPU automatically disables interrupts when executing an IRQ vector, | |
2225 | + * so IRQ-flags tracing has to follow that: | |
2226 | + */ | |
2227 | + ALIGN | |
2228 | +common_interrupt: | |
2229 | + SAVE_ALL | |
2230 | + TRACE_IRQS_OFF | |
2231 | + movl %esp,%eax | |
2232 | + call do_IRQ | |
2233 | + jmp ret_from_intr | |
2234 | + CFI_ENDPROC | |
2235 | + | |
2236 | +#define BUILD_INTERRUPT(name, nr) \ | |
2237 | +ENTRY(name) \ | |
2238 | + RING0_INT_FRAME; \ | |
2239 | + pushl $~(nr); \ | |
2240 | + CFI_ADJUST_CFA_OFFSET 4; \ | |
2241 | + SAVE_ALL; \ | |
2242 | + TRACE_IRQS_OFF \ | |
2243 | + movl %esp,%eax; \ | |
2244 | + call smp_/**/name; \ | |
2245 | + jmp ret_from_intr; \ | |
2246 | + CFI_ENDPROC | |
2247 | + | |
2248 | +/* The include is where all of the SMP etc. interrupts come from */ | |
2249 | +#include "entry_arch.h" | |
2250 | +#else | |
2251 | +#define UNWIND_ESPFIX_STACK | |
2252 | +#endif | |
2253 | + | |
2254 | +ENTRY(divide_error) | |
2255 | + RING0_INT_FRAME | |
2256 | + pushl $0 # no error code | |
2257 | + CFI_ADJUST_CFA_OFFSET 4 | |
2258 | + pushl $do_divide_error | |
2259 | + CFI_ADJUST_CFA_OFFSET 4 | |
2260 | + ALIGN | |
2261 | +error_code: | |
2262 | + pushl %ds | |
2263 | + CFI_ADJUST_CFA_OFFSET 4 | |
2264 | + /*CFI_REL_OFFSET ds, 0*/ | |
2265 | + pushl %eax | |
2266 | + CFI_ADJUST_CFA_OFFSET 4 | |
2267 | + CFI_REL_OFFSET eax, 0 | |
2268 | + xorl %eax, %eax | |
2269 | + pushl %ebp | |
2270 | + CFI_ADJUST_CFA_OFFSET 4 | |
2271 | + CFI_REL_OFFSET ebp, 0 | |
2272 | + pushl %edi | |
2273 | + CFI_ADJUST_CFA_OFFSET 4 | |
2274 | + CFI_REL_OFFSET edi, 0 | |
2275 | + pushl %esi | |
2276 | + CFI_ADJUST_CFA_OFFSET 4 | |
2277 | + CFI_REL_OFFSET esi, 0 | |
2278 | + pushl %edx | |
2279 | + CFI_ADJUST_CFA_OFFSET 4 | |
2280 | + CFI_REL_OFFSET edx, 0 | |
2281 | + decl %eax # eax = -1 | |
2282 | + pushl %ecx | |
2283 | + CFI_ADJUST_CFA_OFFSET 4 | |
2284 | + CFI_REL_OFFSET ecx, 0 | |
2285 | + pushl %ebx | |
2286 | + CFI_ADJUST_CFA_OFFSET 4 | |
2287 | + CFI_REL_OFFSET ebx, 0 | |
2288 | + cld | |
2289 | + pushl %es | |
2290 | + CFI_ADJUST_CFA_OFFSET 4 | |
2291 | + /*CFI_REL_OFFSET es, 0*/ | |
2292 | + UNWIND_ESPFIX_STACK | |
2293 | + popl %ecx | |
2294 | + CFI_ADJUST_CFA_OFFSET -4 | |
2295 | + /*CFI_REGISTER es, ecx*/ | |
2296 | + movl ES(%esp), %edi # get the function address | |
2297 | + movl ORIG_EAX(%esp), %edx # get the error code | |
2298 | + movl %eax, ORIG_EAX(%esp) | |
2299 | + movl %ecx, ES(%esp) | |
2300 | + /*CFI_REL_OFFSET es, ES*/ | |
2301 | + movl $(__USER_DS), %ecx | |
2302 | + movl %ecx, %ds | |
2303 | + movl %ecx, %es | |
2304 | + movl %esp,%eax # pt_regs pointer | |
2305 | + call *%edi | |
2306 | + jmp ret_from_exception | |
2307 | + CFI_ENDPROC | |
2308 | + | |
2309 | +#ifdef CONFIG_XEN | |
2310 | +# A note on the "critical region" in our callback handler. | |
2311 | +# We want to avoid stacking callback handlers due to events occurring | |
2312 | +# during handling of the last event. To do this, we keep events disabled | |
2313 | +# until we've done all processing. HOWEVER, we must enable events before | |
2314 | +# popping the stack frame (can't be done atomically) and so it would still | |
2315 | +# be possible to get enough handler activations to overflow the stack. | |
2316 | +# Although unlikely, bugs of that kind are hard to track down, so we'd | |
2317 | +# like to avoid the possibility. | |
2318 | +# So, on entry to the handler we detect whether we interrupted an | |
2319 | +# existing activation in its critical region -- if so, we pop the current | |
2320 | +# activation and restart the handler using the previous one. | |
2321 | +# | |
2322 | +# The sysexit critical region is slightly different. sysexit | |
2323 | +# atomically removes the entire stack frame. If we interrupt in the | |
2324 | +# critical region we know that the entire frame is present and correct | |
2325 | +# so we can simply throw away the new one. | |
2326 | +ENTRY(hypervisor_callback) | |
2327 | + RING0_INT_FRAME | |
2328 | + pushl %eax | |
2329 | + CFI_ADJUST_CFA_OFFSET 4 | |
2330 | + SAVE_ALL | |
2331 | + movl EIP(%esp),%eax | |
2332 | + cmpl $scrit,%eax | |
2333 | + jb 11f | |
2334 | + cmpl $ecrit,%eax | |
2335 | + jb critical_region_fixup | |
2336 | + cmpl $sysexit_scrit,%eax | |
2337 | + jb 11f | |
2338 | + cmpl $sysexit_ecrit,%eax | |
2339 | + ja 11f | |
2340 | + addl $OLDESP,%esp # Remove eflags...ebx from stack frame. | |
2341 | +11: push %esp | |
2342 | + CFI_ADJUST_CFA_OFFSET 4 | |
2343 | + call evtchn_do_upcall | |
2344 | + add $4,%esp | |
2345 | + CFI_ADJUST_CFA_OFFSET -4 | |
2346 | + jmp ret_from_intr | |
2347 | + CFI_ENDPROC | |
2348 | + | |
2349 | +# [How we do the fixup]. We want to merge the current stack frame with the | |
2350 | +# just-interrupted frame. How we do this depends on where in the critical | |
2351 | +# region the interrupted handler was executing, and so how many saved | |
2352 | +# registers are in each frame. We do this quickly using the lookup table | |
2353 | +# 'critical_fixup_table'. For each byte offset in the critical region, it | |
2354 | +# provides the number of bytes which have already been popped from the | |
2355 | +# interrupted stack frame. | |
2356 | +critical_region_fixup: | |
2357 | + movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped | |
2358 | + cmpb $0xff,%cl # 0xff => vcpu_info critical region | |
2359 | + jne 15f | |
2360 | + xorl %ecx,%ecx | |
2361 | +15: leal (%esp,%ecx),%esi # %esi points at end of src region | |
2362 | + leal OLDESP(%esp),%edi # %edi points at end of dst region | |
2363 | + shrl $2,%ecx # convert words to bytes | |
2364 | + je 17f # skip loop if nothing to copy | |
2365 | +16: subl $4,%esi # pre-decrementing copy loop | |
2366 | + subl $4,%edi | |
2367 | + movl (%esi),%eax | |
2368 | + movl %eax,(%edi) | |
2369 | + loop 16b | |
2370 | +17: movl %edi,%esp # final %edi is top of merged stack | |
2371 | + jmp 11b | |
2372 | + | |
2373 | +.section .rodata,"a" | |
2374 | +critical_fixup_table: | |
2375 | + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING | |
2376 | + .byte 0xff,0xff # jnz 14f | |
2377 | + .byte 0x00 # pop %ebx | |
2378 | + .byte 0x04 # pop %ecx | |
2379 | + .byte 0x08 # pop %edx | |
2380 | + .byte 0x0c # pop %esi | |
2381 | + .byte 0x10 # pop %edi | |
2382 | + .byte 0x14 # pop %ebp | |
2383 | + .byte 0x18 # pop %eax | |
2384 | + .byte 0x1c # pop %ds | |
2385 | + .byte 0x20 # pop %es | |
2386 | + .byte 0x24,0x24,0x24 # add $4,%esp | |
2387 | + .byte 0x28 # iret | |
2388 | + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) | |
2389 | + .byte 0x00,0x00 # jmp 11b | |
2390 | +.previous | |
2391 | + | |
2392 | +# Hypervisor uses this for application faults while it executes. | |
2393 | +# We get here for two reasons: | |
2394 | +# 1. Fault while reloading DS, ES, FS or GS | |
2395 | +# 2. Fault while executing IRET | |
2396 | +# Category 1 we fix up by reattempting the load, and zeroing the segment | |
2397 | +# register if the load fails. | |
2398 | +# Category 2 we fix up by jumping to do_iret_error. We cannot use the | |
2399 | +# normal Linux return path in this case because if we use the IRET hypercall | |
2400 | +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. | |
2401 | +# We distinguish between categories by maintaining a status value in EAX. | |
2402 | +ENTRY(failsafe_callback) | |
2403 | + pushl %eax | |
2404 | + movl $1,%eax | |
2405 | +1: mov 4(%esp),%ds | |
2406 | +2: mov 8(%esp),%es | |
2407 | +3: mov 12(%esp),%fs | |
2408 | +4: mov 16(%esp),%gs | |
2409 | + testl %eax,%eax | |
2410 | + popl %eax | |
2411 | + jz 5f | |
2412 | + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) | |
2413 | + jmp iret_exc | |
2414 | +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) | |
2415 | + RING0_INT_FRAME | |
2416 | + pushl $0 | |
2417 | + SAVE_ALL | |
2418 | + jmp ret_from_exception | |
2419 | +.section .fixup,"ax"; \ | |
2420 | +6: xorl %eax,%eax; \ | |
2421 | + movl %eax,4(%esp); \ | |
2422 | + jmp 1b; \ | |
2423 | +7: xorl %eax,%eax; \ | |
2424 | + movl %eax,8(%esp); \ | |
2425 | + jmp 2b; \ | |
2426 | +8: xorl %eax,%eax; \ | |
2427 | + movl %eax,12(%esp); \ | |
2428 | + jmp 3b; \ | |
2429 | +9: xorl %eax,%eax; \ | |
2430 | + movl %eax,16(%esp); \ | |
2431 | + jmp 4b; \ | |
2432 | +.previous; \ | |
2433 | +.section __ex_table,"a"; \ | |
2434 | + .align 4; \ | |
2435 | + .long 1b,6b; \ | |
2436 | + .long 2b,7b; \ | |
2437 | + .long 3b,8b; \ | |
2438 | + .long 4b,9b; \ | |
2439 | +.previous | |
2440 | +#endif | |
2441 | + CFI_ENDPROC | |
2442 | + | |
2443 | +ENTRY(coprocessor_error) | |
2444 | + RING0_INT_FRAME | |
2445 | + pushl $0 | |
2446 | + CFI_ADJUST_CFA_OFFSET 4 | |
2447 | + pushl $do_coprocessor_error | |
2448 | + CFI_ADJUST_CFA_OFFSET 4 | |
2449 | + jmp error_code | |
2450 | + CFI_ENDPROC | |
2451 | + | |
2452 | +ENTRY(simd_coprocessor_error) | |
2453 | + RING0_INT_FRAME | |
2454 | + pushl $0 | |
2455 | + CFI_ADJUST_CFA_OFFSET 4 | |
2456 | + pushl $do_simd_coprocessor_error | |
2457 | + CFI_ADJUST_CFA_OFFSET 4 | |
2458 | + jmp error_code | |
2459 | + CFI_ENDPROC | |
2460 | + | |
2461 | +ENTRY(device_not_available) | |
2462 | + RING0_INT_FRAME | |
2463 | + pushl $-1 # mark this as an int | |
2464 | + CFI_ADJUST_CFA_OFFSET 4 | |
2465 | + SAVE_ALL | |
2466 | +#ifndef CONFIG_XEN | |
2467 | + movl %cr0, %eax | |
2468 | + testl $0x4, %eax # EM (math emulation bit) | |
2469 | + je device_available_emulate | |
2470 | + pushl $0 # temporary storage for ORIG_EIP | |
2471 | + CFI_ADJUST_CFA_OFFSET 4 | |
2472 | + call math_emulate | |
2473 | + addl $4, %esp | |
2474 | + CFI_ADJUST_CFA_OFFSET -4 | |
2475 | + jmp ret_from_exception | |
2476 | +device_available_emulate: | |
2477 | +#endif | |
2478 | + preempt_stop | |
2479 | + call math_state_restore | |
2480 | + jmp ret_from_exception | |
2481 | + CFI_ENDPROC | |
2482 | + | |
2483 | +#ifndef CONFIG_XEN | |
2484 | +/* | |
2485 | + * Debug traps and NMI can happen at the one SYSENTER instruction | |
2486 | + * that sets up the real kernel stack. Check here, since we can't | |
2487 | + * allow the wrong stack to be used. | |
2488 | + * | |
2489 | + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have | |
2490 | + * already pushed 3 words if it hits on the sysenter instruction: | |
2491 | + * eflags, cs and eip. | |
2492 | + * | |
2493 | + * We just load the right stack, and push the three (known) values | |
2494 | + * by hand onto the new stack - while updating the return eip past | |
2495 | + * the instruction that would have done it for sysenter. | |
2496 | + */ | |
2497 | +#define FIX_STACK(offset, ok, label) \ | |
2498 | + cmpw $__KERNEL_CS,4(%esp); \ | |
2499 | + jne ok; \ | |
2500 | +label: \ | |
2501 | + movl SYSENTER_stack_esp0+offset(%esp),%esp; \ | |
2502 | + pushfl; \ | |
2503 | + pushl $__KERNEL_CS; \ | |
2504 | + pushl $sysenter_past_esp | |
2505 | +#endif /* CONFIG_XEN */ | |
2506 | + | |
2507 | +KPROBE_ENTRY(debug) | |
2508 | + RING0_INT_FRAME | |
2509 | +#ifndef CONFIG_XEN | |
2510 | + cmpl $sysenter_entry,(%esp) | |
2511 | + jne debug_stack_correct | |
2512 | + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) | |
2513 | +debug_stack_correct: | |
2514 | +#endif /* !CONFIG_XEN */ | |
2515 | + pushl $-1 # mark this as an int | |
2516 | + CFI_ADJUST_CFA_OFFSET 4 | |
2517 | + SAVE_ALL | |
2518 | + xorl %edx,%edx # error code 0 | |
2519 | + movl %esp,%eax # pt_regs pointer | |
2520 | + call do_debug | |
2521 | + jmp ret_from_exception | |
2522 | + CFI_ENDPROC | |
2523 | + .previous .text | |
2524 | +#ifndef CONFIG_XEN | |
2525 | +/* | |
2526 | + * NMI is doubly nasty. It can happen _while_ we're handling | |
2527 | + * a debug fault, and the debug fault hasn't yet been able to | |
2528 | + * clear up the stack. So we first check whether we got an | |
2529 | + * NMI on the sysenter entry path, but after that we need to | |
2530 | + * check whether we got an NMI on the debug path where the debug | |
2531 | + * fault happened on the sysenter path. | |
2532 | + */ | |
2533 | +ENTRY(nmi) | |
2534 | + RING0_INT_FRAME | |
2535 | + pushl %eax | |
2536 | + CFI_ADJUST_CFA_OFFSET 4 | |
2537 | + movl %ss, %eax | |
2538 | + cmpw $__ESPFIX_SS, %ax | |
2539 | + popl %eax | |
2540 | + CFI_ADJUST_CFA_OFFSET -4 | |
2541 | + je nmi_16bit_stack | |
2542 | + cmpl $sysenter_entry,(%esp) | |
2543 | + je nmi_stack_fixup | |
2544 | + pushl %eax | |
2545 | + CFI_ADJUST_CFA_OFFSET 4 | |
2546 | + movl %esp,%eax | |
2547 | + /* Do not access memory above the end of our stack page, | |
2548 | + * it might not exist. | |
2549 | + */ | |
2550 | + andl $(THREAD_SIZE-1),%eax | |
2551 | + cmpl $(THREAD_SIZE-20),%eax | |
2552 | + popl %eax | |
2553 | + CFI_ADJUST_CFA_OFFSET -4 | |
2554 | + jae nmi_stack_correct | |
2555 | + cmpl $sysenter_entry,12(%esp) | |
2556 | + je nmi_debug_stack_check | |
2557 | +nmi_stack_correct: | |
2558 | + pushl %eax | |
2559 | + CFI_ADJUST_CFA_OFFSET 4 | |
2560 | + SAVE_ALL | |
2561 | + xorl %edx,%edx # zero error code | |
2562 | + movl %esp,%eax # pt_regs pointer | |
2563 | + call do_nmi | |
2564 | + jmp restore_nocheck_notrace | |
2565 | + CFI_ENDPROC | |
2566 | + | |
2567 | +nmi_stack_fixup: | |
2568 | + FIX_STACK(12,nmi_stack_correct, 1) | |
2569 | + jmp nmi_stack_correct | |
2570 | +nmi_debug_stack_check: | |
2571 | + cmpw $__KERNEL_CS,16(%esp) | |
2572 | + jne nmi_stack_correct | |
2573 | + cmpl $debug,(%esp) | |
2574 | + jb nmi_stack_correct | |
2575 | + cmpl $debug_esp_fix_insn,(%esp) | |
2576 | + ja nmi_stack_correct | |
2577 | + FIX_STACK(24,nmi_stack_correct, 1) | |
2578 | + jmp nmi_stack_correct | |
2579 | + | |
2580 | +nmi_16bit_stack: | |
2581 | + RING0_INT_FRAME | |
2582 | + /* create the pointer to lss back */ | |
2583 | + pushl %ss | |
2584 | + CFI_ADJUST_CFA_OFFSET 4 | |
2585 | + pushl %esp | |
2586 | + CFI_ADJUST_CFA_OFFSET 4 | |
2587 | + movzwl %sp, %esp | |
2588 | + addw $4, (%esp) | |
2589 | + /* copy the iret frame of 12 bytes */ | |
2590 | + .rept 3 | |
2591 | + pushl 16(%esp) | |
2592 | + CFI_ADJUST_CFA_OFFSET 4 | |
2593 | + .endr | |
2594 | + pushl %eax | |
2595 | + CFI_ADJUST_CFA_OFFSET 4 | |
2596 | + SAVE_ALL | |
2597 | + FIXUP_ESPFIX_STACK # %eax == %esp | |
2598 | + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved | |
2599 | + xorl %edx,%edx # zero error code | |
2600 | + call do_nmi | |
2601 | + RESTORE_REGS | |
2602 | + lss 12+4(%esp), %esp # back to 16bit stack | |
2603 | +1: iret | |
2604 | + CFI_ENDPROC | |
2605 | +.section __ex_table,"a" | |
2606 | + .align 4 | |
2607 | + .long 1b,iret_exc | |
2608 | +.previous | |
2609 | +#else | |
2610 | +ENTRY(nmi) | |
2611 | + RING0_INT_FRAME | |
2612 | + pushl %eax | |
2613 | + CFI_ADJUST_CFA_OFFSET 4 | |
2614 | + SAVE_ALL | |
2615 | + xorl %edx,%edx # zero error code | |
2616 | + movl %esp,%eax # pt_regs pointer | |
2617 | + call do_nmi | |
2618 | + orl $NMI_MASK, EFLAGS(%esp) | |
2619 | + jmp restore_all | |
2620 | + CFI_ENDPROC | |
2621 | +#endif | |
2622 | + | |
2623 | +KPROBE_ENTRY(int3) | |
2624 | + RING0_INT_FRAME | |
2625 | + pushl $-1 # mark this as an int | |
2626 | + CFI_ADJUST_CFA_OFFSET 4 | |
2627 | + SAVE_ALL | |
2628 | + xorl %edx,%edx # zero error code | |
2629 | + movl %esp,%eax # pt_regs pointer | |
2630 | + call do_int3 | |
2631 | + jmp ret_from_exception | |
2632 | + CFI_ENDPROC | |
2633 | + .previous .text | |
2634 | + | |
2635 | +ENTRY(overflow) | |
2636 | + RING0_INT_FRAME | |
2637 | + pushl $0 | |
2638 | + CFI_ADJUST_CFA_OFFSET 4 | |
2639 | + pushl $do_overflow | |
2640 | + CFI_ADJUST_CFA_OFFSET 4 | |
2641 | + jmp error_code | |
2642 | + CFI_ENDPROC | |
2643 | + | |
2644 | +ENTRY(bounds) | |
2645 | + RING0_INT_FRAME | |
2646 | + pushl $0 | |
2647 | + CFI_ADJUST_CFA_OFFSET 4 | |
2648 | + pushl $do_bounds | |
2649 | + CFI_ADJUST_CFA_OFFSET 4 | |
2650 | + jmp error_code | |
2651 | + CFI_ENDPROC | |
2652 | + | |
2653 | +ENTRY(invalid_op) | |
2654 | + RING0_INT_FRAME | |
2655 | + pushl $0 | |
2656 | + CFI_ADJUST_CFA_OFFSET 4 | |
2657 | + pushl $do_invalid_op | |
2658 | + CFI_ADJUST_CFA_OFFSET 4 | |
2659 | + jmp error_code | |
2660 | + CFI_ENDPROC | |
2661 | + | |
2662 | +ENTRY(coprocessor_segment_overrun) | |
2663 | + RING0_INT_FRAME | |
2664 | + pushl $0 | |
2665 | + CFI_ADJUST_CFA_OFFSET 4 | |
2666 | + pushl $do_coprocessor_segment_overrun | |
2667 | + CFI_ADJUST_CFA_OFFSET 4 | |
2668 | + jmp error_code | |
2669 | + CFI_ENDPROC | |
2670 | + | |
2671 | +ENTRY(invalid_TSS) | |
2672 | + RING0_EC_FRAME | |
2673 | + pushl $do_invalid_TSS | |
2674 | + CFI_ADJUST_CFA_OFFSET 4 | |
2675 | + jmp error_code | |
2676 | + CFI_ENDPROC | |
2677 | + | |
2678 | +ENTRY(segment_not_present) | |
2679 | + RING0_EC_FRAME | |
2680 | + pushl $do_segment_not_present | |
2681 | + CFI_ADJUST_CFA_OFFSET 4 | |
2682 | + jmp error_code | |
2683 | + CFI_ENDPROC | |
2684 | + | |
2685 | +ENTRY(stack_segment) | |
2686 | + RING0_EC_FRAME | |
2687 | + pushl $do_stack_segment | |
2688 | + CFI_ADJUST_CFA_OFFSET 4 | |
2689 | + jmp error_code | |
2690 | + CFI_ENDPROC | |
2691 | + | |
2692 | +KPROBE_ENTRY(general_protection) | |
2693 | + RING0_EC_FRAME | |
2694 | + pushl $do_general_protection | |
2695 | + CFI_ADJUST_CFA_OFFSET 4 | |
2696 | + jmp error_code | |
2697 | + CFI_ENDPROC | |
2698 | + .previous .text | |
2699 | + | |
2700 | +ENTRY(alignment_check) | |
2701 | + RING0_EC_FRAME | |
2702 | + pushl $do_alignment_check | |
2703 | + CFI_ADJUST_CFA_OFFSET 4 | |
2704 | + jmp error_code | |
2705 | + CFI_ENDPROC | |
2706 | + | |
2707 | +KPROBE_ENTRY(page_fault) | |
2708 | + RING0_EC_FRAME | |
2709 | + pushl $do_page_fault | |
2710 | + CFI_ADJUST_CFA_OFFSET 4 | |
2711 | + jmp error_code | |
2712 | + CFI_ENDPROC | |
2713 | + .previous .text | |
2714 | + | |
2715 | +#ifdef CONFIG_X86_MCE | |
2716 | +ENTRY(machine_check) | |
2717 | + RING0_INT_FRAME | |
2718 | + pushl $0 | |
2719 | + CFI_ADJUST_CFA_OFFSET 4 | |
2720 | + pushl machine_check_vector | |
2721 | + CFI_ADJUST_CFA_OFFSET 4 | |
2722 | + jmp error_code | |
2723 | + CFI_ENDPROC | |
2724 | +#endif | |
2725 | + | |
2726 | +#ifndef CONFIG_XEN | |
2727 | +ENTRY(spurious_interrupt_bug) | |
2728 | + RING0_INT_FRAME | |
2729 | + pushl $0 | |
2730 | + CFI_ADJUST_CFA_OFFSET 4 | |
2731 | + pushl $do_spurious_interrupt_bug | |
2732 | + CFI_ADJUST_CFA_OFFSET 4 | |
2733 | + jmp error_code | |
2734 | + CFI_ENDPROC | |
2735 | +#endif /* !CONFIG_XEN */ | |
2736 | + | |
2737 | +#ifdef CONFIG_STACK_UNWIND | |
2738 | +ENTRY(arch_unwind_init_running) | |
2739 | + CFI_STARTPROC | |
2740 | + movl 4(%esp), %edx | |
2741 | + movl (%esp), %ecx | |
2742 | + leal 4(%esp), %eax | |
2743 | + movl %ebx, EBX(%edx) | |
2744 | + xorl %ebx, %ebx | |
2745 | + movl %ebx, ECX(%edx) | |
2746 | + movl %ebx, EDX(%edx) | |
2747 | + movl %esi, ESI(%edx) | |
2748 | + movl %edi, EDI(%edx) | |
2749 | + movl %ebp, EBP(%edx) | |
2750 | + movl %ebx, EAX(%edx) | |
2751 | + movl $__USER_DS, DS(%edx) | |
2752 | + movl $__USER_DS, ES(%edx) | |
2753 | + movl %ebx, ORIG_EAX(%edx) | |
2754 | + movl %ecx, EIP(%edx) | |
2755 | + movl 12(%esp), %ecx | |
2756 | + movl $__KERNEL_CS, CS(%edx) | |
2757 | + movl %ebx, EFLAGS(%edx) | |
2758 | + movl %eax, OLDESP(%edx) | |
2759 | + movl 8(%esp), %eax | |
2760 | + movl %ecx, 8(%esp) | |
2761 | + movl EBX(%edx), %ebx | |
2762 | + movl $__KERNEL_DS, OLDSS(%edx) | |
2763 | + jmpl *%eax | |
2764 | + CFI_ENDPROC | |
2765 | +ENDPROC(arch_unwind_init_running) | |
2766 | +#endif | |
2767 | + | |
2768 | +ENTRY(fixup_4gb_segment) | |
2769 | + RING0_EC_FRAME | |
2770 | + pushl $do_fixup_4gb_segment | |
2771 | + CFI_ADJUST_CFA_OFFSET 4 | |
2772 | + jmp error_code | |
2773 | + CFI_ENDPROC | |
2774 | + | |
2775 | +.section .rodata,"a" | |
2776 | +#include "syscall_table.S" | |
2777 | + | |
2778 | +syscall_table_size=(.-sys_call_table) | |
2779 | Index: head-2008-11-25/arch/x86/kernel/fixup.c | |
2780 | =================================================================== | |
2781 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
2782 | +++ head-2008-11-25/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100 | |
2783 | @@ -0,0 +1,88 @@ | |
2784 | +/****************************************************************************** | |
2785 | + * fixup.c | |
2786 | + * | |
2787 | + * Binary-rewriting of certain IA32 instructions, on notification by Xen. | |
2788 | + * Used to avoid repeated slow emulation of common instructions used by the | |
2789 | + * user-space TLS (Thread-Local Storage) libraries. | |
2790 | + * | |
2791 | + * **** NOTE **** | |
2792 | + * Issues with the binary rewriting have caused it to be removed. Instead | |
2793 | + * we rely on Xen's emulator to boot the kernel, and then print a banner | |
2794 | + * message recommending that the user disables /lib/tls. | |
2795 | + * | |
2796 | + * Copyright (c) 2004, K A Fraser | |
2797 | + * | |
2798 | + * This program is free software; you can redistribute it and/or modify | |
2799 | + * it under the terms of the GNU General Public License as published by | |
2800 | + * the Free Software Foundation; either version 2 of the License, or | |
2801 | + * (at your option) any later version. | |
2802 | + * | |
2803 | + * This program is distributed in the hope that it will be useful, | |
2804 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
2805 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
2806 | + * GNU General Public License for more details. | |
2807 | + * | |
2808 | + * You should have received a copy of the GNU General Public License | |
2809 | + * along with this program; if not, write to the Free Software | |
2810 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
2811 | + */ | |
2812 | + | |
2813 | +#include <linux/init.h> | |
2814 | +#include <linux/sched.h> | |
2815 | +#include <linux/slab.h> | |
2816 | +#include <linux/kernel.h> | |
2817 | +#include <linux/delay.h> | |
2818 | +#include <linux/version.h> | |
2819 | + | |
2820 | +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) | |
2821 | + | |
2822 | +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) | |
2823 | +{ | |
2824 | + static unsigned long printed = 0; | |
2825 | + char info[100]; | |
2826 | + int i; | |
2827 | + | |
2828 | + /* Ignore statically-linked init. */ | |
2829 | + if (current->tgid == 1) | |
2830 | + return; | |
2831 | + | |
2832 | + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, | |
2833 | + VMASST_TYPE_4gb_segments_notify)); | |
2834 | + | |
2835 | + if (test_and_set_bit(0, &printed)) | |
2836 | + return; | |
2837 | + | |
2838 | + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); | |
2839 | + | |
2840 | + DP(""); | |
2841 | + DP("***************************************************************"); | |
2842 | + DP("***************************************************************"); | |
2843 | + DP("** WARNING: Currently emulating unsupported memory accesses **"); | |
2844 | + DP("** in /lib/tls glibc libraries. The emulation is **"); | |
2845 | + DP("** slow. To ensure full performance you should **"); | |
2846 | + DP("** install a 'xen-friendly' (nosegneg) version of **"); | |
2847 | + DP("** the library, or disable tls support by executing **"); | |
2848 | + DP("** the following as root: **"); | |
2849 | + DP("** mv /lib/tls /lib/tls.disabled **"); | |
2850 | + DP("** Offending process: %-38.38s **", info); | |
2851 | + DP("***************************************************************"); | |
2852 | + DP("***************************************************************"); | |
2853 | + DP(""); | |
2854 | + | |
2855 | + for (i = 5; i > 0; i--) { | |
2856 | + touch_softlockup_watchdog(); | |
2857 | + printk("Pausing... %d", i); | |
2858 | + mdelay(1000); | |
2859 | + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); | |
2860 | + } | |
2861 | + | |
2862 | + printk("Continuing...\n\n"); | |
2863 | +} | |
2864 | + | |
2865 | +static int __init fixup_init(void) | |
2866 | +{ | |
2867 | + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
2868 | + VMASST_TYPE_4gb_segments_notify)); | |
2869 | + return 0; | |
2870 | +} | |
2871 | +__initcall(fixup_init); | |
2872 | Index: head-2008-11-25/arch/x86/kernel/head_32-xen.S | |
2873 | =================================================================== | |
2874 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
2875 | +++ head-2008-11-25/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200 | |
2876 | @@ -0,0 +1,207 @@ | |
2877 | + | |
2878 | + | |
2879 | +.text | |
2880 | +#include <linux/elfnote.h> | |
2881 | +#include <linux/threads.h> | |
2882 | +#include <linux/linkage.h> | |
2883 | +#include <asm/segment.h> | |
2884 | +#include <asm/page.h> | |
2885 | +#include <asm/cache.h> | |
2886 | +#include <asm/thread_info.h> | |
2887 | +#include <asm/asm-offsets.h> | |
2888 | +#include <asm/dwarf2.h> | |
2889 | +#include <xen/interface/xen.h> | |
2890 | +#include <xen/interface/elfnote.h> | |
2891 | + | |
2892 | +/* | |
2893 | + * References to members of the new_cpu_data structure. | |
2894 | + */ | |
2895 | + | |
2896 | +#define X86 new_cpu_data+CPUINFO_x86 | |
2897 | +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor | |
2898 | +#define X86_MODEL new_cpu_data+CPUINFO_x86_model | |
2899 | +#define X86_MASK new_cpu_data+CPUINFO_x86_mask | |
2900 | +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math | |
2901 | +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level | |
2902 | +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability | |
2903 | +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id | |
2904 | + | |
2905 | +#define VIRT_ENTRY_OFFSET 0x0 | |
2906 | +.org VIRT_ENTRY_OFFSET | |
2907 | +ENTRY(startup_32) | |
2908 | + movl %esi,xen_start_info | |
2909 | + cld | |
2910 | + | |
2911 | + /* Set up the stack pointer */ | |
2912 | + movl $(init_thread_union+THREAD_SIZE),%esp | |
2913 | + | |
2914 | + /* get vendor info */ | |
2915 | + xorl %eax,%eax # call CPUID with 0 -> return vendor ID | |
2916 | + XEN_CPUID | |
2917 | + movl %eax,X86_CPUID # save CPUID level | |
2918 | + movl %ebx,X86_VENDOR_ID # lo 4 chars | |
2919 | + movl %edx,X86_VENDOR_ID+4 # next 4 chars | |
2920 | + movl %ecx,X86_VENDOR_ID+8 # last 4 chars | |
2921 | + | |
2922 | + movl $1,%eax # Use the CPUID instruction to get CPU type | |
2923 | + XEN_CPUID | |
2924 | + movb %al,%cl # save reg for future use | |
2925 | + andb $0x0f,%ah # mask processor family | |
2926 | + movb %ah,X86 | |
2927 | + andb $0xf0,%al # mask model | |
2928 | + shrb $4,%al | |
2929 | + movb %al,X86_MODEL | |
2930 | + andb $0x0f,%cl # mask mask revision | |
2931 | + movb %cl,X86_MASK | |
2932 | + movl %edx,X86_CAPABILITY | |
2933 | + | |
2934 | + movb $1,X86_HARD_MATH | |
2935 | + | |
2936 | + xorl %eax,%eax # Clear FS/GS and LDT | |
2937 | + movl %eax,%fs | |
2938 | + movl %eax,%gs | |
2939 | + cld # gcc2 wants the direction flag cleared at all times | |
2940 | + | |
2941 | + pushl %eax # fake return address | |
2942 | + jmp start_kernel | |
2943 | + | |
2944 | +#define HYPERCALL_PAGE_OFFSET 0x1000 | |
2945 | +.org HYPERCALL_PAGE_OFFSET | |
2946 | +ENTRY(hypercall_page) | |
2947 | + CFI_STARTPROC | |
2948 | +.skip 0x1000 | |
2949 | + CFI_ENDPROC | |
2950 | + | |
2951 | +/* | |
2952 | + * Real beginning of normal "text" segment | |
2953 | + */ | |
2954 | +ENTRY(stext) | |
2955 | +ENTRY(_stext) | |
2956 | + | |
2957 | +/* | |
2958 | + * BSS section | |
2959 | + */ | |
2960 | +.section ".bss.page_aligned","w" | |
2961 | +ENTRY(empty_zero_page) | |
2962 | + .fill 4096,1,0 | |
2963 | + | |
2964 | +/* | |
2965 | + * This starts the data section. | |
2966 | + */ | |
2967 | +.data | |
2968 | + | |
2969 | +/* | |
2970 | + * The Global Descriptor Table contains 32 quadwords, per-CPU. | |
2971 | + */ | |
2972 | + .align L1_CACHE_BYTES | |
2973 | +ENTRY(cpu_gdt_table) | |
2974 | + .quad 0x0000000000000000 /* NULL descriptor */ | |
2975 | + .quad 0x0000000000000000 /* 0x0b reserved */ | |
2976 | + .quad 0x0000000000000000 /* 0x13 reserved */ | |
2977 | + .quad 0x0000000000000000 /* 0x1b reserved */ | |
2978 | + .quad 0x0000000000000000 /* 0x20 unused */ | |
2979 | + .quad 0x0000000000000000 /* 0x28 unused */ | |
2980 | + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ | |
2981 | + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ | |
2982 | + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ | |
2983 | + .quad 0x0000000000000000 /* 0x4b reserved */ | |
2984 | + .quad 0x0000000000000000 /* 0x53 reserved */ | |
2985 | + .quad 0x0000000000000000 /* 0x5b reserved */ | |
2986 | + | |
2987 | + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ | |
2988 | + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ | |
2989 | + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ | |
2990 | + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ | |
2991 | + | |
2992 | + .quad 0x0000000000000000 /* 0x80 TSS descriptor */ | |
2993 | + .quad 0x0000000000000000 /* 0x88 LDT descriptor */ | |
2994 | + | |
2995 | + /* | |
2996 | + * Segments used for calling PnP BIOS have byte granularity. | |
2997 | + * The code segments and data segments have fixed 64k limits, | |
2998 | + * the transfer segment sizes are set at run time. | |
2999 | + */ | |
3000 | + .quad 0x0000000000000000 /* 0x90 32-bit code */ | |
3001 | + .quad 0x0000000000000000 /* 0x98 16-bit code */ | |
3002 | + .quad 0x0000000000000000 /* 0xa0 16-bit data */ | |
3003 | + .quad 0x0000000000000000 /* 0xa8 16-bit data */ | |
3004 | + .quad 0x0000000000000000 /* 0xb0 16-bit data */ | |
3005 | + | |
3006 | + /* | |
3007 | + * The APM segments have byte granularity and their bases | |
3008 | + * are set at run time. All have 64k limits. | |
3009 | + */ | |
3010 | + .quad 0x0000000000000000 /* 0xb8 APM CS code */ | |
3011 | + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ | |
3012 | + .quad 0x0000000000000000 /* 0xc8 APM DS data */ | |
3013 | + | |
3014 | + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */ | |
3015 | + .quad 0x0000000000000000 /* 0xd8 - unused */ | |
3016 | + .quad 0x0000000000000000 /* 0xe0 - unused */ | |
3017 | + .quad 0x0000000000000000 /* 0xe8 - unused */ | |
3018 | + .quad 0x0000000000000000 /* 0xf0 - unused */ | |
3019 | + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ | |
3020 | + | |
3021 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
3022 | +/* | |
3023 | + * __xen_guest information | |
3024 | + */ | |
3025 | +.macro utoa value | |
3026 | + .if (\value) < 0 || (\value) >= 0x10 | |
3027 | + utoa (((\value)>>4)&0x0fffffff) | |
3028 | + .endif | |
3029 | + .if ((\value) & 0xf) < 10 | |
3030 | + .byte '0' + ((\value) & 0xf) | |
3031 | + .else | |
3032 | + .byte 'A' + ((\value) & 0xf) - 10 | |
3033 | + .endif | |
3034 | +.endm | |
3035 | + | |
3036 | +.section __xen_guest | |
3037 | + .ascii "GUEST_OS=linux,GUEST_VER=2.6" | |
3038 | + .ascii ",XEN_VER=xen-3.0" | |
3039 | + .ascii ",VIRT_BASE=0x" | |
3040 | + utoa __PAGE_OFFSET | |
3041 | + .ascii ",ELF_PADDR_OFFSET=0x" | |
3042 | + utoa __PAGE_OFFSET | |
3043 | + .ascii ",VIRT_ENTRY=0x" | |
3044 | + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET) | |
3045 | + .ascii ",HYPERCALL_PAGE=0x" | |
3046 | + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) | |
3047 | + .ascii ",FEATURES=writable_page_tables" | |
3048 | + .ascii "|writable_descriptor_tables" | |
3049 | + .ascii "|auto_translated_physmap" | |
3050 | + .ascii "|pae_pgdir_above_4gb" | |
3051 | + .ascii "|supervisor_mode_kernel" | |
3052 | +#ifdef CONFIG_X86_PAE | |
3053 | + .ascii ",PAE=yes[extended-cr3]" | |
3054 | +#else | |
3055 | + .ascii ",PAE=no" | |
3056 | +#endif | |
3057 | + .ascii ",LOADER=generic" | |
3058 | + .byte 0 | |
3059 | +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ | |
3060 | + | |
3061 | + | |
3062 | + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") | |
3063 | + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") | |
3064 | + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") | |
3065 | + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) | |
3066 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
3067 | + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) | |
3068 | +#else | |
3069 | + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) | |
3070 | +#endif | |
3071 | + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) | |
3072 | + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) | |
3073 | + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) | |
3074 | + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") | |
3075 | +#ifdef CONFIG_X86_PAE | |
3076 | + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") | |
3077 | + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) | |
3078 | +#else | |
3079 | + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") | |
3080 | + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) | |
3081 | +#endif | |
3082 | + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") | |
3083 | + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) | |
3084 | Index: head-2008-11-25/arch/x86/kernel/init_task-xen.c | |
3085 | =================================================================== | |
3086 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
3087 | +++ head-2008-11-25/arch/x86/kernel/init_task-xen.c 2007-06-12 13:12:48.000000000 +0200 | |
3088 | @@ -0,0 +1,51 @@ | |
3089 | +#include <linux/mm.h> | |
3090 | +#include <linux/module.h> | |
3091 | +#include <linux/sched.h> | |
3092 | +#include <linux/init.h> | |
3093 | +#include <linux/init_task.h> | |
3094 | +#include <linux/fs.h> | |
3095 | +#include <linux/mqueue.h> | |
3096 | + | |
3097 | +#include <asm/uaccess.h> | |
3098 | +#include <asm/pgtable.h> | |
3099 | +#include <asm/desc.h> | |
3100 | + | |
3101 | +static struct fs_struct init_fs = INIT_FS; | |
3102 | +static struct files_struct init_files = INIT_FILES; | |
3103 | +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | |
3104 | +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | |
3105 | + | |
3106 | +#define swapper_pg_dir ((pgd_t *)NULL) | |
3107 | +struct mm_struct init_mm = INIT_MM(init_mm); | |
3108 | +#undef swapper_pg_dir | |
3109 | + | |
3110 | +EXPORT_SYMBOL(init_mm); | |
3111 | + | |
3112 | +/* | |
3113 | + * Initial thread structure. | |
3114 | + * | |
3115 | + * We need to make sure that this is THREAD_SIZE aligned due to the | |
3116 | + * way process stacks are handled. This is done by having a special | |
3117 | + * "init_task" linker map entry.. | |
3118 | + */ | |
3119 | +union thread_union init_thread_union | |
3120 | + __attribute__((__section__(".data.init_task"))) = | |
3121 | + { INIT_THREAD_INFO(init_task) }; | |
3122 | + | |
3123 | +/* | |
3124 | + * Initial task structure. | |
3125 | + * | |
3126 | + * All other task structs will be allocated on slabs in fork.c | |
3127 | + */ | |
3128 | +struct task_struct init_task = INIT_TASK(init_task); | |
3129 | + | |
3130 | +EXPORT_SYMBOL(init_task); | |
3131 | + | |
3132 | +#ifndef CONFIG_X86_NO_TSS | |
3133 | +/* | |
3134 | + * per-CPU TSS segments. Threads are completely 'soft' on Linux, | |
3135 | + * no more per-task TSS's. | |
3136 | + */ | |
3137 | +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; | |
3138 | +#endif | |
3139 | + | |
3140 | Index: head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c | |
3141 | =================================================================== | |
3142 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
3143 | +++ head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c 2008-11-25 12:22:34.000000000 +0100 | |
3144 | @@ -0,0 +1,2776 @@ | |
3145 | +/* | |
3146 | + * Intel IO-APIC support for multi-Pentium hosts. | |
3147 | + * | |
3148 | + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | |
3149 | + * | |
3150 | + * Many thanks to Stig Venaas for trying out countless experimental | |
3151 | + * patches and reporting/debugging problems patiently! | |
3152 | + * | |
3153 | + * (c) 1999, Multiple IO-APIC support, developed by | |
3154 | + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | |
3155 | + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | |
3156 | + * further tested and cleaned up by Zach Brown <zab@redhat.com> | |
3157 | + * and Ingo Molnar <mingo@redhat.com> | |
3158 | + * | |
3159 | + * Fixes | |
3160 | + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | |
3161 | + * thanks to Eric Gilmore | |
3162 | + * and Rolf G. Tews | |
3163 | + * for testing these extensively | |
3164 | + * Paul Diefenbaugh : Added full ACPI support | |
3165 | + */ | |
3166 | + | |
3167 | +#include <linux/mm.h> | |
3168 | +#include <linux/interrupt.h> | |
3169 | +#include <linux/init.h> | |
3170 | +#include <linux/delay.h> | |
3171 | +#include <linux/sched.h> | |
3172 | +#include <linux/smp_lock.h> | |
3173 | +#include <linux/mc146818rtc.h> | |
3174 | +#include <linux/compiler.h> | |
3175 | +#include <linux/acpi.h> | |
3176 | +#include <linux/module.h> | |
3177 | +#include <linux/sysdev.h> | |
3178 | + | |
3179 | +#include <asm/io.h> | |
3180 | +#include <asm/smp.h> | |
3181 | +#include <asm/desc.h> | |
3182 | +#include <asm/timer.h> | |
3183 | +#include <asm/i8259.h> | |
3184 | +#include <asm/nmi.h> | |
3185 | + | |
3186 | +#include <mach_apic.h> | |
3187 | + | |
3188 | +#include "io_ports.h" | |
3189 | + | |
3190 | +#ifdef CONFIG_XEN | |
3191 | + | |
3192 | +#include <xen/interface/xen.h> | |
3193 | +#include <xen/interface/physdev.h> | |
3194 | +#include <xen/evtchn.h> | |
3195 | + | |
3196 | +/* Fake i8259 */ | |
3197 | +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) | |
3198 | +#define disable_8259A_irq(_irq) ((void)0) | |
3199 | +#define i8259A_irq_pending(_irq) (0) | |
3200 | + | |
3201 | +unsigned long io_apic_irqs; | |
3202 | + | |
3203 | +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) | |
3204 | +{ | |
3205 | + struct physdev_apic apic_op; | |
3206 | + int ret; | |
3207 | + | |
3208 | + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; | |
3209 | + apic_op.reg = reg; | |
3210 | + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); | |
3211 | + if (ret) | |
3212 | + return ret; | |
3213 | + return apic_op.value; | |
3214 | +} | |
3215 | + | |
3216 | +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | |
3217 | +{ | |
3218 | + struct physdev_apic apic_op; | |
3219 | + | |
3220 | + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; | |
3221 | + apic_op.reg = reg; | |
3222 | + apic_op.value = value; | |
3223 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); | |
3224 | +} | |
3225 | + | |
3226 | +#define io_apic_read(a,r) xen_io_apic_read(a,r) | |
3227 | +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) | |
3228 | + | |
3229 | +#endif /* CONFIG_XEN */ | |
3230 | + | |
3231 | +int (*ioapic_renumber_irq)(int ioapic, int irq); | |
3232 | +atomic_t irq_mis_count; | |
3233 | + | |
3234 | +/* Where if anywhere is the i8259 connect in external int mode */ | |
3235 | +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | |
3236 | + | |
3237 | +static DEFINE_SPINLOCK(ioapic_lock); | |
3238 | +static DEFINE_SPINLOCK(vector_lock); | |
3239 | + | |
3240 | +int timer_over_8254 __initdata = 1; | |
3241 | + | |
3242 | +/* | |
3243 | + * Is the SiS APIC rmw bug present ? | |
3244 | + * -1 = don't know, 0 = no, 1 = yes | |
3245 | + */ | |
3246 | +int sis_apic_bug = -1; | |
3247 | + | |
3248 | +/* | |
3249 | + * # of IRQ routing registers | |
3250 | + */ | |
3251 | +int nr_ioapic_registers[MAX_IO_APICS]; | |
3252 | + | |
3253 | +int disable_timer_pin_1 __initdata; | |
3254 | + | |
3255 | +/* | |
3256 | + * Rough estimation of how many shared IRQs there are, can | |
3257 | + * be changed anytime. | |
3258 | + */ | |
3259 | +#define MAX_PLUS_SHARED_IRQS NR_IRQS | |
3260 | +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | |
3261 | + | |
3262 | +/* | |
3263 | + * This is performance-critical, we want to do it O(1) | |
3264 | + * | |
3265 | + * the indexing order of this array favors 1:1 mappings | |
3266 | + * between pins and IRQs. | |
3267 | + */ | |
3268 | + | |
3269 | +static struct irq_pin_list { | |
3270 | + int apic, pin, next; | |
3271 | +} irq_2_pin[PIN_MAP_SIZE]; | |
3272 | + | |
3273 | +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; | |
3274 | +#ifdef CONFIG_PCI_MSI | |
3275 | +#define vector_to_irq(vector) \ | |
3276 | + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) | |
3277 | +#else | |
3278 | +#define vector_to_irq(vector) (vector) | |
3279 | +#endif | |
3280 | + | |
3281 | +/* | |
3282 | + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | |
3283 | + * shared ISA-space IRQs, so we have to support them. We are super | |
3284 | + * fast in the common case, and fast for shared ISA-space IRQs. | |
3285 | + */ | |
3286 | +static void add_pin_to_irq(unsigned int irq, int apic, int pin) | |
3287 | +{ | |
3288 | + static int first_free_entry = NR_IRQS; | |
3289 | + struct irq_pin_list *entry = irq_2_pin + irq; | |
3290 | + | |
3291 | + while (entry->next) | |
3292 | + entry = irq_2_pin + entry->next; | |
3293 | + | |
3294 | + if (entry->pin != -1) { | |
3295 | + entry->next = first_free_entry; | |
3296 | + entry = irq_2_pin + entry->next; | |
3297 | + if (++first_free_entry >= PIN_MAP_SIZE) | |
3298 | + panic("io_apic.c: whoops"); | |
3299 | + } | |
3300 | + entry->apic = apic; | |
3301 | + entry->pin = pin; | |
3302 | +} | |
3303 | + | |
3304 | +#ifdef CONFIG_XEN | |
3305 | +#define clear_IO_APIC() ((void)0) | |
3306 | +#else | |
3307 | +/* | |
3308 | + * Reroute an IRQ to a different pin. | |
3309 | + */ | |
3310 | +static void __init replace_pin_at_irq(unsigned int irq, | |
3311 | + int oldapic, int oldpin, | |
3312 | + int newapic, int newpin) | |
3313 | +{ | |
3314 | + struct irq_pin_list *entry = irq_2_pin + irq; | |
3315 | + | |
3316 | + while (1) { | |
3317 | + if (entry->apic == oldapic && entry->pin == oldpin) { | |
3318 | + entry->apic = newapic; | |
3319 | + entry->pin = newpin; | |
3320 | + } | |
3321 | + if (!entry->next) | |
3322 | + break; | |
3323 | + entry = irq_2_pin + entry->next; | |
3324 | + } | |
3325 | +} | |
3326 | + | |
3327 | +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) | |
3328 | +{ | |
3329 | + struct irq_pin_list *entry = irq_2_pin + irq; | |
3330 | + unsigned int pin, reg; | |
3331 | + | |
3332 | + for (;;) { | |
3333 | + pin = entry->pin; | |
3334 | + if (pin == -1) | |
3335 | + break; | |
3336 | + reg = io_apic_read(entry->apic, 0x10 + pin*2); | |
3337 | + reg &= ~disable; | |
3338 | + reg |= enable; | |
3339 | + io_apic_modify(entry->apic, 0x10 + pin*2, reg); | |
3340 | + if (!entry->next) | |
3341 | + break; | |
3342 | + entry = irq_2_pin + entry->next; | |
3343 | + } | |
3344 | +} | |
3345 | + | |
3346 | +/* mask = 1 */ | |
3347 | +static void __mask_IO_APIC_irq (unsigned int irq) | |
3348 | +{ | |
3349 | + __modify_IO_APIC_irq(irq, 0x00010000, 0); | |
3350 | +} | |
3351 | + | |
3352 | +/* mask = 0 */ | |
3353 | +static void __unmask_IO_APIC_irq (unsigned int irq) | |
3354 | +{ | |
3355 | + __modify_IO_APIC_irq(irq, 0, 0x00010000); | |
3356 | +} | |
3357 | + | |
3358 | +/* mask = 1, trigger = 0 */ | |
3359 | +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) | |
3360 | +{ | |
3361 | + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); | |
3362 | +} | |
3363 | + | |
3364 | +/* mask = 0, trigger = 1 */ | |
3365 | +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) | |
3366 | +{ | |
3367 | + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); | |
3368 | +} | |
3369 | + | |
3370 | +static void mask_IO_APIC_irq (unsigned int irq) | |
3371 | +{ | |
3372 | + unsigned long flags; | |
3373 | + | |
3374 | + spin_lock_irqsave(&ioapic_lock, flags); | |
3375 | + __mask_IO_APIC_irq(irq); | |
3376 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
3377 | +} | |
3378 | + | |
3379 | +static void unmask_IO_APIC_irq (unsigned int irq) | |
3380 | +{ | |
3381 | + unsigned long flags; | |
3382 | + | |
3383 | + spin_lock_irqsave(&ioapic_lock, flags); | |
3384 | + __unmask_IO_APIC_irq(irq); | |
3385 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
3386 | +} | |
3387 | + | |
3388 | +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | |
3389 | +{ | |
3390 | + struct IO_APIC_route_entry entry; | |
3391 | + unsigned long flags; | |
3392 | + | |
3393 | + /* Check delivery_mode to be sure we're not clearing an SMI pin */ | |
3394 | + spin_lock_irqsave(&ioapic_lock, flags); | |
3395 | + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
3396 | + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
3397 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
3398 | + if (entry.delivery_mode == dest_SMI) | |
3399 | + return; | |
3400 | + | |
3401 | + /* | |
3402 | + * Disable it in the IO-APIC irq-routing table: | |
3403 | + */ | |
3404 | + memset(&entry, 0, sizeof(entry)); | |
3405 | + entry.mask = 1; | |
3406 | + spin_lock_irqsave(&ioapic_lock, flags); | |
3407 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); | |
3408 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); | |
3409 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
3410 | +} | |
3411 | + | |
3412 | +static void clear_IO_APIC (void) | |
3413 | +{ | |
3414 | + int apic, pin; | |
3415 | + | |
3416 | + for (apic = 0; apic < nr_ioapics; apic++) | |
3417 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | |
3418 | + clear_IO_APIC_pin(apic, pin); | |
3419 | +} | |
3420 | + | |
3421 | +#ifdef CONFIG_SMP | |
3422 | +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) | |
3423 | +{ | |
3424 | + unsigned long flags; | |
3425 | + int pin; | |
3426 | + struct irq_pin_list *entry = irq_2_pin + irq; | |
3427 | + unsigned int apicid_value; | |
3428 | + cpumask_t tmp; | |
3429 | + | |
3430 | + cpus_and(tmp, cpumask, cpu_online_map); | |
3431 | + if (cpus_empty(tmp)) | |
3432 | + tmp = TARGET_CPUS; | |
3433 | + | |
3434 | + cpus_and(cpumask, tmp, CPU_MASK_ALL); | |
3435 | + | |
3436 | + apicid_value = cpu_mask_to_apicid(cpumask); | |
3437 | + /* Prepare to do the io_apic_write */ | |
3438 | + apicid_value = apicid_value << 24; | |
3439 | + spin_lock_irqsave(&ioapic_lock, flags); | |
3440 | + for (;;) { | |
3441 | + pin = entry->pin; | |
3442 | + if (pin == -1) | |
3443 | + break; | |
3444 | + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); | |
3445 | + if (!entry->next) | |
3446 | + break; | |
3447 | + entry = irq_2_pin + entry->next; | |
3448 | + } | |
3449 | + set_irq_info(irq, cpumask); | |
3450 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
3451 | +} | |
3452 | + | |
3453 | +#if defined(CONFIG_IRQBALANCE) | |
3454 | +# include <asm/processor.h> /* kernel_thread() */ | |
3455 | +# include <linux/kernel_stat.h> /* kstat */ | |
3456 | +# include <linux/slab.h> /* kmalloc() */ | |
3457 | +# include <linux/timer.h> /* time_after() */ | |
3458 | + | |
3459 | +#ifdef CONFIG_BALANCED_IRQ_DEBUG | |
3460 | +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) | |
3461 | +# define Dprintk(x...) do { TDprintk(x); } while (0) | |
3462 | +# else | |
3463 | +# define TDprintk(x...) | |
3464 | +# define Dprintk(x...) | |
3465 | +# endif | |
3466 | + | |
3467 | +#define IRQBALANCE_CHECK_ARCH -999 | |
3468 | +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) | |
3469 | +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) | |
3470 | +#define BALANCED_IRQ_MORE_DELTA (HZ/10) | |
3471 | +#define BALANCED_IRQ_LESS_DELTA (HZ) | |
3472 | + | |
3473 | +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; | |
3474 | +static int physical_balance __read_mostly; | |
3475 | +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; | |
3476 | + | |
3477 | +static struct irq_cpu_info { | |
3478 | + unsigned long * last_irq; | |
3479 | + unsigned long * irq_delta; | |
3480 | + unsigned long irq; | |
3481 | +} irq_cpu_data[NR_CPUS]; | |
3482 | + | |
3483 | +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) | |
3484 | +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) | |
3485 | +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) | |
3486 | + | |
3487 | +#define IDLE_ENOUGH(cpu,now) \ | |
3488 | + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) | |
3489 | + | |
3490 | +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) | |
3491 | + | |
3492 | +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) | |
3493 | + | |
3494 | +static cpumask_t balance_irq_affinity[NR_IRQS] = { | |
3495 | + [0 ... NR_IRQS-1] = CPU_MASK_ALL | |
3496 | +}; | |
3497 | + | |
3498 | +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) | |
3499 | +{ | |
3500 | + balance_irq_affinity[irq] = mask; | |
3501 | +} | |
3502 | + | |
3503 | +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, | |
3504 | + unsigned long now, int direction) | |
3505 | +{ | |
3506 | + int search_idle = 1; | |
3507 | + int cpu = curr_cpu; | |
3508 | + | |
3509 | + goto inside; | |
3510 | + | |
3511 | + do { | |
3512 | + if (unlikely(cpu == curr_cpu)) | |
3513 | + search_idle = 0; | |
3514 | +inside: | |
3515 | + if (direction == 1) { | |
3516 | + cpu++; | |
3517 | + if (cpu >= NR_CPUS) | |
3518 | + cpu = 0; | |
3519 | + } else { | |
3520 | + cpu--; | |
3521 | + if (cpu == -1) | |
3522 | + cpu = NR_CPUS-1; | |
3523 | + } | |
3524 | + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || | |
3525 | + (search_idle && !IDLE_ENOUGH(cpu,now))); | |
3526 | + | |
3527 | + return cpu; | |
3528 | +} | |
3529 | + | |
3530 | +static inline void balance_irq(int cpu, int irq) | |
3531 | +{ | |
3532 | + unsigned long now = jiffies; | |
3533 | + cpumask_t allowed_mask; | |
3534 | + unsigned int new_cpu; | |
3535 | + | |
3536 | + if (irqbalance_disabled) | |
3537 | + return; | |
3538 | + | |
3539 | + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); | |
3540 | + new_cpu = move(cpu, allowed_mask, now, 1); | |
3541 | + if (cpu != new_cpu) { | |
3542 | + set_pending_irq(irq, cpumask_of_cpu(new_cpu)); | |
3543 | + } | |
3544 | +} | |
3545 | + | |
3546 | +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) | |
3547 | +{ | |
3548 | + int i, j; | |
3549 | + Dprintk("Rotating IRQs among CPUs.\n"); | |
3550 | + for_each_online_cpu(i) { | |
3551 | + for (j = 0; j < NR_IRQS; j++) { | |
3552 | + if (!irq_desc[j].action) | |
3553 | + continue; | |
3554 | + /* Is it a significant load ? */ | |
3555 | + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < | |
3556 | + useful_load_threshold) | |
3557 | + continue; | |
3558 | + balance_irq(i, j); | |
3559 | + } | |
3560 | + } | |
3561 | + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | |
3562 | + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | |
3563 | + return; | |
3564 | +} | |
3565 | + | |
3566 | +static void do_irq_balance(void) | |
3567 | +{ | |
3568 | + int i, j; | |
3569 | + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); | |
3570 | + unsigned long move_this_load = 0; | |
3571 | + int max_loaded = 0, min_loaded = 0; | |
3572 | + int load; | |
3573 | + unsigned long useful_load_threshold = balanced_irq_interval + 10; | |
3574 | + int selected_irq; | |
3575 | + int tmp_loaded, first_attempt = 1; | |
3576 | + unsigned long tmp_cpu_irq; | |
3577 | + unsigned long imbalance = 0; | |
3578 | + cpumask_t allowed_mask, target_cpu_mask, tmp; | |
3579 | + | |
3580 | + for_each_possible_cpu(i) { | |
3581 | + int package_index; | |
3582 | + CPU_IRQ(i) = 0; | |
3583 | + if (!cpu_online(i)) | |
3584 | + continue; | |
3585 | + package_index = CPU_TO_PACKAGEINDEX(i); | |
3586 | + for (j = 0; j < NR_IRQS; j++) { | |
3587 | + unsigned long value_now, delta; | |
3588 | + /* Is this an active IRQ? */ | |
3589 | + if (!irq_desc[j].action) | |
3590 | + continue; | |
3591 | + if ( package_index == i ) | |
3592 | + IRQ_DELTA(package_index,j) = 0; | |
3593 | + /* Determine the total count per processor per IRQ */ | |
3594 | + value_now = (unsigned long) kstat_cpu(i).irqs[j]; | |
3595 | + | |
3596 | + /* Determine the activity per processor per IRQ */ | |
3597 | + delta = value_now - LAST_CPU_IRQ(i,j); | |
3598 | + | |
3599 | + /* Update last_cpu_irq[][] for the next time */ | |
3600 | + LAST_CPU_IRQ(i,j) = value_now; | |
3601 | + | |
3602 | + /* Ignore IRQs whose rate is less than the clock */ | |
3603 | + if (delta < useful_load_threshold) | |
3604 | + continue; | |
3605 | + /* update the load for the processor or package total */ | |
3606 | + IRQ_DELTA(package_index,j) += delta; | |
3607 | + | |
3608 | + /* Keep track of the higher numbered sibling as well */ | |
3609 | + if (i != package_index) | |
3610 | + CPU_IRQ(i) += delta; | |
3611 | + /* | |
3612 | + * We have sibling A and sibling B in the package | |
3613 | + * | |
3614 | + * cpu_irq[A] = load for cpu A + load for cpu B | |
3615 | + * cpu_irq[B] = load for cpu B | |
3616 | + */ | |
3617 | + CPU_IRQ(package_index) += delta; | |
3618 | + } | |
3619 | + } | |
3620 | + /* Find the least loaded processor package */ | |
3621 | + for_each_online_cpu(i) { | |
3622 | + if (i != CPU_TO_PACKAGEINDEX(i)) | |
3623 | + continue; | |
3624 | + if (min_cpu_irq > CPU_IRQ(i)) { | |
3625 | + min_cpu_irq = CPU_IRQ(i); | |
3626 | + min_loaded = i; | |
3627 | + } | |
3628 | + } | |
3629 | + max_cpu_irq = ULONG_MAX; | |
3630 | + | |
3631 | +tryanothercpu: | |
3632 | + /* Look for heaviest loaded processor. | |
3633 | + * We may come back to get the next heaviest loaded processor. | |
3634 | + * Skip processors with trivial loads. | |
3635 | + */ | |
3636 | + tmp_cpu_irq = 0; | |
3637 | + tmp_loaded = -1; | |
3638 | + for_each_online_cpu(i) { | |
3639 | + if (i != CPU_TO_PACKAGEINDEX(i)) | |
3640 | + continue; | |
3641 | + if (max_cpu_irq <= CPU_IRQ(i)) | |
3642 | + continue; | |
3643 | + if (tmp_cpu_irq < CPU_IRQ(i)) { | |
3644 | + tmp_cpu_irq = CPU_IRQ(i); | |
3645 | + tmp_loaded = i; | |
3646 | + } | |
3647 | + } | |
3648 | + | |
3649 | + if (tmp_loaded == -1) { | |
3650 | + /* In the case of small number of heavy interrupt sources, | |
3651 | + * loading some of the cpus too much. We use Ingo's original | |
3652 | + * approach to rotate them around. | |
3653 | + */ | |
3654 | + if (!first_attempt && imbalance >= useful_load_threshold) { | |
3655 | + rotate_irqs_among_cpus(useful_load_threshold); | |
3656 | + return; | |
3657 | + } | |
3658 | + goto not_worth_the_effort; | |
3659 | + } | |
3660 | + | |
3661 | + first_attempt = 0; /* heaviest search */ | |
3662 | + max_cpu_irq = tmp_cpu_irq; /* load */ | |
3663 | + max_loaded = tmp_loaded; /* processor */ | |
3664 | + imbalance = (max_cpu_irq - min_cpu_irq) / 2; | |
3665 | + | |
3666 | + Dprintk("max_loaded cpu = %d\n", max_loaded); | |
3667 | + Dprintk("min_loaded cpu = %d\n", min_loaded); | |
3668 | + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); | |
3669 | + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); | |
3670 | + Dprintk("load imbalance = %lu\n", imbalance); | |
3671 | + | |
3672 | + /* if imbalance is less than approx 10% of max load, then | |
3673 | + * observe diminishing returns action. - quit | |
3674 | + */ | |
3675 | + if (imbalance < (max_cpu_irq >> 3)) { | |
3676 | + Dprintk("Imbalance too trivial\n"); | |
3677 | + goto not_worth_the_effort; | |
3678 | + } | |
3679 | + | |
3680 | +tryanotherirq: | |
3681 | + /* if we select an IRQ to move that can't go where we want, then | |
3682 | + * see if there is another one to try. | |
3683 | + */ | |
3684 | + move_this_load = 0; | |
3685 | + selected_irq = -1; | |
3686 | + for (j = 0; j < NR_IRQS; j++) { | |
3687 | + /* Is this an active IRQ? */ | |
3688 | + if (!irq_desc[j].action) | |
3689 | + continue; | |
3690 | + if (imbalance <= IRQ_DELTA(max_loaded,j)) | |
3691 | + continue; | |
3692 | + /* Try to find the IRQ that is closest to the imbalance | |
3693 | + * without going over. | |
3694 | + */ | |
3695 | + if (move_this_load < IRQ_DELTA(max_loaded,j)) { | |
3696 | + move_this_load = IRQ_DELTA(max_loaded,j); | |
3697 | + selected_irq = j; | |
3698 | + } | |
3699 | + } | |
3700 | + if (selected_irq == -1) { | |
3701 | + goto tryanothercpu; | |
3702 | + } | |
3703 | + | |
3704 | + imbalance = move_this_load; | |
3705 | + | |
3706 | + /* For physical_balance case, we accumlated both load | |
3707 | + * values in the one of the siblings cpu_irq[], | |
3708 | + * to use the same code for physical and logical processors | |
3709 | + * as much as possible. | |
3710 | + * | |
3711 | + * NOTE: the cpu_irq[] array holds the sum of the load for | |
3712 | + * sibling A and sibling B in the slot for the lowest numbered | |
3713 | + * sibling (A), _AND_ the load for sibling B in the slot for | |
3714 | + * the higher numbered sibling. | |
3715 | + * | |
3716 | + * We seek the least loaded sibling by making the comparison | |
3717 | + * (A+B)/2 vs B | |
3718 | + */ | |
3719 | + load = CPU_IRQ(min_loaded) >> 1; | |
3720 | + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { | |
3721 | + if (load > CPU_IRQ(j)) { | |
3722 | + /* This won't change cpu_sibling_map[min_loaded] */ | |
3723 | + load = CPU_IRQ(j); | |
3724 | + min_loaded = j; | |
3725 | + } | |
3726 | + } | |
3727 | + | |
3728 | + cpus_and(allowed_mask, | |
3729 | + cpu_online_map, | |
3730 | + balance_irq_affinity[selected_irq]); | |
3731 | + target_cpu_mask = cpumask_of_cpu(min_loaded); | |
3732 | + cpus_and(tmp, target_cpu_mask, allowed_mask); | |
3733 | + | |
3734 | + if (!cpus_empty(tmp)) { | |
3735 | + | |
3736 | + Dprintk("irq = %d moved to cpu = %d\n", | |
3737 | + selected_irq, min_loaded); | |
3738 | + /* mark for change destination */ | |
3739 | + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); | |
3740 | + | |
3741 | + /* Since we made a change, come back sooner to | |
3742 | + * check for more variation. | |
3743 | + */ | |
3744 | + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | |
3745 | + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | |
3746 | + return; | |
3747 | + } | |
3748 | + goto tryanotherirq; | |
3749 | + | |
3750 | +not_worth_the_effort: | |
3751 | + /* | |
3752 | + * if we did not find an IRQ to move, then adjust the time interval | |
3753 | + * upward | |
3754 | + */ | |
3755 | + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, | |
3756 | + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); | |
3757 | + Dprintk("IRQ worth rotating not found\n"); | |
3758 | + return; | |
3759 | +} | |
3760 | + | |
3761 | +static int balanced_irq(void *unused) | |
3762 | +{ | |
3763 | + int i; | |
3764 | + unsigned long prev_balance_time = jiffies; | |
3765 | + long time_remaining = balanced_irq_interval; | |
3766 | + | |
3767 | + daemonize("kirqd"); | |
3768 | + | |
3769 | + /* push everything to CPU 0 to give us a starting point. */ | |
3770 | + for (i = 0 ; i < NR_IRQS ; i++) { | |
3771 | + irq_desc[i].pending_mask = cpumask_of_cpu(0); | |
3772 | + set_pending_irq(i, cpumask_of_cpu(0)); | |
3773 | + } | |
3774 | + | |
3775 | + for ( ; ; ) { | |
3776 | + time_remaining = schedule_timeout_interruptible(time_remaining); | |
3777 | + try_to_freeze(); | |
3778 | + if (time_after(jiffies, | |
3779 | + prev_balance_time+balanced_irq_interval)) { | |
3780 | + preempt_disable(); | |
3781 | + do_irq_balance(); | |
3782 | + prev_balance_time = jiffies; | |
3783 | + time_remaining = balanced_irq_interval; | |
3784 | + preempt_enable(); | |
3785 | + } | |
3786 | + } | |
3787 | + return 0; | |
3788 | +} | |
3789 | + | |
3790 | +static int __init balanced_irq_init(void) | |
3791 | +{ | |
3792 | + int i; | |
3793 | + struct cpuinfo_x86 *c; | |
3794 | + cpumask_t tmp; | |
3795 | + | |
3796 | + cpus_shift_right(tmp, cpu_online_map, 2); | |
3797 | + c = &boot_cpu_data; | |
3798 | + /* When not overwritten by the command line ask subarchitecture. */ | |
3799 | + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) | |
3800 | + irqbalance_disabled = NO_BALANCE_IRQ; | |
3801 | + if (irqbalance_disabled) | |
3802 | + return 0; | |
3803 | + | |
3804 | + /* disable irqbalance completely if there is only one processor online */ | |
3805 | + if (num_online_cpus() < 2) { | |
3806 | + irqbalance_disabled = 1; | |
3807 | + return 0; | |
3808 | + } | |
3809 | + /* | |
3810 | + * Enable physical balance only if more than 1 physical processor | |
3811 | + * is present | |
3812 | + */ | |
3813 | + if (smp_num_siblings > 1 && !cpus_empty(tmp)) | |
3814 | + physical_balance = 1; | |
3815 | + | |
3816 | + for_each_online_cpu(i) { | |
3817 | + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | |
3818 | + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | |
3819 | + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { | |
3820 | + printk(KERN_ERR "balanced_irq_init: out of memory"); | |
3821 | + goto failed; | |
3822 | + } | |
3823 | + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); | |
3824 | + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); | |
3825 | + } | |
3826 | + | |
3827 | + printk(KERN_INFO "Starting balanced_irq\n"); | |
3828 | + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) | |
3829 | + return 0; | |
3830 | + else | |
3831 | + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); | |
3832 | +failed: | |
3833 | + for_each_possible_cpu(i) { | |
3834 | + kfree(irq_cpu_data[i].irq_delta); | |
3835 | + irq_cpu_data[i].irq_delta = NULL; | |
3836 | + kfree(irq_cpu_data[i].last_irq); | |
3837 | + irq_cpu_data[i].last_irq = NULL; | |
3838 | + } | |
3839 | + return 0; | |
3840 | +} | |
3841 | + | |
3842 | +int __init irqbalance_disable(char *str) | |
3843 | +{ | |
3844 | + irqbalance_disabled = 1; | |
3845 | + return 1; | |
3846 | +} | |
3847 | + | |
3848 | +__setup("noirqbalance", irqbalance_disable); | |
3849 | + | |
3850 | +late_initcall(balanced_irq_init); | |
3851 | +#endif /* CONFIG_IRQBALANCE */ | |
3852 | +#endif /* CONFIG_SMP */ | |
3853 | +#endif | |
3854 | + | |
3855 | +#ifndef CONFIG_SMP | |
3856 | +void fastcall send_IPI_self(int vector) | |
3857 | +{ | |
3858 | +#ifndef CONFIG_XEN | |
3859 | + unsigned int cfg; | |
3860 | + | |
3861 | + /* | |
3862 | + * Wait for idle. | |
3863 | + */ | |
3864 | + apic_wait_icr_idle(); | |
3865 | + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; | |
3866 | + /* | |
3867 | + * Send the IPI. The write to APIC_ICR fires this off. | |
3868 | + */ | |
3869 | + apic_write_around(APIC_ICR, cfg); | |
3870 | +#endif | |
3871 | +} | |
3872 | +#endif /* !CONFIG_SMP */ | |
3873 | + | |
3874 | + | |
3875 | +/* | |
3876 | + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | |
3877 | + * specific CPU-side IRQs. | |
3878 | + */ | |
3879 | + | |
3880 | +#define MAX_PIRQS 8 | |
3881 | +static int pirq_entries [MAX_PIRQS]; | |
3882 | +static int pirqs_enabled; | |
3883 | +int skip_ioapic_setup; | |
3884 | + | |
3885 | +static int __init ioapic_setup(char *str) | |
3886 | +{ | |
3887 | + skip_ioapic_setup = 1; | |
3888 | + return 1; | |
3889 | +} | |
3890 | + | |
3891 | +__setup("noapic", ioapic_setup); | |
3892 | + | |
3893 | +static int __init ioapic_pirq_setup(char *str) | |
3894 | +{ | |
3895 | + int i, max; | |
3896 | + int ints[MAX_PIRQS+1]; | |
3897 | + | |
3898 | + get_options(str, ARRAY_SIZE(ints), ints); | |
3899 | + | |
3900 | + for (i = 0; i < MAX_PIRQS; i++) | |
3901 | + pirq_entries[i] = -1; | |
3902 | + | |
3903 | + pirqs_enabled = 1; | |
3904 | + apic_printk(APIC_VERBOSE, KERN_INFO | |
3905 | + "PIRQ redirection, working around broken MP-BIOS.\n"); | |
3906 | + max = MAX_PIRQS; | |
3907 | + if (ints[0] < MAX_PIRQS) | |
3908 | + max = ints[0]; | |
3909 | + | |
3910 | + for (i = 0; i < max; i++) { | |
3911 | + apic_printk(APIC_VERBOSE, KERN_DEBUG | |
3912 | + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | |
3913 | + /* | |
3914 | + * PIRQs are mapped upside down, usually. | |
3915 | + */ | |
3916 | + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | |
3917 | + } | |
3918 | + return 1; | |
3919 | +} | |
3920 | + | |
3921 | +__setup("pirq=", ioapic_pirq_setup); | |
3922 | + | |
3923 | +/* | |
3924 | + * Find the IRQ entry number of a certain pin. | |
3925 | + */ | |
3926 | +static int find_irq_entry(int apic, int pin, int type) | |
3927 | +{ | |
3928 | + int i; | |
3929 | + | |
3930 | + for (i = 0; i < mp_irq_entries; i++) | |
3931 | + if (mp_irqs[i].mpc_irqtype == type && | |
3932 | + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | |
3933 | + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | |
3934 | + mp_irqs[i].mpc_dstirq == pin) | |
3935 | + return i; | |
3936 | + | |
3937 | + return -1; | |
3938 | +} | |
3939 | + | |
3940 | +/* | |
3941 | + * Find the pin to which IRQ[irq] (ISA) is connected | |
3942 | + */ | |
3943 | +static int __init find_isa_irq_pin(int irq, int type) | |
3944 | +{ | |
3945 | + int i; | |
3946 | + | |
3947 | + for (i = 0; i < mp_irq_entries; i++) { | |
3948 | + int lbus = mp_irqs[i].mpc_srcbus; | |
3949 | + | |
3950 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | |
3951 | + mp_bus_id_to_type[lbus] == MP_BUS_EISA || | |
3952 | + mp_bus_id_to_type[lbus] == MP_BUS_MCA || | |
3953 | + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 | |
3954 | + ) && | |
3955 | + (mp_irqs[i].mpc_irqtype == type) && | |
3956 | + (mp_irqs[i].mpc_srcbusirq == irq)) | |
3957 | + | |
3958 | + return mp_irqs[i].mpc_dstirq; | |
3959 | + } | |
3960 | + return -1; | |
3961 | +} | |
3962 | + | |
3963 | +static int __init find_isa_irq_apic(int irq, int type) | |
3964 | +{ | |
3965 | + int i; | |
3966 | + | |
3967 | + for (i = 0; i < mp_irq_entries; i++) { | |
3968 | + int lbus = mp_irqs[i].mpc_srcbus; | |
3969 | + | |
3970 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | |
3971 | + mp_bus_id_to_type[lbus] == MP_BUS_EISA || | |
3972 | + mp_bus_id_to_type[lbus] == MP_BUS_MCA || | |
3973 | + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 | |
3974 | + ) && | |
3975 | + (mp_irqs[i].mpc_irqtype == type) && | |
3976 | + (mp_irqs[i].mpc_srcbusirq == irq)) | |
3977 | + break; | |
3978 | + } | |
3979 | + if (i < mp_irq_entries) { | |
3980 | + int apic; | |
3981 | + for(apic = 0; apic < nr_ioapics; apic++) { | |
3982 | + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | |
3983 | + return apic; | |
3984 | + } | |
3985 | + } | |
3986 | + | |
3987 | + return -1; | |
3988 | +} | |
3989 | + | |
3990 | +/* | |
3991 | + * Find a specific PCI IRQ entry. | |
3992 | + * Not an __init, possibly needed by modules | |
3993 | + */ | |
3994 | +static int pin_2_irq(int idx, int apic, int pin); | |
3995 | + | |
3996 | +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |
3997 | +{ | |
3998 | + int apic, i, best_guess = -1; | |
3999 | + | |
4000 | + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " | |
4001 | + "slot:%d, pin:%d.\n", bus, slot, pin); | |
4002 | + if (mp_bus_id_to_pci_bus[bus] == -1) { | |
4003 | + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | |
4004 | + return -1; | |
4005 | + } | |
4006 | + for (i = 0; i < mp_irq_entries; i++) { | |
4007 | + int lbus = mp_irqs[i].mpc_srcbus; | |
4008 | + | |
4009 | + for (apic = 0; apic < nr_ioapics; apic++) | |
4010 | + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | |
4011 | + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | |
4012 | + break; | |
4013 | + | |
4014 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && | |
4015 | + !mp_irqs[i].mpc_irqtype && | |
4016 | + (bus == lbus) && | |
4017 | + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | |
4018 | + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | |
4019 | + | |
4020 | + if (!(apic || IO_APIC_IRQ(irq))) | |
4021 | + continue; | |
4022 | + | |
4023 | + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | |
4024 | + return irq; | |
4025 | + /* | |
4026 | + * Use the first all-but-pin matching entry as a | |
4027 | + * best-guess fuzzy result for broken mptables. | |
4028 | + */ | |
4029 | + if (best_guess < 0) | |
4030 | + best_guess = irq; | |
4031 | + } | |
4032 | + } | |
4033 | + return best_guess; | |
4034 | +} | |
4035 | +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | |
4036 | + | |
4037 | +/* | |
4038 | + * This function currently is only a helper for the i386 smp boot process where | |
4039 | + * we need to reprogram the ioredtbls to cater for the cpus which have come online | |
4040 | + * so mask in all cases should simply be TARGET_CPUS | |
4041 | + */ | |
4042 | +#ifdef CONFIG_SMP | |
4043 | +#ifndef CONFIG_XEN | |
4044 | +void __init setup_ioapic_dest(void) | |
4045 | +{ | |
4046 | + int pin, ioapic, irq, irq_entry; | |
4047 | + | |
4048 | + if (skip_ioapic_setup == 1) | |
4049 | + return; | |
4050 | + | |
4051 | + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | |
4052 | + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | |
4053 | + irq_entry = find_irq_entry(ioapic, pin, mp_INT); | |
4054 | + if (irq_entry == -1) | |
4055 | + continue; | |
4056 | + irq = pin_2_irq(irq_entry, ioapic, pin); | |
4057 | + set_ioapic_affinity_irq(irq, TARGET_CPUS); | |
4058 | + } | |
4059 | + | |
4060 | + } | |
4061 | +} | |
4062 | +#endif /* !CONFIG_XEN */ | |
4063 | +#endif | |
4064 | + | |
4065 | +/* | |
4066 | + * EISA Edge/Level control register, ELCR | |
4067 | + */ | |
4068 | +static int EISA_ELCR(unsigned int irq) | |
4069 | +{ | |
4070 | + if (irq < 16) { | |
4071 | + unsigned int port = 0x4d0 + (irq >> 3); | |
4072 | + return (inb(port) >> (irq & 7)) & 1; | |
4073 | + } | |
4074 | + apic_printk(APIC_VERBOSE, KERN_INFO | |
4075 | + "Broken MPtable reports ISA irq %d\n", irq); | |
4076 | + return 0; | |
4077 | +} | |
4078 | + | |
4079 | +/* EISA interrupts are always polarity zero and can be edge or level | |
4080 | + * trigger depending on the ELCR value. If an interrupt is listed as | |
4081 | + * EISA conforming in the MP table, that means its trigger type must | |
4082 | + * be read in from the ELCR */ | |
4083 | + | |
4084 | +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) | |
4085 | +#define default_EISA_polarity(idx) (0) | |
4086 | + | |
4087 | +/* ISA interrupts are always polarity zero edge triggered, | |
4088 | + * when listed as conforming in the MP table. */ | |
4089 | + | |
4090 | +#define default_ISA_trigger(idx) (0) | |
4091 | +#define default_ISA_polarity(idx) (0) | |
4092 | + | |
4093 | +/* PCI interrupts are always polarity one level triggered, | |
4094 | + * when listed as conforming in the MP table. */ | |
4095 | + | |
4096 | +#define default_PCI_trigger(idx) (1) | |
4097 | +#define default_PCI_polarity(idx) (1) | |
4098 | + | |
4099 | +/* MCA interrupts are always polarity zero level triggered, | |
4100 | + * when listed as conforming in the MP table. */ | |
4101 | + | |
4102 | +#define default_MCA_trigger(idx) (1) | |
4103 | +#define default_MCA_polarity(idx) (0) | |
4104 | + | |
4105 | +/* NEC98 interrupts are always polarity zero edge triggered, | |
4106 | + * when listed as conforming in the MP table. */ | |
4107 | + | |
4108 | +#define default_NEC98_trigger(idx) (0) | |
4109 | +#define default_NEC98_polarity(idx) (0) | |
4110 | + | |
4111 | +static int __init MPBIOS_polarity(int idx) | |
4112 | +{ | |
4113 | + int bus = mp_irqs[idx].mpc_srcbus; | |
4114 | + int polarity; | |
4115 | + | |
4116 | + /* | |
4117 | + * Determine IRQ line polarity (high active or low active): | |
4118 | + */ | |
4119 | + switch (mp_irqs[idx].mpc_irqflag & 3) | |
4120 | + { | |
4121 | + case 0: /* conforms, ie. bus-type dependent polarity */ | |
4122 | + { | |
4123 | + switch (mp_bus_id_to_type[bus]) | |
4124 | + { | |
4125 | + case MP_BUS_ISA: /* ISA pin */ | |
4126 | + { | |
4127 | + polarity = default_ISA_polarity(idx); | |
4128 | + break; | |
4129 | + } | |
4130 | + case MP_BUS_EISA: /* EISA pin */ | |
4131 | + { | |
4132 | + polarity = default_EISA_polarity(idx); | |
4133 | + break; | |
4134 | + } | |
4135 | + case MP_BUS_PCI: /* PCI pin */ | |
4136 | + { | |
4137 | + polarity = default_PCI_polarity(idx); | |
4138 | + break; | |
4139 | + } | |
4140 | + case MP_BUS_MCA: /* MCA pin */ | |
4141 | + { | |
4142 | + polarity = default_MCA_polarity(idx); | |
4143 | + break; | |
4144 | + } | |
4145 | + case MP_BUS_NEC98: /* NEC 98 pin */ | |
4146 | + { | |
4147 | + polarity = default_NEC98_polarity(idx); | |
4148 | + break; | |
4149 | + } | |
4150 | + default: | |
4151 | + { | |
4152 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4153 | + polarity = 1; | |
4154 | + break; | |
4155 | + } | |
4156 | + } | |
4157 | + break; | |
4158 | + } | |
4159 | + case 1: /* high active */ | |
4160 | + { | |
4161 | + polarity = 0; | |
4162 | + break; | |
4163 | + } | |
4164 | + case 2: /* reserved */ | |
4165 | + { | |
4166 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4167 | + polarity = 1; | |
4168 | + break; | |
4169 | + } | |
4170 | + case 3: /* low active */ | |
4171 | + { | |
4172 | + polarity = 1; | |
4173 | + break; | |
4174 | + } | |
4175 | + default: /* invalid */ | |
4176 | + { | |
4177 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4178 | + polarity = 1; | |
4179 | + break; | |
4180 | + } | |
4181 | + } | |
4182 | + return polarity; | |
4183 | +} | |
4184 | + | |
4185 | +static int MPBIOS_trigger(int idx) | |
4186 | +{ | |
4187 | + int bus = mp_irqs[idx].mpc_srcbus; | |
4188 | + int trigger; | |
4189 | + | |
4190 | + /* | |
4191 | + * Determine IRQ trigger mode (edge or level sensitive): | |
4192 | + */ | |
4193 | + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | |
4194 | + { | |
4195 | + case 0: /* conforms, ie. bus-type dependent */ | |
4196 | + { | |
4197 | + switch (mp_bus_id_to_type[bus]) | |
4198 | + { | |
4199 | + case MP_BUS_ISA: /* ISA pin */ | |
4200 | + { | |
4201 | + trigger = default_ISA_trigger(idx); | |
4202 | + break; | |
4203 | + } | |
4204 | + case MP_BUS_EISA: /* EISA pin */ | |
4205 | + { | |
4206 | + trigger = default_EISA_trigger(idx); | |
4207 | + break; | |
4208 | + } | |
4209 | + case MP_BUS_PCI: /* PCI pin */ | |
4210 | + { | |
4211 | + trigger = default_PCI_trigger(idx); | |
4212 | + break; | |
4213 | + } | |
4214 | + case MP_BUS_MCA: /* MCA pin */ | |
4215 | + { | |
4216 | + trigger = default_MCA_trigger(idx); | |
4217 | + break; | |
4218 | + } | |
4219 | + case MP_BUS_NEC98: /* NEC 98 pin */ | |
4220 | + { | |
4221 | + trigger = default_NEC98_trigger(idx); | |
4222 | + break; | |
4223 | + } | |
4224 | + default: | |
4225 | + { | |
4226 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4227 | + trigger = 1; | |
4228 | + break; | |
4229 | + } | |
4230 | + } | |
4231 | + break; | |
4232 | + } | |
4233 | + case 1: /* edge */ | |
4234 | + { | |
4235 | + trigger = 0; | |
4236 | + break; | |
4237 | + } | |
4238 | + case 2: /* reserved */ | |
4239 | + { | |
4240 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4241 | + trigger = 1; | |
4242 | + break; | |
4243 | + } | |
4244 | + case 3: /* level */ | |
4245 | + { | |
4246 | + trigger = 1; | |
4247 | + break; | |
4248 | + } | |
4249 | + default: /* invalid */ | |
4250 | + { | |
4251 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
4252 | + trigger = 0; | |
4253 | + break; | |
4254 | + } | |
4255 | + } | |
4256 | + return trigger; | |
4257 | +} | |
4258 | + | |
4259 | +static inline int irq_polarity(int idx) | |
4260 | +{ | |
4261 | + return MPBIOS_polarity(idx); | |
4262 | +} | |
4263 | + | |
4264 | +static inline int irq_trigger(int idx) | |
4265 | +{ | |
4266 | + return MPBIOS_trigger(idx); | |
4267 | +} | |
4268 | + | |
4269 | +static int pin_2_irq(int idx, int apic, int pin) | |
4270 | +{ | |
4271 | + int irq, i; | |
4272 | + int bus = mp_irqs[idx].mpc_srcbus; | |
4273 | + | |
4274 | + /* | |
4275 | + * Debugging check, we are in big trouble if this message pops up! | |
4276 | + */ | |
4277 | + if (mp_irqs[idx].mpc_dstirq != pin) | |
4278 | + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | |
4279 | + | |
4280 | + switch (mp_bus_id_to_type[bus]) | |
4281 | + { | |
4282 | + case MP_BUS_ISA: /* ISA pin */ | |
4283 | + case MP_BUS_EISA: | |
4284 | + case MP_BUS_MCA: | |
4285 | + case MP_BUS_NEC98: | |
4286 | + { | |
4287 | + irq = mp_irqs[idx].mpc_srcbusirq; | |
4288 | + break; | |
4289 | + } | |
4290 | + case MP_BUS_PCI: /* PCI pin */ | |
4291 | + { | |
4292 | + /* | |
4293 | + * PCI IRQs are mapped in order | |
4294 | + */ | |
4295 | + i = irq = 0; | |
4296 | + while (i < apic) | |
4297 | + irq += nr_ioapic_registers[i++]; | |
4298 | + irq += pin; | |
4299 | + | |
4300 | + /* | |
4301 | + * For MPS mode, so far only needed by ES7000 platform | |
4302 | + */ | |
4303 | + if (ioapic_renumber_irq) | |
4304 | + irq = ioapic_renumber_irq(apic, irq); | |
4305 | + | |
4306 | + break; | |
4307 | + } | |
4308 | + default: | |
4309 | + { | |
4310 | + printk(KERN_ERR "unknown bus type %d.\n",bus); | |
4311 | + irq = 0; | |
4312 | + break; | |
4313 | + } | |
4314 | + } | |
4315 | + | |
4316 | + /* | |
4317 | + * PCI IRQ command line redirection. Yes, limits are hardcoded. | |
4318 | + */ | |
4319 | + if ((pin >= 16) && (pin <= 23)) { | |
4320 | + if (pirq_entries[pin-16] != -1) { | |
4321 | + if (!pirq_entries[pin-16]) { | |
4322 | + apic_printk(APIC_VERBOSE, KERN_DEBUG | |
4323 | + "disabling PIRQ%d\n", pin-16); | |
4324 | + } else { | |
4325 | + irq = pirq_entries[pin-16]; | |
4326 | + apic_printk(APIC_VERBOSE, KERN_DEBUG | |
4327 | + "using PIRQ%d -> IRQ %d\n", | |
4328 | + pin-16, irq); | |
4329 | + } | |
4330 | + } | |
4331 | + } | |
4332 | + return irq; | |
4333 | +} | |
4334 | + | |
4335 | +static inline int IO_APIC_irq_trigger(int irq) | |
4336 | +{ | |
4337 | + int apic, idx, pin; | |
4338 | + | |
4339 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
4340 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
4341 | + idx = find_irq_entry(apic,pin,mp_INT); | |
4342 | + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | |
4343 | + return irq_trigger(idx); | |
4344 | + } | |
4345 | + } | |
4346 | + /* | |
4347 | + * nonexistent IRQs are edge default | |
4348 | + */ | |
4349 | + return 0; | |
4350 | +} | |
4351 | + | |
4352 | +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ | |
4353 | +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ | |
4354 | + | |
4355 | +int assign_irq_vector(int irq) | |
4356 | +{ | |
4357 | + unsigned long flags; | |
4358 | + int vector; | |
4359 | + struct physdev_irq irq_op; | |
4360 | + | |
4361 | + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); | |
4362 | + | |
4363 | + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS) | |
4364 | + return -EINVAL; | |
4365 | + | |
4366 | + spin_lock_irqsave(&vector_lock, flags); | |
4367 | + | |
4368 | + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { | |
4369 | + spin_unlock_irqrestore(&vector_lock, flags); | |
4370 | + return IO_APIC_VECTOR(irq); | |
4371 | + } | |
4372 | + | |
4373 | + irq_op.irq = irq; | |
4374 | + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { | |
4375 | + spin_unlock_irqrestore(&vector_lock, flags); | |
4376 | + return -ENOSPC; | |
4377 | + } | |
4378 | + | |
4379 | + vector = irq_op.vector; | |
4380 | + vector_irq[vector] = irq; | |
4381 | + if (irq != AUTO_ASSIGN) | |
4382 | + IO_APIC_VECTOR(irq) = vector; | |
4383 | + | |
4384 | + spin_unlock_irqrestore(&vector_lock, flags); | |
4385 | + | |
4386 | + return vector; | |
4387 | +} | |
4388 | + | |
4389 | +#ifndef CONFIG_XEN | |
4390 | +static struct hw_interrupt_type ioapic_level_type; | |
4391 | +static struct hw_interrupt_type ioapic_edge_type; | |
4392 | + | |
4393 | +#define IOAPIC_AUTO -1 | |
4394 | +#define IOAPIC_EDGE 0 | |
4395 | +#define IOAPIC_LEVEL 1 | |
4396 | + | |
4397 | +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) | |
4398 | +{ | |
4399 | + unsigned idx; | |
4400 | + | |
4401 | + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; | |
4402 | + | |
4403 | + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | |
4404 | + trigger == IOAPIC_LEVEL) | |
4405 | + irq_desc[idx].chip = &ioapic_level_type; | |
4406 | + else | |
4407 | + irq_desc[idx].chip = &ioapic_edge_type; | |
4408 | + set_intr_gate(vector, interrupt[idx]); | |
4409 | +} | |
4410 | +#else | |
4411 | +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) | |
4412 | +#endif | |
4413 | + | |
4414 | +static void __init setup_IO_APIC_irqs(void) | |
4415 | +{ | |
4416 | + struct IO_APIC_route_entry entry; | |
4417 | + int apic, pin, idx, irq, first_notcon = 1, vector; | |
4418 | + unsigned long flags; | |
4419 | + | |
4420 | + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | |
4421 | + | |
4422 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
4423 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
4424 | + | |
4425 | + /* | |
4426 | + * add it to the IO-APIC irq-routing table: | |
4427 | + */ | |
4428 | + memset(&entry,0,sizeof(entry)); | |
4429 | + | |
4430 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
4431 | + entry.dest_mode = INT_DEST_MODE; | |
4432 | + entry.mask = 0; /* enable IRQ */ | |
4433 | + entry.dest.logical.logical_dest = | |
4434 | + cpu_mask_to_apicid(TARGET_CPUS); | |
4435 | + | |
4436 | + idx = find_irq_entry(apic,pin,mp_INT); | |
4437 | + if (idx == -1) { | |
4438 | + if (first_notcon) { | |
4439 | + apic_printk(APIC_VERBOSE, KERN_DEBUG | |
4440 | + " IO-APIC (apicid-pin) %d-%d", | |
4441 | + mp_ioapics[apic].mpc_apicid, | |
4442 | + pin); | |
4443 | + first_notcon = 0; | |
4444 | + } else | |
4445 | + apic_printk(APIC_VERBOSE, ", %d-%d", | |
4446 | + mp_ioapics[apic].mpc_apicid, pin); | |
4447 | + continue; | |
4448 | + } | |
4449 | + | |
4450 | + entry.trigger = irq_trigger(idx); | |
4451 | + entry.polarity = irq_polarity(idx); | |
4452 | + | |
4453 | + if (irq_trigger(idx)) { | |
4454 | + entry.trigger = 1; | |
4455 | + entry.mask = 1; | |
4456 | + } | |
4457 | + | |
4458 | + irq = pin_2_irq(idx, apic, pin); | |
4459 | + /* | |
4460 | + * skip adding the timer int on secondary nodes, which causes | |
4461 | + * a small but painful rift in the time-space continuum | |
4462 | + */ | |
4463 | + if (multi_timer_check(apic, irq)) | |
4464 | + continue; | |
4465 | + else | |
4466 | + add_pin_to_irq(irq, apic, pin); | |
4467 | + | |
4468 | + if (/*!apic &&*/ !IO_APIC_IRQ(irq)) | |
4469 | + continue; | |
4470 | + | |
4471 | + if (IO_APIC_IRQ(irq)) { | |
4472 | + vector = assign_irq_vector(irq); | |
4473 | + entry.vector = vector; | |
4474 | + ioapic_register_intr(irq, vector, IOAPIC_AUTO); | |
4475 | + | |
4476 | + if (!apic && (irq < 16)) | |
4477 | + disable_8259A_irq(irq); | |
4478 | + } | |
4479 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4480 | + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | |
4481 | + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | |
4482 | + set_native_irq_info(irq, TARGET_CPUS); | |
4483 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4484 | + } | |
4485 | + } | |
4486 | + | |
4487 | + if (!first_notcon) | |
4488 | + apic_printk(APIC_VERBOSE, " not connected.\n"); | |
4489 | +} | |
4490 | + | |
4491 | +/* | |
4492 | + * Set up the 8259A-master output pin: | |
4493 | + */ | |
4494 | +#ifndef CONFIG_XEN | |
4495 | +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | |
4496 | +{ | |
4497 | + struct IO_APIC_route_entry entry; | |
4498 | + unsigned long flags; | |
4499 | + | |
4500 | + memset(&entry,0,sizeof(entry)); | |
4501 | + | |
4502 | + disable_8259A_irq(0); | |
4503 | + | |
4504 | + /* mask LVT0 */ | |
4505 | + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | |
4506 | + | |
4507 | + /* | |
4508 | + * We use logical delivery to get the timer IRQ | |
4509 | + * to the first CPU. | |
4510 | + */ | |
4511 | + entry.dest_mode = INT_DEST_MODE; | |
4512 | + entry.mask = 0; /* unmask IRQ now */ | |
4513 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
4514 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
4515 | + entry.polarity = 0; | |
4516 | + entry.trigger = 0; | |
4517 | + entry.vector = vector; | |
4518 | + | |
4519 | + /* | |
4520 | + * The timer IRQ doesn't have to know that behind the | |
4521 | + * scene we have a 8259A-master in AEOI mode ... | |
4522 | + */ | |
4523 | + irq_desc[0].chip = &ioapic_edge_type; | |
4524 | + | |
4525 | + /* | |
4526 | + * Add it to the IO-APIC irq-routing table: | |
4527 | + */ | |
4528 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4529 | + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | |
4530 | + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | |
4531 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4532 | + | |
4533 | + enable_8259A_irq(0); | |
4534 | +} | |
4535 | + | |
4536 | +static inline void UNEXPECTED_IO_APIC(void) | |
4537 | +{ | |
4538 | +} | |
4539 | + | |
4540 | +void __init print_IO_APIC(void) | |
4541 | +{ | |
4542 | + int apic, i; | |
4543 | + union IO_APIC_reg_00 reg_00; | |
4544 | + union IO_APIC_reg_01 reg_01; | |
4545 | + union IO_APIC_reg_02 reg_02; | |
4546 | + union IO_APIC_reg_03 reg_03; | |
4547 | + unsigned long flags; | |
4548 | + | |
4549 | + if (apic_verbosity == APIC_QUIET) | |
4550 | + return; | |
4551 | + | |
4552 | + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | |
4553 | + for (i = 0; i < nr_ioapics; i++) | |
4554 | + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | |
4555 | + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | |
4556 | + | |
4557 | + /* | |
4558 | + * We are a bit conservative about what we expect. We have to | |
4559 | + * know about every hardware change ASAP. | |
4560 | + */ | |
4561 | + printk(KERN_INFO "testing the IO APIC.......................\n"); | |
4562 | + | |
4563 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
4564 | + | |
4565 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4566 | + reg_00.raw = io_apic_read(apic, 0); | |
4567 | + reg_01.raw = io_apic_read(apic, 1); | |
4568 | + if (reg_01.bits.version >= 0x10) | |
4569 | + reg_02.raw = io_apic_read(apic, 2); | |
4570 | + if (reg_01.bits.version >= 0x20) | |
4571 | + reg_03.raw = io_apic_read(apic, 3); | |
4572 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4573 | + | |
4574 | + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | |
4575 | + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | |
4576 | + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | |
4577 | + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); | |
4578 | + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); | |
4579 | + if (reg_00.bits.ID >= get_physical_broadcast()) | |
4580 | + UNEXPECTED_IO_APIC(); | |
4581 | + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) | |
4582 | + UNEXPECTED_IO_APIC(); | |
4583 | + | |
4584 | + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); | |
4585 | + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | |
4586 | + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ | |
4587 | + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ | |
4588 | + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ | |
4589 | + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ | |
4590 | + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ | |
4591 | + (reg_01.bits.entries != 0x2E) && | |
4592 | + (reg_01.bits.entries != 0x3F) | |
4593 | + ) | |
4594 | + UNEXPECTED_IO_APIC(); | |
4595 | + | |
4596 | + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | |
4597 | + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | |
4598 | + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ | |
4599 | + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ | |
4600 | + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ | |
4601 | + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ | |
4602 | + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ | |
4603 | + ) | |
4604 | + UNEXPECTED_IO_APIC(); | |
4605 | + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) | |
4606 | + UNEXPECTED_IO_APIC(); | |
4607 | + | |
4608 | + /* | |
4609 | + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, | |
4610 | + * but the value of reg_02 is read as the previous read register | |
4611 | + * value, so ignore it if reg_02 == reg_01. | |
4612 | + */ | |
4613 | + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { | |
4614 | + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | |
4615 | + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | |
4616 | + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) | |
4617 | + UNEXPECTED_IO_APIC(); | |
4618 | + } | |
4619 | + | |
4620 | + /* | |
4621 | + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 | |
4622 | + * or reg_03, but the value of reg_0[23] is read as the previous read | |
4623 | + * register value, so ignore it if reg_03 == reg_0[12]. | |
4624 | + */ | |
4625 | + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && | |
4626 | + reg_03.raw != reg_01.raw) { | |
4627 | + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); | |
4628 | + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); | |
4629 | + if (reg_03.bits.__reserved_1) | |
4630 | + UNEXPECTED_IO_APIC(); | |
4631 | + } | |
4632 | + | |
4633 | + printk(KERN_DEBUG ".... IRQ redirection table:\n"); | |
4634 | + | |
4635 | + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | |
4636 | + " Stat Dest Deli Vect: \n"); | |
4637 | + | |
4638 | + for (i = 0; i <= reg_01.bits.entries; i++) { | |
4639 | + struct IO_APIC_route_entry entry; | |
4640 | + | |
4641 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4642 | + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); | |
4643 | + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); | |
4644 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4645 | + | |
4646 | + printk(KERN_DEBUG " %02x %03X %02X ", | |
4647 | + i, | |
4648 | + entry.dest.logical.logical_dest, | |
4649 | + entry.dest.physical.physical_dest | |
4650 | + ); | |
4651 | + | |
4652 | + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | |
4653 | + entry.mask, | |
4654 | + entry.trigger, | |
4655 | + entry.irr, | |
4656 | + entry.polarity, | |
4657 | + entry.delivery_status, | |
4658 | + entry.dest_mode, | |
4659 | + entry.delivery_mode, | |
4660 | + entry.vector | |
4661 | + ); | |
4662 | + } | |
4663 | + } | |
4664 | + if (use_pci_vector()) | |
4665 | + printk(KERN_INFO "Using vector-based indexing\n"); | |
4666 | + printk(KERN_DEBUG "IRQ to pin mappings:\n"); | |
4667 | + for (i = 0; i < NR_IRQS; i++) { | |
4668 | + struct irq_pin_list *entry = irq_2_pin + i; | |
4669 | + if (entry->pin < 0) | |
4670 | + continue; | |
4671 | + if (use_pci_vector() && !platform_legacy_irq(i)) | |
4672 | + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); | |
4673 | + else | |
4674 | + printk(KERN_DEBUG "IRQ%d ", i); | |
4675 | + for (;;) { | |
4676 | + printk("-> %d:%d", entry->apic, entry->pin); | |
4677 | + if (!entry->next) | |
4678 | + break; | |
4679 | + entry = irq_2_pin + entry->next; | |
4680 | + } | |
4681 | + printk("\n"); | |
4682 | + } | |
4683 | + | |
4684 | + printk(KERN_INFO ".................................... done.\n"); | |
4685 | + | |
4686 | + return; | |
4687 | +} | |
4688 | + | |
4689 | +static void print_APIC_bitfield (int base) | |
4690 | +{ | |
4691 | + unsigned int v; | |
4692 | + int i, j; | |
4693 | + | |
4694 | + if (apic_verbosity == APIC_QUIET) | |
4695 | + return; | |
4696 | + | |
4697 | + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | |
4698 | + for (i = 0; i < 8; i++) { | |
4699 | + v = apic_read(base + i*0x10); | |
4700 | + for (j = 0; j < 32; j++) { | |
4701 | + if (v & (1<<j)) | |
4702 | + printk("1"); | |
4703 | + else | |
4704 | + printk("0"); | |
4705 | + } | |
4706 | + printk("\n"); | |
4707 | + } | |
4708 | +} | |
4709 | + | |
4710 | +void /*__init*/ print_local_APIC(void * dummy) | |
4711 | +{ | |
4712 | + unsigned int v, ver, maxlvt; | |
4713 | + | |
4714 | + if (apic_verbosity == APIC_QUIET) | |
4715 | + return; | |
4716 | + | |
4717 | + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | |
4718 | + smp_processor_id(), hard_smp_processor_id()); | |
4719 | + v = apic_read(APIC_ID); | |
4720 | + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | |
4721 | + v = apic_read(APIC_LVR); | |
4722 | + printk(KERN_INFO "... APIC VERSION: %08x\n", v); | |
4723 | + ver = GET_APIC_VERSION(v); | |
4724 | + maxlvt = get_maxlvt(); | |
4725 | + | |
4726 | + v = apic_read(APIC_TASKPRI); | |
4727 | + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | |
4728 | + | |
4729 | + if (APIC_INTEGRATED(ver)) { /* !82489DX */ | |
4730 | + v = apic_read(APIC_ARBPRI); | |
4731 | + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | |
4732 | + v & APIC_ARBPRI_MASK); | |
4733 | + v = apic_read(APIC_PROCPRI); | |
4734 | + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | |
4735 | + } | |
4736 | + | |
4737 | + v = apic_read(APIC_EOI); | |
4738 | + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | |
4739 | + v = apic_read(APIC_RRR); | |
4740 | + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | |
4741 | + v = apic_read(APIC_LDR); | |
4742 | + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | |
4743 | + v = apic_read(APIC_DFR); | |
4744 | + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | |
4745 | + v = apic_read(APIC_SPIV); | |
4746 | + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | |
4747 | + | |
4748 | + printk(KERN_DEBUG "... APIC ISR field:\n"); | |
4749 | + print_APIC_bitfield(APIC_ISR); | |
4750 | + printk(KERN_DEBUG "... APIC TMR field:\n"); | |
4751 | + print_APIC_bitfield(APIC_TMR); | |
4752 | + printk(KERN_DEBUG "... APIC IRR field:\n"); | |
4753 | + print_APIC_bitfield(APIC_IRR); | |
4754 | + | |
4755 | + if (APIC_INTEGRATED(ver)) { /* !82489DX */ | |
4756 | + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | |
4757 | + apic_write(APIC_ESR, 0); | |
4758 | + v = apic_read(APIC_ESR); | |
4759 | + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | |
4760 | + } | |
4761 | + | |
4762 | + v = apic_read(APIC_ICR); | |
4763 | + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | |
4764 | + v = apic_read(APIC_ICR2); | |
4765 | + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | |
4766 | + | |
4767 | + v = apic_read(APIC_LVTT); | |
4768 | + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | |
4769 | + | |
4770 | + if (maxlvt > 3) { /* PC is LVT#4. */ | |
4771 | + v = apic_read(APIC_LVTPC); | |
4772 | + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | |
4773 | + } | |
4774 | + v = apic_read(APIC_LVT0); | |
4775 | + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | |
4776 | + v = apic_read(APIC_LVT1); | |
4777 | + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | |
4778 | + | |
4779 | + if (maxlvt > 2) { /* ERR is LVT#3. */ | |
4780 | + v = apic_read(APIC_LVTERR); | |
4781 | + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | |
4782 | + } | |
4783 | + | |
4784 | + v = apic_read(APIC_TMICT); | |
4785 | + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | |
4786 | + v = apic_read(APIC_TMCCT); | |
4787 | + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | |
4788 | + v = apic_read(APIC_TDCR); | |
4789 | + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | |
4790 | + printk("\n"); | |
4791 | +} | |
4792 | + | |
4793 | +void print_all_local_APICs (void) | |
4794 | +{ | |
4795 | + on_each_cpu(print_local_APIC, NULL, 1, 1); | |
4796 | +} | |
4797 | + | |
4798 | +void /*__init*/ print_PIC(void) | |
4799 | +{ | |
4800 | + unsigned int v; | |
4801 | + unsigned long flags; | |
4802 | + | |
4803 | + if (apic_verbosity == APIC_QUIET) | |
4804 | + return; | |
4805 | + | |
4806 | + printk(KERN_DEBUG "\nprinting PIC contents\n"); | |
4807 | + | |
4808 | + spin_lock_irqsave(&i8259A_lock, flags); | |
4809 | + | |
4810 | + v = inb(0xa1) << 8 | inb(0x21); | |
4811 | + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | |
4812 | + | |
4813 | + v = inb(0xa0) << 8 | inb(0x20); | |
4814 | + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | |
4815 | + | |
4816 | + outb(0x0b,0xa0); | |
4817 | + outb(0x0b,0x20); | |
4818 | + v = inb(0xa0) << 8 | inb(0x20); | |
4819 | + outb(0x0a,0xa0); | |
4820 | + outb(0x0a,0x20); | |
4821 | + | |
4822 | + spin_unlock_irqrestore(&i8259A_lock, flags); | |
4823 | + | |
4824 | + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | |
4825 | + | |
4826 | + v = inb(0x4d1) << 8 | inb(0x4d0); | |
4827 | + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | |
4828 | +} | |
4829 | +#endif /* !CONFIG_XEN */ | |
4830 | + | |
4831 | +static void __init enable_IO_APIC(void) | |
4832 | +{ | |
4833 | + union IO_APIC_reg_01 reg_01; | |
4834 | + int i8259_apic, i8259_pin; | |
4835 | + int i, apic; | |
4836 | + unsigned long flags; | |
4837 | + | |
4838 | + for (i = 0; i < PIN_MAP_SIZE; i++) { | |
4839 | + irq_2_pin[i].pin = -1; | |
4840 | + irq_2_pin[i].next = 0; | |
4841 | + } | |
4842 | + if (!pirqs_enabled) | |
4843 | + for (i = 0; i < MAX_PIRQS; i++) | |
4844 | + pirq_entries[i] = -1; | |
4845 | + | |
4846 | + /* | |
4847 | + * The number of IO-APIC IRQ registers (== #pins): | |
4848 | + */ | |
4849 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
4850 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4851 | + reg_01.raw = io_apic_read(apic, 1); | |
4852 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4853 | + nr_ioapic_registers[apic] = reg_01.bits.entries+1; | |
4854 | + } | |
4855 | + for(apic = 0; apic < nr_ioapics; apic++) { | |
4856 | + int pin; | |
4857 | + /* See if any of the pins is in ExtINT mode */ | |
4858 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
4859 | + struct IO_APIC_route_entry entry; | |
4860 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4861 | + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
4862 | + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
4863 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4864 | + | |
4865 | + | |
4866 | + /* If the interrupt line is enabled and in ExtInt mode | |
4867 | + * I have found the pin where the i8259 is connected. | |
4868 | + */ | |
4869 | + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { | |
4870 | + ioapic_i8259.apic = apic; | |
4871 | + ioapic_i8259.pin = pin; | |
4872 | + goto found_i8259; | |
4873 | + } | |
4874 | + } | |
4875 | + } | |
4876 | + found_i8259: | |
4877 | + /* Look to see what if the MP table has reported the ExtINT */ | |
4878 | + /* If we could not find the appropriate pin by looking at the ioapic | |
4879 | + * the i8259 probably is not connected the ioapic but give the | |
4880 | + * mptable a chance anyway. | |
4881 | + */ | |
4882 | + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | |
4883 | + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | |
4884 | + /* Trust the MP table if nothing is setup in the hardware */ | |
4885 | + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { | |
4886 | + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); | |
4887 | + ioapic_i8259.pin = i8259_pin; | |
4888 | + ioapic_i8259.apic = i8259_apic; | |
4889 | + } | |
4890 | + /* Complain if the MP table and the hardware disagree */ | |
4891 | + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && | |
4892 | + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) | |
4893 | + { | |
4894 | + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); | |
4895 | + } | |
4896 | + | |
4897 | + /* | |
4898 | + * Do not trust the IO-APIC being empty at bootup | |
4899 | + */ | |
4900 | + clear_IO_APIC(); | |
4901 | +} | |
4902 | + | |
4903 | +/* | |
4904 | + * Not an __init, needed by the reboot code | |
4905 | + */ | |
4906 | +void disable_IO_APIC(void) | |
4907 | +{ | |
4908 | + /* | |
4909 | + * Clear the IO-APIC before rebooting: | |
4910 | + */ | |
4911 | + clear_IO_APIC(); | |
4912 | + | |
4913 | +#ifndef CONFIG_XEN | |
4914 | + /* | |
4915 | + * If the i8259 is routed through an IOAPIC | |
4916 | + * Put that IOAPIC in virtual wire mode | |
4917 | + * so legacy interrupts can be delivered. | |
4918 | + */ | |
4919 | + if (ioapic_i8259.pin != -1) { | |
4920 | + struct IO_APIC_route_entry entry; | |
4921 | + unsigned long flags; | |
4922 | + | |
4923 | + memset(&entry, 0, sizeof(entry)); | |
4924 | + entry.mask = 0; /* Enabled */ | |
4925 | + entry.trigger = 0; /* Edge */ | |
4926 | + entry.irr = 0; | |
4927 | + entry.polarity = 0; /* High */ | |
4928 | + entry.delivery_status = 0; | |
4929 | + entry.dest_mode = 0; /* Physical */ | |
4930 | + entry.delivery_mode = dest_ExtINT; /* ExtInt */ | |
4931 | + entry.vector = 0; | |
4932 | + entry.dest.physical.physical_dest = | |
4933 | + GET_APIC_ID(apic_read(APIC_ID)); | |
4934 | + | |
4935 | + /* | |
4936 | + * Add it to the IO-APIC irq-routing table: | |
4937 | + */ | |
4938 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4939 | + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, | |
4940 | + *(((int *)&entry)+1)); | |
4941 | + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, | |
4942 | + *(((int *)&entry)+0)); | |
4943 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4944 | + } | |
4945 | + disconnect_bsp_APIC(ioapic_i8259.pin != -1); | |
4946 | +#endif | |
4947 | +} | |
4948 | + | |
4949 | +/* | |
4950 | + * function to set the IO-APIC physical IDs based on the | |
4951 | + * values stored in the MPC table. | |
4952 | + * | |
4953 | + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | |
4954 | + */ | |
4955 | + | |
4956 | +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) | |
4957 | +static void __init setup_ioapic_ids_from_mpc(void) | |
4958 | +{ | |
4959 | + union IO_APIC_reg_00 reg_00; | |
4960 | + physid_mask_t phys_id_present_map; | |
4961 | + int apic; | |
4962 | + int i; | |
4963 | + unsigned char old_id; | |
4964 | + unsigned long flags; | |
4965 | + | |
4966 | + /* | |
4967 | + * Don't check I/O APIC IDs for xAPIC systems. They have | |
4968 | + * no meaning without the serial APIC bus. | |
4969 | + */ | |
4970 | + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | |
4971 | + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | |
4972 | + return; | |
4973 | + /* | |
4974 | + * This is broken; anything with a real cpu count has to | |
4975 | + * circumvent this idiocy regardless. | |
4976 | + */ | |
4977 | + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); | |
4978 | + | |
4979 | + /* | |
4980 | + * Set the IOAPIC ID to the value stored in the MPC table. | |
4981 | + */ | |
4982 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
4983 | + | |
4984 | + /* Read the register 0 value */ | |
4985 | + spin_lock_irqsave(&ioapic_lock, flags); | |
4986 | + reg_00.raw = io_apic_read(apic, 0); | |
4987 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
4988 | + | |
4989 | + old_id = mp_ioapics[apic].mpc_apicid; | |
4990 | + | |
4991 | + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { | |
4992 | + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", | |
4993 | + apic, mp_ioapics[apic].mpc_apicid); | |
4994 | + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | |
4995 | + reg_00.bits.ID); | |
4996 | + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; | |
4997 | + } | |
4998 | + | |
4999 | + /* | |
5000 | + * Sanity check, is the ID really free? Every APIC in a | |
5001 | + * system must have a unique ID or we get lots of nice | |
5002 | + * 'stuck on smp_invalidate_needed IPI wait' messages. | |
5003 | + */ | |
5004 | + if (check_apicid_used(phys_id_present_map, | |
5005 | + mp_ioapics[apic].mpc_apicid)) { | |
5006 | + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", | |
5007 | + apic, mp_ioapics[apic].mpc_apicid); | |
5008 | + for (i = 0; i < get_physical_broadcast(); i++) | |
5009 | + if (!physid_isset(i, phys_id_present_map)) | |
5010 | + break; | |
5011 | + if (i >= get_physical_broadcast()) | |
5012 | + panic("Max APIC ID exceeded!\n"); | |
5013 | + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | |
5014 | + i); | |
5015 | + physid_set(i, phys_id_present_map); | |
5016 | + mp_ioapics[apic].mpc_apicid = i; | |
5017 | + } else { | |
5018 | + physid_mask_t tmp; | |
5019 | + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); | |
5020 | + apic_printk(APIC_VERBOSE, "Setting %d in the " | |
5021 | + "phys_id_present_map\n", | |
5022 | + mp_ioapics[apic].mpc_apicid); | |
5023 | + physids_or(phys_id_present_map, phys_id_present_map, tmp); | |
5024 | + } | |
5025 | + | |
5026 | + | |
5027 | + /* | |
5028 | + * We need to adjust the IRQ routing table | |
5029 | + * if the ID changed. | |
5030 | + */ | |
5031 | + if (old_id != mp_ioapics[apic].mpc_apicid) | |
5032 | + for (i = 0; i < mp_irq_entries; i++) | |
5033 | + if (mp_irqs[i].mpc_dstapic == old_id) | |
5034 | + mp_irqs[i].mpc_dstapic | |
5035 | + = mp_ioapics[apic].mpc_apicid; | |
5036 | + | |
5037 | + /* | |
5038 | + * Read the right value from the MPC table and | |
5039 | + * write it into the ID register. | |
5040 | + */ | |
5041 | + apic_printk(APIC_VERBOSE, KERN_INFO | |
5042 | + "...changing IO-APIC physical APIC ID to %d ...", | |
5043 | + mp_ioapics[apic].mpc_apicid); | |
5044 | + | |
5045 | + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | |
5046 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5047 | + io_apic_write(apic, 0, reg_00.raw); | |
5048 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5049 | + | |
5050 | + /* | |
5051 | + * Sanity check | |
5052 | + */ | |
5053 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5054 | + reg_00.raw = io_apic_read(apic, 0); | |
5055 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5056 | + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | |
5057 | + printk("could not set ID!\n"); | |
5058 | + else | |
5059 | + apic_printk(APIC_VERBOSE, " ok.\n"); | |
5060 | + } | |
5061 | +} | |
5062 | +#else | |
5063 | +static void __init setup_ioapic_ids_from_mpc(void) { } | |
5064 | +#endif | |
5065 | + | |
5066 | +#ifndef CONFIG_XEN | |
5067 | +/* | |
5068 | + * There is a nasty bug in some older SMP boards, their mptable lies | |
5069 | + * about the timer IRQ. We do the following to work around the situation: | |
5070 | + * | |
5071 | + * - timer IRQ defaults to IO-APIC IRQ | |
5072 | + * - if this function detects that timer IRQs are defunct, then we fall | |
5073 | + * back to ISA timer IRQs | |
5074 | + */ | |
5075 | +static int __init timer_irq_works(void) | |
5076 | +{ | |
5077 | + unsigned long t1 = jiffies; | |
5078 | + | |
5079 | + local_irq_enable(); | |
5080 | + /* Let ten ticks pass... */ | |
5081 | + mdelay((10 * 1000) / HZ); | |
5082 | + | |
5083 | + /* | |
5084 | + * Expect a few ticks at least, to be sure some possible | |
5085 | + * glue logic does not lock up after one or two first | |
5086 | + * ticks in a non-ExtINT mode. Also the local APIC | |
5087 | + * might have cached one ExtINT interrupt. Finally, at | |
5088 | + * least one tick may be lost due to delays. | |
5089 | + */ | |
5090 | + if (jiffies - t1 > 4) | |
5091 | + return 1; | |
5092 | + | |
5093 | + return 0; | |
5094 | +} | |
5095 | + | |
5096 | +/* | |
5097 | + * In the SMP+IOAPIC case it might happen that there are an unspecified | |
5098 | + * number of pending IRQ events unhandled. These cases are very rare, | |
5099 | + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | |
5100 | + * better to do it this way as thus we do not have to be aware of | |
5101 | + * 'pending' interrupts in the IRQ path, except at this point. | |
5102 | + */ | |
5103 | +/* | |
5104 | + * Edge triggered needs to resend any interrupt | |
5105 | + * that was delayed but this is now handled in the device | |
5106 | + * independent code. | |
5107 | + */ | |
5108 | + | |
5109 | +/* | |
5110 | + * Starting up a edge-triggered IO-APIC interrupt is | |
5111 | + * nasty - we need to make sure that we get the edge. | |
5112 | + * If it is already asserted for some reason, we need | |
5113 | + * return 1 to indicate that is was pending. | |
5114 | + * | |
5115 | + * This is not complete - we should be able to fake | |
5116 | + * an edge even if it isn't on the 8259A... | |
5117 | + */ | |
5118 | +static unsigned int startup_edge_ioapic_irq(unsigned int irq) | |
5119 | +{ | |
5120 | + int was_pending = 0; | |
5121 | + unsigned long flags; | |
5122 | + | |
5123 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5124 | + if (irq < 16) { | |
5125 | + disable_8259A_irq(irq); | |
5126 | + if (i8259A_irq_pending(irq)) | |
5127 | + was_pending = 1; | |
5128 | + } | |
5129 | + __unmask_IO_APIC_irq(irq); | |
5130 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5131 | + | |
5132 | + return was_pending; | |
5133 | +} | |
5134 | + | |
5135 | +/* | |
5136 | + * Once we have recorded IRQ_PENDING already, we can mask the | |
5137 | + * interrupt for real. This prevents IRQ storms from unhandled | |
5138 | + * devices. | |
5139 | + */ | |
5140 | +static void ack_edge_ioapic_irq(unsigned int irq) | |
5141 | +{ | |
5142 | + move_irq(irq); | |
5143 | + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) | |
5144 | + == (IRQ_PENDING | IRQ_DISABLED)) | |
5145 | + mask_IO_APIC_irq(irq); | |
5146 | + ack_APIC_irq(); | |
5147 | +} | |
5148 | + | |
5149 | +/* | |
5150 | + * Level triggered interrupts can just be masked, | |
5151 | + * and shutting down and starting up the interrupt | |
5152 | + * is the same as enabling and disabling them -- except | |
5153 | + * with a startup need to return a "was pending" value. | |
5154 | + * | |
5155 | + * Level triggered interrupts are special because we | |
5156 | + * do not touch any IO-APIC register while handling | |
5157 | + * them. We ack the APIC in the end-IRQ handler, not | |
5158 | + * in the start-IRQ-handler. Protection against reentrance | |
5159 | + * from the same interrupt is still provided, both by the | |
5160 | + * generic IRQ layer and by the fact that an unacked local | |
5161 | + * APIC does not accept IRQs. | |
5162 | + */ | |
5163 | +static unsigned int startup_level_ioapic_irq (unsigned int irq) | |
5164 | +{ | |
5165 | + unmask_IO_APIC_irq(irq); | |
5166 | + | |
5167 | + return 0; /* don't check for pending */ | |
5168 | +} | |
5169 | + | |
5170 | +static void end_level_ioapic_irq (unsigned int irq) | |
5171 | +{ | |
5172 | + unsigned long v; | |
5173 | + int i; | |
5174 | + | |
5175 | + move_irq(irq); | |
5176 | +/* | |
5177 | + * It appears there is an erratum which affects at least version 0x11 | |
5178 | + * of I/O APIC (that's the 82093AA and cores integrated into various | |
5179 | + * chipsets). Under certain conditions a level-triggered interrupt is | |
5180 | + * erroneously delivered as edge-triggered one but the respective IRR | |
5181 | + * bit gets set nevertheless. As a result the I/O unit expects an EOI | |
5182 | + * message but it will never arrive and further interrupts are blocked | |
5183 | + * from the source. The exact reason is so far unknown, but the | |
5184 | + * phenomenon was observed when two consecutive interrupt requests | |
5185 | + * from a given source get delivered to the same CPU and the source is | |
5186 | + * temporarily disabled in between. | |
5187 | + * | |
5188 | + * A workaround is to simulate an EOI message manually. We achieve it | |
5189 | + * by setting the trigger mode to edge and then to level when the edge | |
5190 | + * trigger mode gets detected in the TMR of a local APIC for a | |
5191 | + * level-triggered interrupt. We mask the source for the time of the | |
5192 | + * operation to prevent an edge-triggered interrupt escaping meanwhile. | |
5193 | + * The idea is from Manfred Spraul. --macro | |
5194 | + */ | |
5195 | + i = IO_APIC_VECTOR(irq); | |
5196 | + | |
5197 | + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); | |
5198 | + | |
5199 | + ack_APIC_irq(); | |
5200 | + | |
5201 | + if (!(v & (1 << (i & 0x1f)))) { | |
5202 | + atomic_inc(&irq_mis_count); | |
5203 | + spin_lock(&ioapic_lock); | |
5204 | + __mask_and_edge_IO_APIC_irq(irq); | |
5205 | + __unmask_and_level_IO_APIC_irq(irq); | |
5206 | + spin_unlock(&ioapic_lock); | |
5207 | + } | |
5208 | +} | |
5209 | + | |
5210 | +#ifdef CONFIG_PCI_MSI | |
5211 | +static unsigned int startup_edge_ioapic_vector(unsigned int vector) | |
5212 | +{ | |
5213 | + int irq = vector_to_irq(vector); | |
5214 | + | |
5215 | + return startup_edge_ioapic_irq(irq); | |
5216 | +} | |
5217 | + | |
5218 | +static void ack_edge_ioapic_vector(unsigned int vector) | |
5219 | +{ | |
5220 | + int irq = vector_to_irq(vector); | |
5221 | + | |
5222 | + move_native_irq(vector); | |
5223 | + ack_edge_ioapic_irq(irq); | |
5224 | +} | |
5225 | + | |
5226 | +static unsigned int startup_level_ioapic_vector (unsigned int vector) | |
5227 | +{ | |
5228 | + int irq = vector_to_irq(vector); | |
5229 | + | |
5230 | + return startup_level_ioapic_irq (irq); | |
5231 | +} | |
5232 | + | |
5233 | +static void end_level_ioapic_vector (unsigned int vector) | |
5234 | +{ | |
5235 | + int irq = vector_to_irq(vector); | |
5236 | + | |
5237 | + move_native_irq(vector); | |
5238 | + end_level_ioapic_irq(irq); | |
5239 | +} | |
5240 | + | |
5241 | +static void mask_IO_APIC_vector (unsigned int vector) | |
5242 | +{ | |
5243 | + int irq = vector_to_irq(vector); | |
5244 | + | |
5245 | + mask_IO_APIC_irq(irq); | |
5246 | +} | |
5247 | + | |
5248 | +static void unmask_IO_APIC_vector (unsigned int vector) | |
5249 | +{ | |
5250 | + int irq = vector_to_irq(vector); | |
5251 | + | |
5252 | + unmask_IO_APIC_irq(irq); | |
5253 | +} | |
5254 | + | |
5255 | +#ifdef CONFIG_SMP | |
5256 | +static void set_ioapic_affinity_vector (unsigned int vector, | |
5257 | + cpumask_t cpu_mask) | |
5258 | +{ | |
5259 | + int irq = vector_to_irq(vector); | |
5260 | + | |
5261 | + set_native_irq_info(vector, cpu_mask); | |
5262 | + set_ioapic_affinity_irq(irq, cpu_mask); | |
5263 | +} | |
5264 | +#endif | |
5265 | +#endif | |
5266 | + | |
5267 | +static int ioapic_retrigger(unsigned int irq) | |
5268 | +{ | |
5269 | + send_IPI_self(IO_APIC_VECTOR(irq)); | |
5270 | + | |
5271 | + return 1; | |
5272 | +} | |
5273 | + | |
5274 | +/* | |
5275 | + * Level and edge triggered IO-APIC interrupts need different handling, | |
5276 | + * so we use two separate IRQ descriptors. Edge triggered IRQs can be | |
5277 | + * handled with the level-triggered descriptor, but that one has slightly | |
5278 | + * more overhead. Level-triggered interrupts cannot be handled with the | |
5279 | + * edge-triggered handler, without risking IRQ storms and other ugly | |
5280 | + * races. | |
5281 | + */ | |
5282 | +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { | |
5283 | + .typename = "IO-APIC-edge", | |
5284 | + .startup = startup_edge_ioapic, | |
5285 | + .shutdown = shutdown_edge_ioapic, | |
5286 | + .enable = enable_edge_ioapic, | |
5287 | + .disable = disable_edge_ioapic, | |
5288 | + .ack = ack_edge_ioapic, | |
5289 | + .end = end_edge_ioapic, | |
5290 | +#ifdef CONFIG_SMP | |
5291 | + .set_affinity = set_ioapic_affinity, | |
5292 | +#endif | |
5293 | + .retrigger = ioapic_retrigger, | |
5294 | +}; | |
5295 | + | |
5296 | +static struct hw_interrupt_type ioapic_level_type __read_mostly = { | |
5297 | + .typename = "IO-APIC-level", | |
5298 | + .startup = startup_level_ioapic, | |
5299 | + .shutdown = shutdown_level_ioapic, | |
5300 | + .enable = enable_level_ioapic, | |
5301 | + .disable = disable_level_ioapic, | |
5302 | + .ack = mask_and_ack_level_ioapic, | |
5303 | + .end = end_level_ioapic, | |
5304 | +#ifdef CONFIG_SMP | |
5305 | + .set_affinity = set_ioapic_affinity, | |
5306 | +#endif | |
5307 | + .retrigger = ioapic_retrigger, | |
5308 | +}; | |
5309 | +#endif /* !CONFIG_XEN */ | |
5310 | + | |
5311 | +static inline void init_IO_APIC_traps(void) | |
5312 | +{ | |
5313 | + int irq; | |
5314 | + | |
5315 | + /* | |
5316 | + * NOTE! The local APIC isn't very good at handling | |
5317 | + * multiple interrupts at the same interrupt level. | |
5318 | + * As the interrupt level is determined by taking the | |
5319 | + * vector number and shifting that right by 4, we | |
5320 | + * want to spread these out a bit so that they don't | |
5321 | + * all fall in the same interrupt level. | |
5322 | + * | |
5323 | + * Also, we've got to be careful not to trash gate | |
5324 | + * 0x80, because int 0x80 is hm, kind of importantish. ;) | |
5325 | + */ | |
5326 | + for (irq = 0; irq < NR_IRQS ; irq++) { | |
5327 | + int tmp = irq; | |
5328 | + if (use_pci_vector()) { | |
5329 | + if (!platform_legacy_irq(tmp)) | |
5330 | + if ((tmp = vector_to_irq(tmp)) == -1) | |
5331 | + continue; | |
5332 | + } | |
5333 | + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { | |
5334 | + /* | |
5335 | + * Hmm.. We don't have an entry for this, | |
5336 | + * so default to an old-fashioned 8259 | |
5337 | + * interrupt if we can.. | |
5338 | + */ | |
5339 | + if (irq < 16) | |
5340 | + make_8259A_irq(irq); | |
5341 | +#ifndef CONFIG_XEN | |
5342 | + else | |
5343 | + /* Strange. Oh, well.. */ | |
5344 | + irq_desc[irq].chip = &no_irq_type; | |
5345 | +#endif | |
5346 | + } | |
5347 | + } | |
5348 | +} | |
5349 | + | |
5350 | +#ifndef CONFIG_XEN | |
5351 | +static void enable_lapic_irq (unsigned int irq) | |
5352 | +{ | |
5353 | + unsigned long v; | |
5354 | + | |
5355 | + v = apic_read(APIC_LVT0); | |
5356 | + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); | |
5357 | +} | |
5358 | + | |
5359 | +static void disable_lapic_irq (unsigned int irq) | |
5360 | +{ | |
5361 | + unsigned long v; | |
5362 | + | |
5363 | + v = apic_read(APIC_LVT0); | |
5364 | + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | |
5365 | +} | |
5366 | + | |
5367 | +static void ack_lapic_irq (unsigned int irq) | |
5368 | +{ | |
5369 | + ack_APIC_irq(); | |
5370 | +} | |
5371 | + | |
5372 | +static void end_lapic_irq (unsigned int i) { /* nothing */ } | |
5373 | + | |
5374 | +static struct hw_interrupt_type lapic_irq_type __read_mostly = { | |
5375 | + .typename = "local-APIC-edge", | |
5376 | + .startup = NULL, /* startup_irq() not used for IRQ0 */ | |
5377 | + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | |
5378 | + .enable = enable_lapic_irq, | |
5379 | + .disable = disable_lapic_irq, | |
5380 | + .ack = ack_lapic_irq, | |
5381 | + .end = end_lapic_irq | |
5382 | +}; | |
5383 | + | |
5384 | +static void setup_nmi (void) | |
5385 | +{ | |
5386 | + /* | |
5387 | + * Dirty trick to enable the NMI watchdog ... | |
5388 | + * We put the 8259A master into AEOI mode and | |
5389 | + * unmask on all local APICs LVT0 as NMI. | |
5390 | + * | |
5391 | + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | |
5392 | + * is from Maciej W. Rozycki - so we do not have to EOI from | |
5393 | + * the NMI handler or the timer interrupt. | |
5394 | + */ | |
5395 | + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | |
5396 | + | |
5397 | + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); | |
5398 | + | |
5399 | + apic_printk(APIC_VERBOSE, " done.\n"); | |
5400 | +} | |
5401 | + | |
5402 | +/* | |
5403 | + * This looks a bit hackish but it's about the only one way of sending | |
5404 | + * a few INTA cycles to 8259As and any associated glue logic. ICR does | |
5405 | + * not support the ExtINT mode, unfortunately. We need to send these | |
5406 | + * cycles as some i82489DX-based boards have glue logic that keeps the | |
5407 | + * 8259A interrupt line asserted until INTA. --macro | |
5408 | + */ | |
5409 | +static inline void unlock_ExtINT_logic(void) | |
5410 | +{ | |
5411 | + int apic, pin, i; | |
5412 | + struct IO_APIC_route_entry entry0, entry1; | |
5413 | + unsigned char save_control, save_freq_select; | |
5414 | + unsigned long flags; | |
5415 | + | |
5416 | + pin = find_isa_irq_pin(8, mp_INT); | |
5417 | + apic = find_isa_irq_apic(8, mp_INT); | |
5418 | + if (pin == -1) | |
5419 | + return; | |
5420 | + | |
5421 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5422 | + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
5423 | + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
5424 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5425 | + clear_IO_APIC_pin(apic, pin); | |
5426 | + | |
5427 | + memset(&entry1, 0, sizeof(entry1)); | |
5428 | + | |
5429 | + entry1.dest_mode = 0; /* physical delivery */ | |
5430 | + entry1.mask = 0; /* unmask IRQ now */ | |
5431 | + entry1.dest.physical.physical_dest = hard_smp_processor_id(); | |
5432 | + entry1.delivery_mode = dest_ExtINT; | |
5433 | + entry1.polarity = entry0.polarity; | |
5434 | + entry1.trigger = 0; | |
5435 | + entry1.vector = 0; | |
5436 | + | |
5437 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5438 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); | |
5439 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); | |
5440 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5441 | + | |
5442 | + save_control = CMOS_READ(RTC_CONTROL); | |
5443 | + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | |
5444 | + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | |
5445 | + RTC_FREQ_SELECT); | |
5446 | + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | |
5447 | + | |
5448 | + i = 100; | |
5449 | + while (i-- > 0) { | |
5450 | + mdelay(10); | |
5451 | + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | |
5452 | + i -= 10; | |
5453 | + } | |
5454 | + | |
5455 | + CMOS_WRITE(save_control, RTC_CONTROL); | |
5456 | + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | |
5457 | + clear_IO_APIC_pin(apic, pin); | |
5458 | + | |
5459 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5460 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); | |
5461 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); | |
5462 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5463 | +} | |
5464 | + | |
5465 | +int timer_uses_ioapic_pin_0; | |
5466 | + | |
5467 | +/* | |
5468 | + * This code may look a bit paranoid, but it's supposed to cooperate with | |
5469 | + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | |
5470 | + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | |
5471 | + * fanatically on his truly buggy board. | |
5472 | + */ | |
5473 | +static inline void check_timer(void) | |
5474 | +{ | |
5475 | + int apic1, pin1, apic2, pin2; | |
5476 | + int vector; | |
5477 | + | |
5478 | + /* | |
5479 | + * get/set the timer IRQ vector: | |
5480 | + */ | |
5481 | + disable_8259A_irq(0); | |
5482 | + vector = assign_irq_vector(0); | |
5483 | + set_intr_gate(vector, interrupt[0]); | |
5484 | + | |
5485 | + /* | |
5486 | + * Subtle, code in do_timer_interrupt() expects an AEOI | |
5487 | + * mode for the 8259A whenever interrupts are routed | |
5488 | + * through I/O APICs. Also IRQ0 has to be enabled in | |
5489 | + * the 8259A which implies the virtual wire has to be | |
5490 | + * disabled in the local APIC. | |
5491 | + */ | |
5492 | + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | |
5493 | + init_8259A(1); | |
5494 | + timer_ack = 1; | |
5495 | + if (timer_over_8254 > 0) | |
5496 | + enable_8259A_irq(0); | |
5497 | + | |
5498 | + pin1 = find_isa_irq_pin(0, mp_INT); | |
5499 | + apic1 = find_isa_irq_apic(0, mp_INT); | |
5500 | + pin2 = ioapic_i8259.pin; | |
5501 | + apic2 = ioapic_i8259.apic; | |
5502 | + | |
5503 | + if (pin1 == 0) | |
5504 | + timer_uses_ioapic_pin_0 = 1; | |
5505 | + | |
5506 | + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | |
5507 | + vector, apic1, pin1, apic2, pin2); | |
5508 | + | |
5509 | + if (pin1 != -1) { | |
5510 | + /* | |
5511 | + * Ok, does IRQ0 through the IOAPIC work? | |
5512 | + */ | |
5513 | + unmask_IO_APIC_irq(0); | |
5514 | + if (timer_irq_works()) { | |
5515 | + if (nmi_watchdog == NMI_IO_APIC) { | |
5516 | + disable_8259A_irq(0); | |
5517 | + setup_nmi(); | |
5518 | + enable_8259A_irq(0); | |
5519 | + } | |
5520 | + if (disable_timer_pin_1 > 0) | |
5521 | + clear_IO_APIC_pin(0, pin1); | |
5522 | + return; | |
5523 | + } | |
5524 | + clear_IO_APIC_pin(apic1, pin1); | |
5525 | + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " | |
5526 | + "IO-APIC\n"); | |
5527 | + } | |
5528 | + | |
5529 | + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); | |
5530 | + if (pin2 != -1) { | |
5531 | + printk("\n..... (found pin %d) ...", pin2); | |
5532 | + /* | |
5533 | + * legacy devices should be connected to IO APIC #0 | |
5534 | + */ | |
5535 | + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); | |
5536 | + if (timer_irq_works()) { | |
5537 | + printk("works.\n"); | |
5538 | + if (pin1 != -1) | |
5539 | + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); | |
5540 | + else | |
5541 | + add_pin_to_irq(0, apic2, pin2); | |
5542 | + if (nmi_watchdog == NMI_IO_APIC) { | |
5543 | + setup_nmi(); | |
5544 | + } | |
5545 | + return; | |
5546 | + } | |
5547 | + /* | |
5548 | + * Cleanup, just in case ... | |
5549 | + */ | |
5550 | + clear_IO_APIC_pin(apic2, pin2); | |
5551 | + } | |
5552 | + printk(" failed.\n"); | |
5553 | + | |
5554 | + if (nmi_watchdog == NMI_IO_APIC) { | |
5555 | + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | |
5556 | + nmi_watchdog = 0; | |
5557 | + } | |
5558 | + | |
5559 | + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | |
5560 | + | |
5561 | + disable_8259A_irq(0); | |
5562 | + irq_desc[0].chip = &lapic_irq_type; | |
5563 | + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | |
5564 | + enable_8259A_irq(0); | |
5565 | + | |
5566 | + if (timer_irq_works()) { | |
5567 | + printk(" works.\n"); | |
5568 | + return; | |
5569 | + } | |
5570 | + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | |
5571 | + printk(" failed.\n"); | |
5572 | + | |
5573 | + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | |
5574 | + | |
5575 | + timer_ack = 0; | |
5576 | + init_8259A(0); | |
5577 | + make_8259A_irq(0); | |
5578 | + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | |
5579 | + | |
5580 | + unlock_ExtINT_logic(); | |
5581 | + | |
5582 | + if (timer_irq_works()) { | |
5583 | + printk(" works.\n"); | |
5584 | + return; | |
5585 | + } | |
5586 | + printk(" failed :(.\n"); | |
5587 | + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " | |
5588 | + "report. Then try booting with the 'noapic' option"); | |
5589 | +} | |
5590 | +#else | |
5591 | +int timer_uses_ioapic_pin_0 = 0; | |
5592 | +#define check_timer() ((void)0) | |
5593 | +#endif | |
5594 | + | |
5595 | +/* | |
5596 | + * | |
5597 | + * IRQ's that are handled by the PIC in the MPS IOAPIC case. | |
5598 | + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. | |
5599 | + * Linux doesn't really care, as it's not actually used | |
5600 | + * for any interrupt handling anyway. | |
5601 | + */ | |
5602 | +#define PIC_IRQS (1 << PIC_CASCADE_IR) | |
5603 | + | |
5604 | +void __init setup_IO_APIC(void) | |
5605 | +{ | |
5606 | + enable_IO_APIC(); | |
5607 | + | |
5608 | + if (acpi_ioapic) | |
5609 | + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | |
5610 | + else | |
5611 | + io_apic_irqs = ~PIC_IRQS; | |
5612 | + | |
5613 | + printk("ENABLING IO-APIC IRQs\n"); | |
5614 | + | |
5615 | + /* | |
5616 | + * Set up IO-APIC IRQ routing. | |
5617 | + */ | |
5618 | + if (!acpi_ioapic) | |
5619 | + setup_ioapic_ids_from_mpc(); | |
5620 | +#ifndef CONFIG_XEN | |
5621 | + sync_Arb_IDs(); | |
5622 | +#endif | |
5623 | + setup_IO_APIC_irqs(); | |
5624 | + init_IO_APIC_traps(); | |
5625 | + check_timer(); | |
5626 | + if (!acpi_ioapic) | |
5627 | + print_IO_APIC(); | |
5628 | +} | |
5629 | + | |
5630 | +static int __init setup_disable_8254_timer(char *s) | |
5631 | +{ | |
5632 | + timer_over_8254 = -1; | |
5633 | + return 1; | |
5634 | +} | |
5635 | +static int __init setup_enable_8254_timer(char *s) | |
5636 | +{ | |
5637 | + timer_over_8254 = 2; | |
5638 | + return 1; | |
5639 | +} | |
5640 | + | |
5641 | +__setup("disable_8254_timer", setup_disable_8254_timer); | |
5642 | +__setup("enable_8254_timer", setup_enable_8254_timer); | |
5643 | + | |
5644 | +/* | |
5645 | + * Called after all the initialization is done. If we didnt find any | |
5646 | + * APIC bugs then we can allow the modify fast path | |
5647 | + */ | |
5648 | + | |
5649 | +static int __init io_apic_bug_finalize(void) | |
5650 | +{ | |
5651 | + if(sis_apic_bug == -1) | |
5652 | + sis_apic_bug = 0; | |
5653 | + if (is_initial_xendomain()) { | |
5654 | + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; | |
5655 | + op.u.platform_quirk.quirk_id = sis_apic_bug ? | |
5656 | + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; | |
5657 | + VOID(HYPERVISOR_platform_op(&op)); | |
5658 | + } | |
5659 | + return 0; | |
5660 | +} | |
5661 | + | |
5662 | +late_initcall(io_apic_bug_finalize); | |
5663 | + | |
5664 | +struct sysfs_ioapic_data { | |
5665 | + struct sys_device dev; | |
5666 | + struct IO_APIC_route_entry entry[0]; | |
5667 | +}; | |
5668 | +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | |
5669 | + | |
5670 | +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | |
5671 | +{ | |
5672 | + struct IO_APIC_route_entry *entry; | |
5673 | + struct sysfs_ioapic_data *data; | |
5674 | + unsigned long flags; | |
5675 | + int i; | |
5676 | + | |
5677 | + data = container_of(dev, struct sysfs_ioapic_data, dev); | |
5678 | + entry = data->entry; | |
5679 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5680 | + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | |
5681 | + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); | |
5682 | + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); | |
5683 | + } | |
5684 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5685 | + | |
5686 | + return 0; | |
5687 | +} | |
5688 | + | |
5689 | +static int ioapic_resume(struct sys_device *dev) | |
5690 | +{ | |
5691 | + struct IO_APIC_route_entry *entry; | |
5692 | + struct sysfs_ioapic_data *data; | |
5693 | + unsigned long flags; | |
5694 | + union IO_APIC_reg_00 reg_00; | |
5695 | + int i; | |
5696 | + | |
5697 | + data = container_of(dev, struct sysfs_ioapic_data, dev); | |
5698 | + entry = data->entry; | |
5699 | + | |
5700 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5701 | + reg_00.raw = io_apic_read(dev->id, 0); | |
5702 | + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | |
5703 | + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | |
5704 | + io_apic_write(dev->id, 0, reg_00.raw); | |
5705 | + } | |
5706 | + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | |
5707 | + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); | |
5708 | + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); | |
5709 | + } | |
5710 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5711 | + | |
5712 | + return 0; | |
5713 | +} | |
5714 | + | |
5715 | +static struct sysdev_class ioapic_sysdev_class = { | |
5716 | + set_kset_name("ioapic"), | |
5717 | +#ifndef CONFIG_XEN | |
5718 | + .suspend = ioapic_suspend, | |
5719 | + .resume = ioapic_resume, | |
5720 | +#endif | |
5721 | +}; | |
5722 | + | |
5723 | +static int __init ioapic_init_sysfs(void) | |
5724 | +{ | |
5725 | + struct sys_device * dev; | |
5726 | + int i, size, error = 0; | |
5727 | + | |
5728 | + error = sysdev_class_register(&ioapic_sysdev_class); | |
5729 | + if (error) | |
5730 | + return error; | |
5731 | + | |
5732 | + for (i = 0; i < nr_ioapics; i++ ) { | |
5733 | + size = sizeof(struct sys_device) + nr_ioapic_registers[i] | |
5734 | + * sizeof(struct IO_APIC_route_entry); | |
5735 | + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | |
5736 | + if (!mp_ioapic_data[i]) { | |
5737 | + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | |
5738 | + continue; | |
5739 | + } | |
5740 | + memset(mp_ioapic_data[i], 0, size); | |
5741 | + dev = &mp_ioapic_data[i]->dev; | |
5742 | + dev->id = i; | |
5743 | + dev->cls = &ioapic_sysdev_class; | |
5744 | + error = sysdev_register(dev); | |
5745 | + if (error) { | |
5746 | + kfree(mp_ioapic_data[i]); | |
5747 | + mp_ioapic_data[i] = NULL; | |
5748 | + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | |
5749 | + continue; | |
5750 | + } | |
5751 | + } | |
5752 | + | |
5753 | + return 0; | |
5754 | +} | |
5755 | + | |
5756 | +device_initcall(ioapic_init_sysfs); | |
5757 | + | |
5758 | +/* -------------------------------------------------------------------------- | |
5759 | + ACPI-based IOAPIC Configuration | |
5760 | + -------------------------------------------------------------------------- */ | |
5761 | + | |
5762 | +#ifdef CONFIG_ACPI | |
5763 | + | |
5764 | +int __init io_apic_get_unique_id (int ioapic, int apic_id) | |
5765 | +{ | |
5766 | +#ifndef CONFIG_XEN | |
5767 | + union IO_APIC_reg_00 reg_00; | |
5768 | + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; | |
5769 | + physid_mask_t tmp; | |
5770 | + unsigned long flags; | |
5771 | + int i = 0; | |
5772 | + | |
5773 | + /* | |
5774 | + * The P4 platform supports up to 256 APIC IDs on two separate APIC | |
5775 | + * buses (one for LAPICs, one for IOAPICs), where predecessors only | |
5776 | + * supports up to 16 on one shared APIC bus. | |
5777 | + * | |
5778 | + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | |
5779 | + * advantage of new APIC bus architecture. | |
5780 | + */ | |
5781 | + | |
5782 | + if (physids_empty(apic_id_map)) | |
5783 | + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); | |
5784 | + | |
5785 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5786 | + reg_00.raw = io_apic_read(ioapic, 0); | |
5787 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5788 | + | |
5789 | + if (apic_id >= get_physical_broadcast()) { | |
5790 | + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | |
5791 | + "%d\n", ioapic, apic_id, reg_00.bits.ID); | |
5792 | + apic_id = reg_00.bits.ID; | |
5793 | + } | |
5794 | + | |
5795 | + /* | |
5796 | + * Every APIC in a system must have a unique ID or we get lots of nice | |
5797 | + * 'stuck on smp_invalidate_needed IPI wait' messages. | |
5798 | + */ | |
5799 | + if (check_apicid_used(apic_id_map, apic_id)) { | |
5800 | + | |
5801 | + for (i = 0; i < get_physical_broadcast(); i++) { | |
5802 | + if (!check_apicid_used(apic_id_map, i)) | |
5803 | + break; | |
5804 | + } | |
5805 | + | |
5806 | + if (i == get_physical_broadcast()) | |
5807 | + panic("Max apic_id exceeded!\n"); | |
5808 | + | |
5809 | + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | |
5810 | + "trying %d\n", ioapic, apic_id, i); | |
5811 | + | |
5812 | + apic_id = i; | |
5813 | + } | |
5814 | + | |
5815 | + tmp = apicid_to_cpu_present(apic_id); | |
5816 | + physids_or(apic_id_map, apic_id_map, tmp); | |
5817 | + | |
5818 | + if (reg_00.bits.ID != apic_id) { | |
5819 | + reg_00.bits.ID = apic_id; | |
5820 | + | |
5821 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5822 | + io_apic_write(ioapic, 0, reg_00.raw); | |
5823 | + reg_00.raw = io_apic_read(ioapic, 0); | |
5824 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5825 | + | |
5826 | + /* Sanity check */ | |
5827 | + if (reg_00.bits.ID != apic_id) { | |
5828 | + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); | |
5829 | + return -1; | |
5830 | + } | |
5831 | + } | |
5832 | + | |
5833 | + apic_printk(APIC_VERBOSE, KERN_INFO | |
5834 | + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | |
5835 | +#endif /* !CONFIG_XEN */ | |
5836 | + | |
5837 | + return apic_id; | |
5838 | +} | |
5839 | + | |
5840 | + | |
5841 | +int __init io_apic_get_version (int ioapic) | |
5842 | +{ | |
5843 | + union IO_APIC_reg_01 reg_01; | |
5844 | + unsigned long flags; | |
5845 | + | |
5846 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5847 | + reg_01.raw = io_apic_read(ioapic, 1); | |
5848 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5849 | + | |
5850 | + return reg_01.bits.version; | |
5851 | +} | |
5852 | + | |
5853 | + | |
5854 | +int __init io_apic_get_redir_entries (int ioapic) | |
5855 | +{ | |
5856 | + union IO_APIC_reg_01 reg_01; | |
5857 | + unsigned long flags; | |
5858 | + | |
5859 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5860 | + reg_01.raw = io_apic_read(ioapic, 1); | |
5861 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5862 | + | |
5863 | + return reg_01.bits.entries; | |
5864 | +} | |
5865 | + | |
5866 | + | |
5867 | +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | |
5868 | +{ | |
5869 | + struct IO_APIC_route_entry entry; | |
5870 | + unsigned long flags; | |
5871 | + | |
5872 | + if (!IO_APIC_IRQ(irq)) { | |
5873 | + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | |
5874 | + ioapic); | |
5875 | + return -EINVAL; | |
5876 | + } | |
5877 | + | |
5878 | + /* | |
5879 | + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | |
5880 | + * Note that we mask (disable) IRQs now -- these get enabled when the | |
5881 | + * corresponding device driver registers for this IRQ. | |
5882 | + */ | |
5883 | + | |
5884 | + memset(&entry,0,sizeof(entry)); | |
5885 | + | |
5886 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
5887 | + entry.dest_mode = INT_DEST_MODE; | |
5888 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
5889 | + entry.trigger = edge_level; | |
5890 | + entry.polarity = active_high_low; | |
5891 | + entry.mask = 1; | |
5892 | + | |
5893 | + /* | |
5894 | + * IRQs < 16 are already in the irq_2_pin[] map | |
5895 | + */ | |
5896 | + if (irq >= 16) | |
5897 | + add_pin_to_irq(irq, ioapic, pin); | |
5898 | + | |
5899 | + entry.vector = assign_irq_vector(irq); | |
5900 | + | |
5901 | + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " | |
5902 | + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, | |
5903 | + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | |
5904 | + edge_level, active_high_low); | |
5905 | + | |
5906 | + ioapic_register_intr(irq, entry.vector, edge_level); | |
5907 | + | |
5908 | + if (!ioapic && (irq < 16)) | |
5909 | + disable_8259A_irq(irq); | |
5910 | + | |
5911 | + spin_lock_irqsave(&ioapic_lock, flags); | |
5912 | + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); | |
5913 | + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); | |
5914 | + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); | |
5915 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
5916 | + | |
5917 | + return 0; | |
5918 | +} | |
5919 | + | |
5920 | +#endif /* CONFIG_ACPI */ | |
5921 | Index: head-2008-11-25/arch/x86/kernel/ioport_32-xen.c | |
5922 | =================================================================== | |
5923 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
5924 | +++ head-2008-11-25/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100 | |
5925 | @@ -0,0 +1,123 @@ | |
5926 | +/* | |
5927 | + * linux/arch/i386/kernel/ioport.c | |
5928 | + * | |
5929 | + * This contains the io-permission bitmap code - written by obz, with changes | |
5930 | + * by Linus. | |
5931 | + */ | |
5932 | + | |
5933 | +#include <linux/sched.h> | |
5934 | +#include <linux/kernel.h> | |
5935 | +#include <linux/capability.h> | |
5936 | +#include <linux/errno.h> | |
5937 | +#include <linux/types.h> | |
5938 | +#include <linux/ioport.h> | |
5939 | +#include <linux/smp.h> | |
5940 | +#include <linux/smp_lock.h> | |
5941 | +#include <linux/stddef.h> | |
5942 | +#include <linux/slab.h> | |
5943 | +#include <linux/thread_info.h> | |
5944 | +#include <xen/interface/physdev.h> | |
5945 | + | |
5946 | +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | |
5947 | +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | |
5948 | +{ | |
5949 | + unsigned long mask; | |
5950 | + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); | |
5951 | + unsigned int low_index = base & (BITS_PER_LONG-1); | |
5952 | + int length = low_index + extent; | |
5953 | + | |
5954 | + if (low_index != 0) { | |
5955 | + mask = (~0UL << low_index); | |
5956 | + if (length < BITS_PER_LONG) | |
5957 | + mask &= ~(~0UL << length); | |
5958 | + if (new_value) | |
5959 | + *bitmap_base++ |= mask; | |
5960 | + else | |
5961 | + *bitmap_base++ &= ~mask; | |
5962 | + length -= BITS_PER_LONG; | |
5963 | + } | |
5964 | + | |
5965 | + mask = (new_value ? ~0UL : 0UL); | |
5966 | + while (length >= BITS_PER_LONG) { | |
5967 | + *bitmap_base++ = mask; | |
5968 | + length -= BITS_PER_LONG; | |
5969 | + } | |
5970 | + | |
5971 | + if (length > 0) { | |
5972 | + mask = ~(~0UL << length); | |
5973 | + if (new_value) | |
5974 | + *bitmap_base++ |= mask; | |
5975 | + else | |
5976 | + *bitmap_base++ &= ~mask; | |
5977 | + } | |
5978 | +} | |
5979 | + | |
5980 | + | |
5981 | +/* | |
5982 | + * this changes the io permissions bitmap in the current task. | |
5983 | + */ | |
5984 | +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |
5985 | +{ | |
5986 | + struct thread_struct * t = ¤t->thread; | |
5987 | + unsigned long *bitmap; | |
5988 | + struct physdev_set_iobitmap set_iobitmap; | |
5989 | + | |
5990 | + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | |
5991 | + return -EINVAL; | |
5992 | + if (turn_on && !capable(CAP_SYS_RAWIO)) | |
5993 | + return -EPERM; | |
5994 | + | |
5995 | + /* | |
5996 | + * If it's the first ioperm() call in this thread's lifetime, set the | |
5997 | + * IO bitmap up. ioperm() is much less timing critical than clone(), | |
5998 | + * this is why we delay this operation until now: | |
5999 | + */ | |
6000 | + if (!t->io_bitmap_ptr) { | |
6001 | + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
6002 | + if (!bitmap) | |
6003 | + return -ENOMEM; | |
6004 | + | |
6005 | + memset(bitmap, 0xff, IO_BITMAP_BYTES); | |
6006 | + t->io_bitmap_ptr = bitmap; | |
6007 | + set_thread_flag(TIF_IO_BITMAP); | |
6008 | + | |
6009 | + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); | |
6010 | + set_iobitmap.nr_ports = IO_BITMAP_BITS; | |
6011 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
6012 | + &set_iobitmap)); | |
6013 | + } | |
6014 | + | |
6015 | + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | |
6016 | + | |
6017 | + return 0; | |
6018 | +} | |
6019 | + | |
6020 | +/* | |
6021 | + * sys_iopl has to be used when you want to access the IO ports | |
6022 | + * beyond the 0x3ff range: to get the full 65536 ports bitmapped | |
6023 | + * you'd need 8kB of bitmaps/process, which is a bit excessive. | |
6024 | + * | |
6025 | + * Here we just change the eflags value on the stack: we allow | |
6026 | + * only the super-user to do it. This depends on the stack-layout | |
6027 | + * on system-call entry - see also fork() and the signal handling | |
6028 | + * code. | |
6029 | + */ | |
6030 | + | |
6031 | +asmlinkage long sys_iopl(unsigned long unused) | |
6032 | +{ | |
6033 | + volatile struct pt_regs * regs = (struct pt_regs *) &unused; | |
6034 | + unsigned int level = regs->ebx; | |
6035 | + struct thread_struct *t = ¤t->thread; | |
6036 | + unsigned int old = (t->iopl >> 12) & 3; | |
6037 | + | |
6038 | + if (level > 3) | |
6039 | + return -EINVAL; | |
6040 | + /* Trying to gain more privileges? */ | |
6041 | + if (level > old) { | |
6042 | + if (!capable(CAP_SYS_RAWIO)) | |
6043 | + return -EPERM; | |
6044 | + } | |
6045 | + t->iopl = level << 12; | |
6046 | + set_iopl_mask(t->iopl); | |
6047 | + return 0; | |
6048 | +} | |
6049 | Index: head-2008-11-25/arch/x86/kernel/irq_32-xen.c | |
6050 | =================================================================== | |
6051 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
6052 | +++ head-2008-11-25/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
6053 | @@ -0,0 +1,324 @@ | |
6054 | +/* | |
6055 | + * linux/arch/i386/kernel/irq.c | |
6056 | + * | |
6057 | + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | |
6058 | + * | |
6059 | + * This file contains the lowest level x86-specific interrupt | |
6060 | + * entry, irq-stacks and irq statistics code. All the remaining | |
6061 | + * irq logic is done by the generic kernel/irq/ code and | |
6062 | + * by the x86-specific irq controller code. (e.g. i8259.c and | |
6063 | + * io_apic.c.) | |
6064 | + */ | |
6065 | + | |
6066 | +#include <asm/uaccess.h> | |
6067 | +#include <linux/module.h> | |
6068 | +#include <linux/seq_file.h> | |
6069 | +#include <linux/interrupt.h> | |
6070 | +#include <linux/kernel_stat.h> | |
6071 | +#include <linux/notifier.h> | |
6072 | +#include <linux/cpu.h> | |
6073 | +#include <linux/delay.h> | |
6074 | + | |
6075 | +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; | |
6076 | +EXPORT_PER_CPU_SYMBOL(irq_stat); | |
6077 | + | |
6078 | +#ifndef CONFIG_X86_LOCAL_APIC | |
6079 | +/* | |
6080 | + * 'what should we do if we get a hw irq event on an illegal vector'. | |
6081 | + * each architecture has to answer this themselves. | |
6082 | + */ | |
6083 | +void ack_bad_irq(unsigned int irq) | |
6084 | +{ | |
6085 | + printk("unexpected IRQ trap at vector %02x\n", irq); | |
6086 | +} | |
6087 | +#endif | |
6088 | + | |
6089 | +#ifdef CONFIG_4KSTACKS | |
6090 | +/* | |
6091 | + * per-CPU IRQ handling contexts (thread information and stack) | |
6092 | + */ | |
6093 | +union irq_ctx { | |
6094 | + struct thread_info tinfo; | |
6095 | + u32 stack[THREAD_SIZE/sizeof(u32)]; | |
6096 | +}; | |
6097 | + | |
6098 | +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; | |
6099 | +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; | |
6100 | +#endif | |
6101 | + | |
6102 | +/* | |
6103 | + * do_IRQ handles all normal device IRQ's (the special | |
6104 | + * SMP cross-CPU interrupts have their own specific | |
6105 | + * handlers). | |
6106 | + */ | |
6107 | +fastcall unsigned int do_IRQ(struct pt_regs *regs) | |
6108 | +{ | |
6109 | + /* high bit used in ret_from_ code */ | |
6110 | + int irq = ~regs->orig_eax; | |
6111 | +#ifdef CONFIG_4KSTACKS | |
6112 | + union irq_ctx *curctx, *irqctx; | |
6113 | + u32 *isp; | |
6114 | +#endif | |
6115 | + | |
6116 | + if (unlikely((unsigned)irq >= NR_IRQS)) { | |
6117 | + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | |
6118 | + __FUNCTION__, irq); | |
6119 | + BUG(); | |
6120 | + } | |
6121 | + | |
6122 | + /*irq_enter();*/ | |
6123 | +#ifdef CONFIG_DEBUG_STACKOVERFLOW | |
6124 | + /* Debugging check for stack overflow: is there less than 1KB free? */ | |
6125 | + { | |
6126 | + long esp; | |
6127 | + | |
6128 | + __asm__ __volatile__("andl %%esp,%0" : | |
6129 | + "=r" (esp) : "0" (THREAD_SIZE - 1)); | |
6130 | + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { | |
6131 | + printk("do_IRQ: stack overflow: %ld\n", | |
6132 | + esp - sizeof(struct thread_info)); | |
6133 | + dump_stack(); | |
6134 | + } | |
6135 | + } | |
6136 | +#endif | |
6137 | + | |
6138 | +#ifdef CONFIG_4KSTACKS | |
6139 | + | |
6140 | + curctx = (union irq_ctx *) current_thread_info(); | |
6141 | + irqctx = hardirq_ctx[smp_processor_id()]; | |
6142 | + | |
6143 | + /* | |
6144 | + * this is where we switch to the IRQ stack. However, if we are | |
6145 | + * already using the IRQ stack (because we interrupted a hardirq | |
6146 | + * handler) we can't do that and just have to keep using the | |
6147 | + * current stack (which is the irq stack already after all) | |
6148 | + */ | |
6149 | + if (curctx != irqctx) { | |
6150 | + int arg1, arg2, ebx; | |
6151 | + | |
6152 | + /* build the stack frame on the IRQ stack */ | |
6153 | + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | |
6154 | + irqctx->tinfo.task = curctx->tinfo.task; | |
6155 | + irqctx->tinfo.previous_esp = current_stack_pointer; | |
6156 | + | |
6157 | + /* | |
6158 | + * Copy the softirq bits in preempt_count so that the | |
6159 | + * softirq checks work in the hardirq context. | |
6160 | + */ | |
6161 | + irqctx->tinfo.preempt_count = | |
6162 | + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | |
6163 | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | |
6164 | + | |
6165 | + asm volatile( | |
6166 | + " xchgl %%ebx,%%esp \n" | |
6167 | + " call __do_IRQ \n" | |
6168 | + " movl %%ebx,%%esp \n" | |
6169 | + : "=a" (arg1), "=d" (arg2), "=b" (ebx) | |
6170 | + : "0" (irq), "1" (regs), "2" (isp) | |
6171 | + : "memory", "cc", "ecx" | |
6172 | + ); | |
6173 | + } else | |
6174 | +#endif | |
6175 | + __do_IRQ(irq, regs); | |
6176 | + | |
6177 | + /*irq_exit();*/ | |
6178 | + | |
6179 | + return 1; | |
6180 | +} | |
6181 | + | |
6182 | +#ifdef CONFIG_4KSTACKS | |
6183 | + | |
6184 | +/* | |
6185 | + * These should really be __section__(".bss.page_aligned") as well, but | |
6186 | + * gcc's 3.0 and earlier don't handle that correctly. | |
6187 | + */ | |
6188 | +static char softirq_stack[NR_CPUS * THREAD_SIZE] | |
6189 | + __attribute__((__aligned__(THREAD_SIZE))); | |
6190 | + | |
6191 | +static char hardirq_stack[NR_CPUS * THREAD_SIZE] | |
6192 | + __attribute__((__aligned__(THREAD_SIZE))); | |
6193 | + | |
6194 | +/* | |
6195 | + * allocate per-cpu stacks for hardirq and for softirq processing | |
6196 | + */ | |
6197 | +void irq_ctx_init(int cpu) | |
6198 | +{ | |
6199 | + union irq_ctx *irqctx; | |
6200 | + | |
6201 | + if (hardirq_ctx[cpu]) | |
6202 | + return; | |
6203 | + | |
6204 | + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; | |
6205 | + irqctx->tinfo.task = NULL; | |
6206 | + irqctx->tinfo.exec_domain = NULL; | |
6207 | + irqctx->tinfo.cpu = cpu; | |
6208 | + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | |
6209 | + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | |
6210 | + | |
6211 | + hardirq_ctx[cpu] = irqctx; | |
6212 | + | |
6213 | + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; | |
6214 | + irqctx->tinfo.task = NULL; | |
6215 | + irqctx->tinfo.exec_domain = NULL; | |
6216 | + irqctx->tinfo.cpu = cpu; | |
6217 | + irqctx->tinfo.preempt_count = 0; | |
6218 | + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | |
6219 | + | |
6220 | + softirq_ctx[cpu] = irqctx; | |
6221 | + | |
6222 | + printk("CPU %u irqstacks, hard=%p soft=%p\n", | |
6223 | + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); | |
6224 | +} | |
6225 | + | |
6226 | +void irq_ctx_exit(int cpu) | |
6227 | +{ | |
6228 | + hardirq_ctx[cpu] = NULL; | |
6229 | +} | |
6230 | + | |
6231 | +extern asmlinkage void __do_softirq(void); | |
6232 | + | |
6233 | +asmlinkage void do_softirq(void) | |
6234 | +{ | |
6235 | + unsigned long flags; | |
6236 | + struct thread_info *curctx; | |
6237 | + union irq_ctx *irqctx; | |
6238 | + u32 *isp; | |
6239 | + | |
6240 | + if (in_interrupt()) | |
6241 | + return; | |
6242 | + | |
6243 | + local_irq_save(flags); | |
6244 | + | |
6245 | + if (local_softirq_pending()) { | |
6246 | + curctx = current_thread_info(); | |
6247 | + irqctx = softirq_ctx[smp_processor_id()]; | |
6248 | + irqctx->tinfo.task = curctx->task; | |
6249 | + irqctx->tinfo.previous_esp = current_stack_pointer; | |
6250 | + | |
6251 | + /* build the stack frame on the softirq stack */ | |
6252 | + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | |
6253 | + | |
6254 | + asm volatile( | |
6255 | + " xchgl %%ebx,%%esp \n" | |
6256 | + " call __do_softirq \n" | |
6257 | + " movl %%ebx,%%esp \n" | |
6258 | + : "=b"(isp) | |
6259 | + : "0"(isp) | |
6260 | + : "memory", "cc", "edx", "ecx", "eax" | |
6261 | + ); | |
6262 | + /* | |
6263 | + * Shouldnt happen, we returned above if in_interrupt(): | |
6264 | + */ | |
6265 | + WARN_ON_ONCE(softirq_count()); | |
6266 | + } | |
6267 | + | |
6268 | + local_irq_restore(flags); | |
6269 | +} | |
6270 | + | |
6271 | +EXPORT_SYMBOL(do_softirq); | |
6272 | +#endif | |
6273 | + | |
6274 | +/* | |
6275 | + * Interrupt statistics: | |
6276 | + */ | |
6277 | + | |
6278 | +atomic_t irq_err_count; | |
6279 | + | |
6280 | +/* | |
6281 | + * /proc/interrupts printing: | |
6282 | + */ | |
6283 | + | |
6284 | +int show_interrupts(struct seq_file *p, void *v) | |
6285 | +{ | |
6286 | + int i = *(loff_t *) v, j; | |
6287 | + struct irqaction * action; | |
6288 | + unsigned long flags; | |
6289 | + | |
6290 | + if (i == 0) { | |
6291 | + seq_printf(p, " "); | |
6292 | + for_each_online_cpu(j) | |
6293 | + seq_printf(p, "CPU%-8d",j); | |
6294 | + seq_putc(p, '\n'); | |
6295 | + } | |
6296 | + | |
6297 | + if (i < NR_IRQS) { | |
6298 | + spin_lock_irqsave(&irq_desc[i].lock, flags); | |
6299 | + action = irq_desc[i].action; | |
6300 | + if (!action) | |
6301 | + goto skip; | |
6302 | + seq_printf(p, "%3d: ",i); | |
6303 | +#ifndef CONFIG_SMP | |
6304 | + seq_printf(p, "%10u ", kstat_irqs(i)); | |
6305 | +#else | |
6306 | + for_each_online_cpu(j) | |
6307 | + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | |
6308 | +#endif | |
6309 | + seq_printf(p, " %14s", irq_desc[i].chip->typename); | |
6310 | + seq_printf(p, " %s", action->name); | |
6311 | + | |
6312 | + for (action=action->next; action; action = action->next) | |
6313 | + seq_printf(p, ", %s", action->name); | |
6314 | + | |
6315 | + seq_putc(p, '\n'); | |
6316 | +skip: | |
6317 | + spin_unlock_irqrestore(&irq_desc[i].lock, flags); | |
6318 | + } else if (i == NR_IRQS) { | |
6319 | + seq_printf(p, "NMI: "); | |
6320 | + for_each_online_cpu(j) | |
6321 | + seq_printf(p, "%10u ", nmi_count(j)); | |
6322 | + seq_putc(p, '\n'); | |
6323 | +#ifdef CONFIG_X86_LOCAL_APIC | |
6324 | + seq_printf(p, "LOC: "); | |
6325 | + for_each_online_cpu(j) | |
6326 | + seq_printf(p, "%10u ", | |
6327 | + per_cpu(irq_stat,j).apic_timer_irqs); | |
6328 | + seq_putc(p, '\n'); | |
6329 | +#endif | |
6330 | + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | |
6331 | +#if defined(CONFIG_X86_IO_APIC) | |
6332 | + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | |
6333 | +#endif | |
6334 | + } | |
6335 | + return 0; | |
6336 | +} | |
6337 | + | |
6338 | +#ifdef CONFIG_HOTPLUG_CPU | |
6339 | + | |
6340 | +void fixup_irqs(cpumask_t map) | |
6341 | +{ | |
6342 | + unsigned int irq; | |
6343 | + static int warned; | |
6344 | + | |
6345 | + for (irq = 0; irq < NR_IRQS; irq++) { | |
6346 | + cpumask_t mask; | |
6347 | + if (irq == 2) | |
6348 | + continue; | |
6349 | + | |
6350 | + cpus_and(mask, irq_desc[irq].affinity, map); | |
6351 | + if (any_online_cpu(mask) == NR_CPUS) { | |
6352 | + /*printk("Breaking affinity for irq %i\n", irq);*/ | |
6353 | + mask = map; | |
6354 | + } | |
6355 | + if (irq_desc[irq].chip->set_affinity) | |
6356 | + irq_desc[irq].chip->set_affinity(irq, mask); | |
6357 | + else if (irq_desc[irq].action && !(warned++)) | |
6358 | + printk("Cannot set affinity for irq %i\n", irq); | |
6359 | + } | |
6360 | + | |
6361 | +#if 0 | |
6362 | + barrier(); | |
6363 | + /* Ingo Molnar says: "after the IO-APIC masks have been redirected | |
6364 | + [note the nop - the interrupt-enable boundary on x86 is two | |
6365 | + instructions from sti] - to flush out pending hardirqs and | |
6366 | + IPIs. After this point nothing is supposed to reach this CPU." */ | |
6367 | + __asm__ __volatile__("sti; nop; cli"); | |
6368 | + barrier(); | |
6369 | +#else | |
6370 | + /* That doesn't seem sufficient. Give it 1ms. */ | |
6371 | + local_irq_enable(); | |
6372 | + mdelay(1); | |
6373 | + local_irq_disable(); | |
6374 | +#endif | |
6375 | +} | |
6376 | +#endif | |
6377 | + | |
6378 | Index: head-2008-11-25/arch/x86/kernel/ldt_32-xen.c | |
6379 | =================================================================== | |
6380 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
6381 | +++ head-2008-11-25/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200 | |
6382 | @@ -0,0 +1,270 @@ | |
6383 | +/* | |
6384 | + * linux/kernel/ldt.c | |
6385 | + * | |
6386 | + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | |
6387 | + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
6388 | + */ | |
6389 | + | |
6390 | +#include <linux/errno.h> | |
6391 | +#include <linux/sched.h> | |
6392 | +#include <linux/string.h> | |
6393 | +#include <linux/mm.h> | |
6394 | +#include <linux/smp.h> | |
6395 | +#include <linux/smp_lock.h> | |
6396 | +#include <linux/vmalloc.h> | |
6397 | +#include <linux/slab.h> | |
6398 | + | |
6399 | +#include <asm/uaccess.h> | |
6400 | +#include <asm/system.h> | |
6401 | +#include <asm/ldt.h> | |
6402 | +#include <asm/desc.h> | |
6403 | +#include <asm/mmu_context.h> | |
6404 | + | |
6405 | +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ | |
6406 | +static void flush_ldt(void *null) | |
6407 | +{ | |
6408 | + if (current->active_mm) | |
6409 | + load_LDT(¤t->active_mm->context); | |
6410 | +} | |
6411 | +#endif | |
6412 | + | |
6413 | +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |
6414 | +{ | |
6415 | + void *oldldt; | |
6416 | + void *newldt; | |
6417 | + int oldsize; | |
6418 | + | |
6419 | + if (mincount <= pc->size) | |
6420 | + return 0; | |
6421 | + oldsize = pc->size; | |
6422 | + mincount = (mincount+511)&(~511); | |
6423 | + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | |
6424 | + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | |
6425 | + else | |
6426 | + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | |
6427 | + | |
6428 | + if (!newldt) | |
6429 | + return -ENOMEM; | |
6430 | + | |
6431 | + if (oldsize) | |
6432 | + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | |
6433 | + oldldt = pc->ldt; | |
6434 | + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | |
6435 | + pc->ldt = newldt; | |
6436 | + wmb(); | |
6437 | + pc->size = mincount; | |
6438 | + wmb(); | |
6439 | + | |
6440 | + if (reload) { | |
6441 | +#ifdef CONFIG_SMP | |
6442 | + cpumask_t mask; | |
6443 | + preempt_disable(); | |
6444 | +#endif | |
6445 | + make_pages_readonly( | |
6446 | + pc->ldt, | |
6447 | + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
6448 | + XENFEAT_writable_descriptor_tables); | |
6449 | + load_LDT(pc); | |
6450 | +#ifdef CONFIG_SMP | |
6451 | + mask = cpumask_of_cpu(smp_processor_id()); | |
6452 | + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | |
6453 | + smp_call_function(flush_ldt, NULL, 1, 1); | |
6454 | + preempt_enable(); | |
6455 | +#endif | |
6456 | + } | |
6457 | + if (oldsize) { | |
6458 | + make_pages_writable( | |
6459 | + oldldt, | |
6460 | + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
6461 | + XENFEAT_writable_descriptor_tables); | |
6462 | + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | |
6463 | + vfree(oldldt); | |
6464 | + else | |
6465 | + kfree(oldldt); | |
6466 | + } | |
6467 | + return 0; | |
6468 | +} | |
6469 | + | |
6470 | +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | |
6471 | +{ | |
6472 | + int err = alloc_ldt(new, old->size, 0); | |
6473 | + if (err < 0) | |
6474 | + return err; | |
6475 | + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | |
6476 | + make_pages_readonly( | |
6477 | + new->ldt, | |
6478 | + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
6479 | + XENFEAT_writable_descriptor_tables); | |
6480 | + return 0; | |
6481 | +} | |
6482 | + | |
6483 | +/* | |
6484 | + * we do not have to muck with descriptors here, that is | |
6485 | + * done in switch_mm() as needed. | |
6486 | + */ | |
6487 | +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |
6488 | +{ | |
6489 | + struct mm_struct * old_mm; | |
6490 | + int retval = 0; | |
6491 | + | |
6492 | + init_MUTEX(&mm->context.sem); | |
6493 | + mm->context.size = 0; | |
6494 | + mm->context.has_foreign_mappings = 0; | |
6495 | + old_mm = current->mm; | |
6496 | + if (old_mm && old_mm->context.size > 0) { | |
6497 | + down(&old_mm->context.sem); | |
6498 | + retval = copy_ldt(&mm->context, &old_mm->context); | |
6499 | + up(&old_mm->context.sem); | |
6500 | + } | |
6501 | + return retval; | |
6502 | +} | |
6503 | + | |
6504 | +/* | |
6505 | + * No need to lock the MM as we are the last user | |
6506 | + */ | |
6507 | +void destroy_context(struct mm_struct *mm) | |
6508 | +{ | |
6509 | + if (mm->context.size) { | |
6510 | + if (mm == current->active_mm) | |
6511 | + clear_LDT(); | |
6512 | + make_pages_writable( | |
6513 | + mm->context.ldt, | |
6514 | + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
6515 | + XENFEAT_writable_descriptor_tables); | |
6516 | + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | |
6517 | + vfree(mm->context.ldt); | |
6518 | + else | |
6519 | + kfree(mm->context.ldt); | |
6520 | + mm->context.size = 0; | |
6521 | + } | |
6522 | +} | |
6523 | + | |
6524 | +static int read_ldt(void __user * ptr, unsigned long bytecount) | |
6525 | +{ | |
6526 | + int err; | |
6527 | + unsigned long size; | |
6528 | + struct mm_struct * mm = current->mm; | |
6529 | + | |
6530 | + if (!mm->context.size) | |
6531 | + return 0; | |
6532 | + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | |
6533 | + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | |
6534 | + | |
6535 | + down(&mm->context.sem); | |
6536 | + size = mm->context.size*LDT_ENTRY_SIZE; | |
6537 | + if (size > bytecount) | |
6538 | + size = bytecount; | |
6539 | + | |
6540 | + err = 0; | |
6541 | + if (copy_to_user(ptr, mm->context.ldt, size)) | |
6542 | + err = -EFAULT; | |
6543 | + up(&mm->context.sem); | |
6544 | + if (err < 0) | |
6545 | + goto error_return; | |
6546 | + if (size != bytecount) { | |
6547 | + /* zero-fill the rest */ | |
6548 | + if (clear_user(ptr+size, bytecount-size) != 0) { | |
6549 | + err = -EFAULT; | |
6550 | + goto error_return; | |
6551 | + } | |
6552 | + } | |
6553 | + return bytecount; | |
6554 | +error_return: | |
6555 | + return err; | |
6556 | +} | |
6557 | + | |
6558 | +static int read_default_ldt(void __user * ptr, unsigned long bytecount) | |
6559 | +{ | |
6560 | + int err; | |
6561 | + unsigned long size; | |
6562 | + void *address; | |
6563 | + | |
6564 | + err = 0; | |
6565 | + address = &default_ldt[0]; | |
6566 | + size = 5*sizeof(struct desc_struct); | |
6567 | + if (size > bytecount) | |
6568 | + size = bytecount; | |
6569 | + | |
6570 | + err = size; | |
6571 | + if (copy_to_user(ptr, address, size)) | |
6572 | + err = -EFAULT; | |
6573 | + | |
6574 | + return err; | |
6575 | +} | |
6576 | + | |
6577 | +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | |
6578 | +{ | |
6579 | + struct mm_struct * mm = current->mm; | |
6580 | + __u32 entry_1, entry_2; | |
6581 | + int error; | |
6582 | + struct user_desc ldt_info; | |
6583 | + | |
6584 | + error = -EINVAL; | |
6585 | + if (bytecount != sizeof(ldt_info)) | |
6586 | + goto out; | |
6587 | + error = -EFAULT; | |
6588 | + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) | |
6589 | + goto out; | |
6590 | + | |
6591 | + error = -EINVAL; | |
6592 | + if (ldt_info.entry_number >= LDT_ENTRIES) | |
6593 | + goto out; | |
6594 | + if (ldt_info.contents == 3) { | |
6595 | + if (oldmode) | |
6596 | + goto out; | |
6597 | + if (ldt_info.seg_not_present == 0) | |
6598 | + goto out; | |
6599 | + } | |
6600 | + | |
6601 | + down(&mm->context.sem); | |
6602 | + if (ldt_info.entry_number >= mm->context.size) { | |
6603 | + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | |
6604 | + if (error < 0) | |
6605 | + goto out_unlock; | |
6606 | + } | |
6607 | + | |
6608 | + /* Allow LDTs to be cleared by the user. */ | |
6609 | + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | |
6610 | + if (oldmode || LDT_empty(&ldt_info)) { | |
6611 | + entry_1 = 0; | |
6612 | + entry_2 = 0; | |
6613 | + goto install; | |
6614 | + } | |
6615 | + } | |
6616 | + | |
6617 | + entry_1 = LDT_entry_a(&ldt_info); | |
6618 | + entry_2 = LDT_entry_b(&ldt_info); | |
6619 | + if (oldmode) | |
6620 | + entry_2 &= ~(1 << 20); | |
6621 | + | |
6622 | + /* Install the new entry ... */ | |
6623 | +install: | |
6624 | + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, | |
6625 | + entry_1, entry_2); | |
6626 | + | |
6627 | +out_unlock: | |
6628 | + up(&mm->context.sem); | |
6629 | +out: | |
6630 | + return error; | |
6631 | +} | |
6632 | + | |
6633 | +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | |
6634 | +{ | |
6635 | + int ret = -ENOSYS; | |
6636 | + | |
6637 | + switch (func) { | |
6638 | + case 0: | |
6639 | + ret = read_ldt(ptr, bytecount); | |
6640 | + break; | |
6641 | + case 1: | |
6642 | + ret = write_ldt(ptr, bytecount, 1); | |
6643 | + break; | |
6644 | + case 2: | |
6645 | + ret = read_default_ldt(ptr, bytecount); | |
6646 | + break; | |
6647 | + case 0x11: | |
6648 | + ret = write_ldt(ptr, bytecount, 0); | |
6649 | + break; | |
6650 | + } | |
6651 | + return ret; | |
6652 | +} | |
6653 | Index: head-2008-11-25/arch/x86/kernel/microcode-xen.c | |
6654 | =================================================================== | |
6655 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
6656 | +++ head-2008-11-25/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200 | |
6657 | @@ -0,0 +1,144 @@ | |
6658 | +/* | |
6659 | + * Intel CPU Microcode Update Driver for Linux | |
6660 | + * | |
6661 | + * Copyright (C) 2000-2004 Tigran Aivazian | |
6662 | + * | |
6663 | + * This driver allows to upgrade microcode on Intel processors | |
6664 | + * belonging to IA-32 family - PentiumPro, Pentium II, | |
6665 | + * Pentium III, Xeon, Pentium 4, etc. | |
6666 | + * | |
6667 | + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, | |
6668 | + * Order Number 245472 or free download from: | |
6669 | + * | |
6670 | + * http://developer.intel.com/design/pentium4/manuals/245472.htm | |
6671 | + * | |
6672 | + * For more information, go to http://www.urbanmyth.org/microcode | |
6673 | + * | |
6674 | + * This program is free software; you can redistribute it and/or | |
6675 | + * modify it under the terms of the GNU General Public License | |
6676 | + * as published by the Free Software Foundation; either version | |
6677 | + * 2 of the License, or (at your option) any later version. | |
6678 | + */ | |
6679 | + | |
6680 | +//#define DEBUG /* pr_debug */ | |
6681 | +#include <linux/capability.h> | |
6682 | +#include <linux/kernel.h> | |
6683 | +#include <linux/init.h> | |
6684 | +#include <linux/sched.h> | |
6685 | +#include <linux/cpumask.h> | |
6686 | +#include <linux/module.h> | |
6687 | +#include <linux/slab.h> | |
6688 | +#include <linux/vmalloc.h> | |
6689 | +#include <linux/miscdevice.h> | |
6690 | +#include <linux/spinlock.h> | |
6691 | +#include <linux/mm.h> | |
6692 | +#include <linux/mutex.h> | |
6693 | +#include <linux/syscalls.h> | |
6694 | + | |
6695 | +#include <asm/msr.h> | |
6696 | +#include <asm/uaccess.h> | |
6697 | +#include <asm/processor.h> | |
6698 | + | |
6699 | +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); | |
6700 | +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>"); | |
6701 | +MODULE_LICENSE("GPL"); | |
6702 | + | |
6703 | +static int verbose; | |
6704 | +module_param(verbose, int, 0644); | |
6705 | + | |
6706 | +#define MICROCODE_VERSION "1.14a-xen" | |
6707 | + | |
6708 | +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ | |
6709 | +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ | |
6710 | +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ | |
6711 | + | |
6712 | +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ | |
6713 | +static DEFINE_MUTEX(microcode_mutex); | |
6714 | + | |
6715 | +static int microcode_open (struct inode *unused1, struct file *unused2) | |
6716 | +{ | |
6717 | + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | |
6718 | +} | |
6719 | + | |
6720 | + | |
6721 | +static int do_microcode_update (const void __user *ubuf, size_t len) | |
6722 | +{ | |
6723 | + int err; | |
6724 | + void *kbuf; | |
6725 | + | |
6726 | + kbuf = vmalloc(len); | |
6727 | + if (!kbuf) | |
6728 | + return -ENOMEM; | |
6729 | + | |
6730 | + if (copy_from_user(kbuf, ubuf, len) == 0) { | |
6731 | + struct xen_platform_op op; | |
6732 | + | |
6733 | + op.cmd = XENPF_microcode_update; | |
6734 | + set_xen_guest_handle(op.u.microcode.data, kbuf); | |
6735 | + op.u.microcode.length = len; | |
6736 | + err = HYPERVISOR_platform_op(&op); | |
6737 | + } else | |
6738 | + err = -EFAULT; | |
6739 | + | |
6740 | + vfree(kbuf); | |
6741 | + | |
6742 | + return err; | |
6743 | +} | |
6744 | + | |
6745 | +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) | |
6746 | +{ | |
6747 | + ssize_t ret; | |
6748 | + | |
6749 | + if (len < MC_HEADER_SIZE) { | |
6750 | + printk(KERN_ERR "microcode: not enough data\n"); | |
6751 | + return -EINVAL; | |
6752 | + } | |
6753 | + | |
6754 | + mutex_lock(µcode_mutex); | |
6755 | + | |
6756 | + ret = do_microcode_update(buf, len); | |
6757 | + if (!ret) | |
6758 | + ret = (ssize_t)len; | |
6759 | + | |
6760 | + mutex_unlock(µcode_mutex); | |
6761 | + | |
6762 | + return ret; | |
6763 | +} | |
6764 | + | |
6765 | +static struct file_operations microcode_fops = { | |
6766 | + .owner = THIS_MODULE, | |
6767 | + .write = microcode_write, | |
6768 | + .open = microcode_open, | |
6769 | +}; | |
6770 | + | |
6771 | +static struct miscdevice microcode_dev = { | |
6772 | + .minor = MICROCODE_MINOR, | |
6773 | + .name = "microcode", | |
6774 | + .fops = µcode_fops, | |
6775 | +}; | |
6776 | + | |
6777 | +static int __init microcode_init (void) | |
6778 | +{ | |
6779 | + int error; | |
6780 | + | |
6781 | + error = misc_register(µcode_dev); | |
6782 | + if (error) { | |
6783 | + printk(KERN_ERR | |
6784 | + "microcode: can't misc_register on minor=%d\n", | |
6785 | + MICROCODE_MINOR); | |
6786 | + return error; | |
6787 | + } | |
6788 | + | |
6789 | + printk(KERN_INFO | |
6790 | + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n"); | |
6791 | + return 0; | |
6792 | +} | |
6793 | + | |
6794 | +static void __exit microcode_exit (void) | |
6795 | +{ | |
6796 | + misc_deregister(µcode_dev); | |
6797 | +} | |
6798 | + | |
6799 | +module_init(microcode_init) | |
6800 | +module_exit(microcode_exit) | |
6801 | +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | |
6802 | Index: head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c | |
6803 | =================================================================== | |
6804 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
6805 | +++ head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200 | |
6806 | @@ -0,0 +1,1185 @@ | |
6807 | +/* | |
6808 | + * Intel Multiprocessor Specification 1.1 and 1.4 | |
6809 | + * compliant MP-table parsing routines. | |
6810 | + * | |
6811 | + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | |
6812 | + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | |
6813 | + * | |
6814 | + * Fixes | |
6815 | + * Erich Boleyn : MP v1.4 and additional changes. | |
6816 | + * Alan Cox : Added EBDA scanning | |
6817 | + * Ingo Molnar : various cleanups and rewrites | |
6818 | + * Maciej W. Rozycki: Bits for default MP configurations | |
6819 | + * Paul Diefenbaugh: Added full ACPI support | |
6820 | + */ | |
6821 | + | |
6822 | +#include <linux/mm.h> | |
6823 | +#include <linux/init.h> | |
6824 | +#include <linux/acpi.h> | |
6825 | +#include <linux/delay.h> | |
6826 | +#include <linux/bootmem.h> | |
6827 | +#include <linux/smp_lock.h> | |
6828 | +#include <linux/kernel_stat.h> | |
6829 | +#include <linux/mc146818rtc.h> | |
6830 | +#include <linux/bitops.h> | |
6831 | + | |
6832 | +#include <asm/smp.h> | |
6833 | +#include <asm/acpi.h> | |
6834 | +#include <asm/mtrr.h> | |
6835 | +#include <asm/mpspec.h> | |
6836 | +#include <asm/io_apic.h> | |
6837 | + | |
6838 | +#include <mach_apic.h> | |
6839 | +#include <mach_mpparse.h> | |
6840 | +#include <bios_ebda.h> | |
6841 | + | |
6842 | +/* Have we found an MP table */ | |
6843 | +int smp_found_config; | |
6844 | +unsigned int __initdata maxcpus = NR_CPUS; | |
6845 | + | |
6846 | +/* | |
6847 | + * Various Linux-internal data structures created from the | |
6848 | + * MP-table. | |
6849 | + */ | |
6850 | +int apic_version [MAX_APICS]; | |
6851 | +int mp_bus_id_to_type [MAX_MP_BUSSES]; | |
6852 | +int mp_bus_id_to_node [MAX_MP_BUSSES]; | |
6853 | +int mp_bus_id_to_local [MAX_MP_BUSSES]; | |
6854 | +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; | |
6855 | +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | |
6856 | +static int mp_current_pci_id; | |
6857 | + | |
6858 | +/* I/O APIC entries */ | |
6859 | +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | |
6860 | + | |
6861 | +/* # of MP IRQ source entries */ | |
6862 | +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | |
6863 | + | |
6864 | +/* MP IRQ source entries */ | |
6865 | +int mp_irq_entries; | |
6866 | + | |
6867 | +int nr_ioapics; | |
6868 | + | |
6869 | +int pic_mode; | |
6870 | +unsigned long mp_lapic_addr; | |
6871 | + | |
6872 | +unsigned int def_to_bigsmp = 0; | |
6873 | + | |
6874 | +/* Processor that is doing the boot up */ | |
6875 | +unsigned int boot_cpu_physical_apicid = -1U; | |
6876 | +/* Internal processor count */ | |
6877 | +static unsigned int __devinitdata num_processors; | |
6878 | + | |
6879 | +/* Bitmask of physically existing CPUs */ | |
6880 | +physid_mask_t phys_cpu_present_map; | |
6881 | + | |
6882 | +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
6883 | + | |
6884 | +/* | |
6885 | + * Intel MP BIOS table parsing routines: | |
6886 | + */ | |
6887 | + | |
6888 | + | |
6889 | +/* | |
6890 | + * Checksum an MP configuration block. | |
6891 | + */ | |
6892 | + | |
6893 | +static int __init mpf_checksum(unsigned char *mp, int len) | |
6894 | +{ | |
6895 | + int sum = 0; | |
6896 | + | |
6897 | + while (len--) | |
6898 | + sum += *mp++; | |
6899 | + | |
6900 | + return sum & 0xFF; | |
6901 | +} | |
6902 | + | |
6903 | +/* | |
6904 | + * Have to match translation table entries to main table entries by counter | |
6905 | + * hence the mpc_record variable .... can't see a less disgusting way of | |
6906 | + * doing this .... | |
6907 | + */ | |
6908 | + | |
6909 | +static int mpc_record; | |
6910 | +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; | |
6911 | + | |
6912 | +#ifndef CONFIG_XEN | |
6913 | +static void __devinit MP_processor_info (struct mpc_config_processor *m) | |
6914 | +{ | |
6915 | + int ver, apicid; | |
6916 | + physid_mask_t phys_cpu; | |
6917 | + | |
6918 | + if (!(m->mpc_cpuflag & CPU_ENABLED)) | |
6919 | + return; | |
6920 | + | |
6921 | + apicid = mpc_apic_id(m, translation_table[mpc_record]); | |
6922 | + | |
6923 | + if (m->mpc_featureflag&(1<<0)) | |
6924 | + Dprintk(" Floating point unit present.\n"); | |
6925 | + if (m->mpc_featureflag&(1<<7)) | |
6926 | + Dprintk(" Machine Exception supported.\n"); | |
6927 | + if (m->mpc_featureflag&(1<<8)) | |
6928 | + Dprintk(" 64 bit compare & exchange supported.\n"); | |
6929 | + if (m->mpc_featureflag&(1<<9)) | |
6930 | + Dprintk(" Internal APIC present.\n"); | |
6931 | + if (m->mpc_featureflag&(1<<11)) | |
6932 | + Dprintk(" SEP present.\n"); | |
6933 | + if (m->mpc_featureflag&(1<<12)) | |
6934 | + Dprintk(" MTRR present.\n"); | |
6935 | + if (m->mpc_featureflag&(1<<13)) | |
6936 | + Dprintk(" PGE present.\n"); | |
6937 | + if (m->mpc_featureflag&(1<<14)) | |
6938 | + Dprintk(" MCA present.\n"); | |
6939 | + if (m->mpc_featureflag&(1<<15)) | |
6940 | + Dprintk(" CMOV present.\n"); | |
6941 | + if (m->mpc_featureflag&(1<<16)) | |
6942 | + Dprintk(" PAT present.\n"); | |
6943 | + if (m->mpc_featureflag&(1<<17)) | |
6944 | + Dprintk(" PSE present.\n"); | |
6945 | + if (m->mpc_featureflag&(1<<18)) | |
6946 | + Dprintk(" PSN present.\n"); | |
6947 | + if (m->mpc_featureflag&(1<<19)) | |
6948 | + Dprintk(" Cache Line Flush Instruction present.\n"); | |
6949 | + /* 20 Reserved */ | |
6950 | + if (m->mpc_featureflag&(1<<21)) | |
6951 | + Dprintk(" Debug Trace and EMON Store present.\n"); | |
6952 | + if (m->mpc_featureflag&(1<<22)) | |
6953 | + Dprintk(" ACPI Thermal Throttle Registers present.\n"); | |
6954 | + if (m->mpc_featureflag&(1<<23)) | |
6955 | + Dprintk(" MMX present.\n"); | |
6956 | + if (m->mpc_featureflag&(1<<24)) | |
6957 | + Dprintk(" FXSR present.\n"); | |
6958 | + if (m->mpc_featureflag&(1<<25)) | |
6959 | + Dprintk(" XMM present.\n"); | |
6960 | + if (m->mpc_featureflag&(1<<26)) | |
6961 | + Dprintk(" Willamette New Instructions present.\n"); | |
6962 | + if (m->mpc_featureflag&(1<<27)) | |
6963 | + Dprintk(" Self Snoop present.\n"); | |
6964 | + if (m->mpc_featureflag&(1<<28)) | |
6965 | + Dprintk(" HT present.\n"); | |
6966 | + if (m->mpc_featureflag&(1<<29)) | |
6967 | + Dprintk(" Thermal Monitor present.\n"); | |
6968 | + /* 30, 31 Reserved */ | |
6969 | + | |
6970 | + | |
6971 | + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | |
6972 | + Dprintk(" Bootup CPU\n"); | |
6973 | + boot_cpu_physical_apicid = m->mpc_apicid; | |
6974 | + } | |
6975 | + | |
6976 | + ver = m->mpc_apicver; | |
6977 | + | |
6978 | + /* | |
6979 | + * Validate version | |
6980 | + */ | |
6981 | + if (ver == 0x0) { | |
6982 | + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " | |
6983 | + "fixing up to 0x10. (tell your hw vendor)\n", | |
6984 | + m->mpc_apicid); | |
6985 | + ver = 0x10; | |
6986 | + } | |
6987 | + apic_version[m->mpc_apicid] = ver; | |
6988 | + | |
6989 | + phys_cpu = apicid_to_cpu_present(apicid); | |
6990 | + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); | |
6991 | + | |
6992 | + if (num_processors >= NR_CPUS) { | |
6993 | + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | |
6994 | + " Processor ignored.\n", NR_CPUS); | |
6995 | + return; | |
6996 | + } | |
6997 | + | |
6998 | + if (num_processors >= maxcpus) { | |
6999 | + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." | |
7000 | + " Processor ignored.\n", maxcpus); | |
7001 | + return; | |
7002 | + } | |
7003 | + | |
7004 | + cpu_set(num_processors, cpu_possible_map); | |
7005 | + num_processors++; | |
7006 | + | |
7007 | + /* | |
7008 | + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | |
7009 | + * but we need to work other dependencies like SMP_SUSPEND etc | |
7010 | + * before this can be done without some confusion. | |
7011 | + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | |
7012 | + * - Ashok Raj <ashok.raj@intel.com> | |
7013 | + */ | |
7014 | + if (num_processors > 8) { | |
7015 | + switch (boot_cpu_data.x86_vendor) { | |
7016 | + case X86_VENDOR_INTEL: | |
7017 | + if (!APIC_XAPIC(ver)) { | |
7018 | + def_to_bigsmp = 0; | |
7019 | + break; | |
7020 | + } | |
7021 | + /* If P4 and above fall through */ | |
7022 | + case X86_VENDOR_AMD: | |
7023 | + def_to_bigsmp = 1; | |
7024 | + } | |
7025 | + } | |
7026 | + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; | |
7027 | +} | |
7028 | +#else | |
7029 | +void __init MP_processor_info (struct mpc_config_processor *m) | |
7030 | +{ | |
7031 | + num_processors++; | |
7032 | +} | |
7033 | +#endif /* CONFIG_XEN */ | |
7034 | + | |
7035 | +static void __init MP_bus_info (struct mpc_config_bus *m) | |
7036 | +{ | |
7037 | + char str[7]; | |
7038 | + | |
7039 | + memcpy(str, m->mpc_bustype, 6); | |
7040 | + str[6] = 0; | |
7041 | + | |
7042 | + mpc_oem_bus_info(m, str, translation_table[mpc_record]); | |
7043 | + | |
7044 | + if (m->mpc_busid >= MAX_MP_BUSSES) { | |
7045 | + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " | |
7046 | + " is too large, max. supported is %d\n", | |
7047 | + m->mpc_busid, str, MAX_MP_BUSSES - 1); | |
7048 | + return; | |
7049 | + } | |
7050 | + | |
7051 | + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { | |
7052 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | |
7053 | + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { | |
7054 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; | |
7055 | + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { | |
7056 | + mpc_oem_pci_bus(m, translation_table[mpc_record]); | |
7057 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | |
7058 | + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | |
7059 | + mp_current_pci_id++; | |
7060 | + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { | |
7061 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; | |
7062 | + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { | |
7063 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; | |
7064 | + } else { | |
7065 | + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); | |
7066 | + } | |
7067 | +} | |
7068 | + | |
7069 | +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | |
7070 | +{ | |
7071 | + if (!(m->mpc_flags & MPC_APIC_USABLE)) | |
7072 | + return; | |
7073 | + | |
7074 | + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", | |
7075 | + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | |
7076 | + if (nr_ioapics >= MAX_IO_APICS) { | |
7077 | + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", | |
7078 | + MAX_IO_APICS, nr_ioapics); | |
7079 | + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); | |
7080 | + } | |
7081 | + if (!m->mpc_apicaddr) { | |
7082 | + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" | |
7083 | + " found in MP table, skipping!\n"); | |
7084 | + return; | |
7085 | + } | |
7086 | + mp_ioapics[nr_ioapics] = *m; | |
7087 | + nr_ioapics++; | |
7088 | +} | |
7089 | + | |
7090 | +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | |
7091 | +{ | |
7092 | + mp_irqs [mp_irq_entries] = *m; | |
7093 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d," | |
7094 | + " IRQ %02x, APIC ID %x, APIC INT %02x\n", | |
7095 | + m->mpc_irqtype, m->mpc_irqflag & 3, | |
7096 | + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | |
7097 | + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | |
7098 | + if (++mp_irq_entries == MAX_IRQ_SOURCES) | |
7099 | + panic("Max # of irq sources exceeded!!\n"); | |
7100 | +} | |
7101 | + | |
7102 | +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | |
7103 | +{ | |
7104 | + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | |
7105 | + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | |
7106 | + m->mpc_irqtype, m->mpc_irqflag & 3, | |
7107 | + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | |
7108 | + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | |
7109 | + /* | |
7110 | + * Well it seems all SMP boards in existence | |
7111 | + * use ExtINT/LVT1 == LINT0 and | |
7112 | + * NMI/LVT2 == LINT1 - the following check | |
7113 | + * will show us if this assumptions is false. | |
7114 | + * Until then we do not have to add baggage. | |
7115 | + */ | |
7116 | + if ((m->mpc_irqtype == mp_ExtINT) && | |
7117 | + (m->mpc_destapiclint != 0)) | |
7118 | + BUG(); | |
7119 | + if ((m->mpc_irqtype == mp_NMI) && | |
7120 | + (m->mpc_destapiclint != 1)) | |
7121 | + BUG(); | |
7122 | +} | |
7123 | + | |
7124 | +#ifdef CONFIG_X86_NUMAQ | |
7125 | +static void __init MP_translation_info (struct mpc_config_translation *m) | |
7126 | +{ | |
7127 | + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); | |
7128 | + | |
7129 | + if (mpc_record >= MAX_MPC_ENTRY) | |
7130 | + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); | |
7131 | + else | |
7132 | + translation_table[mpc_record] = m; /* stash this for later */ | |
7133 | + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) | |
7134 | + node_set_online(m->trans_quad); | |
7135 | +} | |
7136 | + | |
7137 | +/* | |
7138 | + * Read/parse the MPC oem tables | |
7139 | + */ | |
7140 | + | |
7141 | +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ | |
7142 | + unsigned short oemsize) | |
7143 | +{ | |
7144 | + int count = sizeof (*oemtable); /* the header size */ | |
7145 | + unsigned char *oemptr = ((unsigned char *)oemtable)+count; | |
7146 | + | |
7147 | + mpc_record = 0; | |
7148 | + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); | |
7149 | + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) | |
7150 | + { | |
7151 | + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", | |
7152 | + oemtable->oem_signature[0], | |
7153 | + oemtable->oem_signature[1], | |
7154 | + oemtable->oem_signature[2], | |
7155 | + oemtable->oem_signature[3]); | |
7156 | + return; | |
7157 | + } | |
7158 | + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) | |
7159 | + { | |
7160 | + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); | |
7161 | + return; | |
7162 | + } | |
7163 | + while (count < oemtable->oem_length) { | |
7164 | + switch (*oemptr) { | |
7165 | + case MP_TRANSLATION: | |
7166 | + { | |
7167 | + struct mpc_config_translation *m= | |
7168 | + (struct mpc_config_translation *)oemptr; | |
7169 | + MP_translation_info(m); | |
7170 | + oemptr += sizeof(*m); | |
7171 | + count += sizeof(*m); | |
7172 | + ++mpc_record; | |
7173 | + break; | |
7174 | + } | |
7175 | + default: | |
7176 | + { | |
7177 | + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); | |
7178 | + return; | |
7179 | + } | |
7180 | + } | |
7181 | + } | |
7182 | +} | |
7183 | + | |
7184 | +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, | |
7185 | + char *productid) | |
7186 | +{ | |
7187 | + if (strncmp(oem, "IBM NUMA", 8)) | |
7188 | + printk("Warning! May not be a NUMA-Q system!\n"); | |
7189 | + if (mpc->mpc_oemptr) | |
7190 | + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, | |
7191 | + mpc->mpc_oemsize); | |
7192 | +} | |
7193 | +#endif /* CONFIG_X86_NUMAQ */ | |
7194 | + | |
7195 | +/* | |
7196 | + * Read/parse the MPC | |
7197 | + */ | |
7198 | + | |
7199 | +static int __init smp_read_mpc(struct mp_config_table *mpc) | |
7200 | +{ | |
7201 | + char str[16]; | |
7202 | + char oem[10]; | |
7203 | + int count=sizeof(*mpc); | |
7204 | + unsigned char *mpt=((unsigned char *)mpc)+count; | |
7205 | + | |
7206 | + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | |
7207 | + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", | |
7208 | + *(u32 *)mpc->mpc_signature); | |
7209 | + return 0; | |
7210 | + } | |
7211 | + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | |
7212 | + printk(KERN_ERR "SMP mptable: checksum error!\n"); | |
7213 | + return 0; | |
7214 | + } | |
7215 | + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | |
7216 | + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", | |
7217 | + mpc->mpc_spec); | |
7218 | + return 0; | |
7219 | + } | |
7220 | + if (!mpc->mpc_lapic) { | |
7221 | + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); | |
7222 | + return 0; | |
7223 | + } | |
7224 | + memcpy(oem,mpc->mpc_oem,8); | |
7225 | + oem[8]=0; | |
7226 | + printk(KERN_INFO "OEM ID: %s ",oem); | |
7227 | + | |
7228 | + memcpy(str,mpc->mpc_productid,12); | |
7229 | + str[12]=0; | |
7230 | + printk("Product ID: %s ",str); | |
7231 | + | |
7232 | + mps_oem_check(mpc, oem, str); | |
7233 | + | |
7234 | + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); | |
7235 | + | |
7236 | + /* | |
7237 | + * Save the local APIC address (it might be non-default) -- but only | |
7238 | + * if we're not using ACPI. | |
7239 | + */ | |
7240 | + if (!acpi_lapic) | |
7241 | + mp_lapic_addr = mpc->mpc_lapic; | |
7242 | + | |
7243 | + /* | |
7244 | + * Now process the configuration blocks. | |
7245 | + */ | |
7246 | + mpc_record = 0; | |
7247 | + while (count < mpc->mpc_length) { | |
7248 | + switch(*mpt) { | |
7249 | + case MP_PROCESSOR: | |
7250 | + { | |
7251 | + struct mpc_config_processor *m= | |
7252 | + (struct mpc_config_processor *)mpt; | |
7253 | + /* ACPI may have already provided this data */ | |
7254 | + if (!acpi_lapic) | |
7255 | + MP_processor_info(m); | |
7256 | + mpt += sizeof(*m); | |
7257 | + count += sizeof(*m); | |
7258 | + break; | |
7259 | + } | |
7260 | + case MP_BUS: | |
7261 | + { | |
7262 | + struct mpc_config_bus *m= | |
7263 | + (struct mpc_config_bus *)mpt; | |
7264 | + MP_bus_info(m); | |
7265 | + mpt += sizeof(*m); | |
7266 | + count += sizeof(*m); | |
7267 | + break; | |
7268 | + } | |
7269 | + case MP_IOAPIC: | |
7270 | + { | |
7271 | + struct mpc_config_ioapic *m= | |
7272 | + (struct mpc_config_ioapic *)mpt; | |
7273 | + MP_ioapic_info(m); | |
7274 | + mpt+=sizeof(*m); | |
7275 | + count+=sizeof(*m); | |
7276 | + break; | |
7277 | + } | |
7278 | + case MP_INTSRC: | |
7279 | + { | |
7280 | + struct mpc_config_intsrc *m= | |
7281 | + (struct mpc_config_intsrc *)mpt; | |
7282 | + | |
7283 | + MP_intsrc_info(m); | |
7284 | + mpt+=sizeof(*m); | |
7285 | + count+=sizeof(*m); | |
7286 | + break; | |
7287 | + } | |
7288 | + case MP_LINTSRC: | |
7289 | + { | |
7290 | + struct mpc_config_lintsrc *m= | |
7291 | + (struct mpc_config_lintsrc *)mpt; | |
7292 | + MP_lintsrc_info(m); | |
7293 | + mpt+=sizeof(*m); | |
7294 | + count+=sizeof(*m); | |
7295 | + break; | |
7296 | + } | |
7297 | + default: | |
7298 | + { | |
7299 | + count = mpc->mpc_length; | |
7300 | + break; | |
7301 | + } | |
7302 | + } | |
7303 | + ++mpc_record; | |
7304 | + } | |
7305 | + clustered_apic_check(); | |
7306 | + if (!num_processors) | |
7307 | + printk(KERN_ERR "SMP mptable: no processors registered!\n"); | |
7308 | + return num_processors; | |
7309 | +} | |
7310 | + | |
7311 | +static int __init ELCR_trigger(unsigned int irq) | |
7312 | +{ | |
7313 | + unsigned int port; | |
7314 | + | |
7315 | + port = 0x4d0 + (irq >> 3); | |
7316 | + return (inb(port) >> (irq & 7)) & 1; | |
7317 | +} | |
7318 | + | |
7319 | +static void __init construct_default_ioirq_mptable(int mpc_default_type) | |
7320 | +{ | |
7321 | + struct mpc_config_intsrc intsrc; | |
7322 | + int i; | |
7323 | + int ELCR_fallback = 0; | |
7324 | + | |
7325 | + intsrc.mpc_type = MP_INTSRC; | |
7326 | + intsrc.mpc_irqflag = 0; /* conforming */ | |
7327 | + intsrc.mpc_srcbus = 0; | |
7328 | + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | |
7329 | + | |
7330 | + intsrc.mpc_irqtype = mp_INT; | |
7331 | + | |
7332 | + /* | |
7333 | + * If true, we have an ISA/PCI system with no IRQ entries | |
7334 | + * in the MP table. To prevent the PCI interrupts from being set up | |
7335 | + * incorrectly, we try to use the ELCR. The sanity check to see if | |
7336 | + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | |
7337 | + * never be level sensitive, so we simply see if the ELCR agrees. | |
7338 | + * If it does, we assume it's valid. | |
7339 | + */ | |
7340 | + if (mpc_default_type == 5) { | |
7341 | + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | |
7342 | + | |
7343 | + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | |
7344 | + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); | |
7345 | + else { | |
7346 | + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | |
7347 | + ELCR_fallback = 1; | |
7348 | + } | |
7349 | + } | |
7350 | + | |
7351 | + for (i = 0; i < 16; i++) { | |
7352 | + switch (mpc_default_type) { | |
7353 | + case 2: | |
7354 | + if (i == 0 || i == 13) | |
7355 | + continue; /* IRQ0 & IRQ13 not connected */ | |
7356 | + /* fall through */ | |
7357 | + default: | |
7358 | + if (i == 2) | |
7359 | + continue; /* IRQ2 is never connected */ | |
7360 | + } | |
7361 | + | |
7362 | + if (ELCR_fallback) { | |
7363 | + /* | |
7364 | + * If the ELCR indicates a level-sensitive interrupt, we | |
7365 | + * copy that information over to the MP table in the | |
7366 | + * irqflag field (level sensitive, active high polarity). | |
7367 | + */ | |
7368 | + if (ELCR_trigger(i)) | |
7369 | + intsrc.mpc_irqflag = 13; | |
7370 | + else | |
7371 | + intsrc.mpc_irqflag = 0; | |
7372 | + } | |
7373 | + | |
7374 | + intsrc.mpc_srcbusirq = i; | |
7375 | + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | |
7376 | + MP_intsrc_info(&intsrc); | |
7377 | + } | |
7378 | + | |
7379 | + intsrc.mpc_irqtype = mp_ExtINT; | |
7380 | + intsrc.mpc_srcbusirq = 0; | |
7381 | + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | |
7382 | + MP_intsrc_info(&intsrc); | |
7383 | +} | |
7384 | + | |
7385 | +static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |
7386 | +{ | |
7387 | + struct mpc_config_processor processor; | |
7388 | + struct mpc_config_bus bus; | |
7389 | + struct mpc_config_ioapic ioapic; | |
7390 | + struct mpc_config_lintsrc lintsrc; | |
7391 | + int linttypes[2] = { mp_ExtINT, mp_NMI }; | |
7392 | + int i; | |
7393 | + | |
7394 | + /* | |
7395 | + * local APIC has default address | |
7396 | + */ | |
7397 | + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | |
7398 | + | |
7399 | + /* | |
7400 | + * 2 CPUs, numbered 0 & 1. | |
7401 | + */ | |
7402 | + processor.mpc_type = MP_PROCESSOR; | |
7403 | + /* Either an integrated APIC or a discrete 82489DX. */ | |
7404 | + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | |
7405 | + processor.mpc_cpuflag = CPU_ENABLED; | |
7406 | + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | |
7407 | + (boot_cpu_data.x86_model << 4) | | |
7408 | + boot_cpu_data.x86_mask; | |
7409 | + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | |
7410 | + processor.mpc_reserved[0] = 0; | |
7411 | + processor.mpc_reserved[1] = 0; | |
7412 | + for (i = 0; i < 2; i++) { | |
7413 | + processor.mpc_apicid = i; | |
7414 | + MP_processor_info(&processor); | |
7415 | + } | |
7416 | + | |
7417 | + bus.mpc_type = MP_BUS; | |
7418 | + bus.mpc_busid = 0; | |
7419 | + switch (mpc_default_type) { | |
7420 | + default: | |
7421 | + printk("???\n"); | |
7422 | + printk(KERN_ERR "Unknown standard configuration %d\n", | |
7423 | + mpc_default_type); | |
7424 | + /* fall through */ | |
7425 | + case 1: | |
7426 | + case 5: | |
7427 | + memcpy(bus.mpc_bustype, "ISA ", 6); | |
7428 | + break; | |
7429 | + case 2: | |
7430 | + case 6: | |
7431 | + case 3: | |
7432 | + memcpy(bus.mpc_bustype, "EISA ", 6); | |
7433 | + break; | |
7434 | + case 4: | |
7435 | + case 7: | |
7436 | + memcpy(bus.mpc_bustype, "MCA ", 6); | |
7437 | + } | |
7438 | + MP_bus_info(&bus); | |
7439 | + if (mpc_default_type > 4) { | |
7440 | + bus.mpc_busid = 1; | |
7441 | + memcpy(bus.mpc_bustype, "PCI ", 6); | |
7442 | + MP_bus_info(&bus); | |
7443 | + } | |
7444 | + | |
7445 | + ioapic.mpc_type = MP_IOAPIC; | |
7446 | + ioapic.mpc_apicid = 2; | |
7447 | + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | |
7448 | + ioapic.mpc_flags = MPC_APIC_USABLE; | |
7449 | + ioapic.mpc_apicaddr = 0xFEC00000; | |
7450 | + MP_ioapic_info(&ioapic); | |
7451 | + | |
7452 | + /* | |
7453 | + * We set up most of the low 16 IO-APIC pins according to MPS rules. | |
7454 | + */ | |
7455 | + construct_default_ioirq_mptable(mpc_default_type); | |
7456 | + | |
7457 | + lintsrc.mpc_type = MP_LINTSRC; | |
7458 | + lintsrc.mpc_irqflag = 0; /* conforming */ | |
7459 | + lintsrc.mpc_srcbusid = 0; | |
7460 | + lintsrc.mpc_srcbusirq = 0; | |
7461 | + lintsrc.mpc_destapic = MP_APIC_ALL; | |
7462 | + for (i = 0; i < 2; i++) { | |
7463 | + lintsrc.mpc_irqtype = linttypes[i]; | |
7464 | + lintsrc.mpc_destapiclint = i; | |
7465 | + MP_lintsrc_info(&lintsrc); | |
7466 | + } | |
7467 | +} | |
7468 | + | |
7469 | +static struct intel_mp_floating *mpf_found; | |
7470 | + | |
7471 | +/* | |
7472 | + * Scan the memory blocks for an SMP configuration block. | |
7473 | + */ | |
7474 | +void __init get_smp_config (void) | |
7475 | +{ | |
7476 | + struct intel_mp_floating *mpf = mpf_found; | |
7477 | + | |
7478 | + /* | |
7479 | + * ACPI supports both logical (e.g. Hyper-Threading) and physical | |
7480 | + * processors, where MPS only supports physical. | |
7481 | + */ | |
7482 | + if (acpi_lapic && acpi_ioapic) { | |
7483 | + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | |
7484 | + return; | |
7485 | + } | |
7486 | + else if (acpi_lapic) | |
7487 | + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | |
7488 | + | |
7489 | + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | |
7490 | + if (mpf->mpf_feature2 & (1<<7)) { | |
7491 | + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | |
7492 | + pic_mode = 1; | |
7493 | + } else { | |
7494 | + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); | |
7495 | + pic_mode = 0; | |
7496 | + } | |
7497 | + | |
7498 | + /* | |
7499 | + * Now see if we need to read further. | |
7500 | + */ | |
7501 | + if (mpf->mpf_feature1 != 0) { | |
7502 | + | |
7503 | + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | |
7504 | + construct_default_ISA_mptable(mpf->mpf_feature1); | |
7505 | + | |
7506 | + } else if (mpf->mpf_physptr) { | |
7507 | + | |
7508 | + /* | |
7509 | + * Read the physical hardware table. Anything here will | |
7510 | + * override the defaults. | |
7511 | + */ | |
7512 | + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { | |
7513 | + smp_found_config = 0; | |
7514 | + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | |
7515 | + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | |
7516 | + return; | |
7517 | + } | |
7518 | + /* | |
7519 | + * If there are no explicit MP IRQ entries, then we are | |
7520 | + * broken. We set up most of the low 16 IO-APIC pins to | |
7521 | + * ISA defaults and hope it will work. | |
7522 | + */ | |
7523 | + if (!mp_irq_entries) { | |
7524 | + struct mpc_config_bus bus; | |
7525 | + | |
7526 | + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | |
7527 | + | |
7528 | + bus.mpc_type = MP_BUS; | |
7529 | + bus.mpc_busid = 0; | |
7530 | + memcpy(bus.mpc_bustype, "ISA ", 6); | |
7531 | + MP_bus_info(&bus); | |
7532 | + | |
7533 | + construct_default_ioirq_mptable(0); | |
7534 | + } | |
7535 | + | |
7536 | + } else | |
7537 | + BUG(); | |
7538 | + | |
7539 | + printk(KERN_INFO "Processors: %d\n", num_processors); | |
7540 | + /* | |
7541 | + * Only use the first configuration found. | |
7542 | + */ | |
7543 | +} | |
7544 | + | |
7545 | +static int __init smp_scan_config (unsigned long base, unsigned long length) | |
7546 | +{ | |
7547 | + unsigned long *bp = isa_bus_to_virt(base); | |
7548 | + struct intel_mp_floating *mpf; | |
7549 | + | |
7550 | + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | |
7551 | + if (sizeof(*mpf) != 16) | |
7552 | + printk("Error: MPF size\n"); | |
7553 | + | |
7554 | + while (length > 0) { | |
7555 | + mpf = (struct intel_mp_floating *)bp; | |
7556 | + if ((*bp == SMP_MAGIC_IDENT) && | |
7557 | + (mpf->mpf_length == 1) && | |
7558 | + !mpf_checksum((unsigned char *)bp, 16) && | |
7559 | + ((mpf->mpf_specification == 1) | |
7560 | + || (mpf->mpf_specification == 4)) ) { | |
7561 | + | |
7562 | + smp_found_config = 1; | |
7563 | +#ifndef CONFIG_XEN | |
7564 | + printk(KERN_INFO "found SMP MP-table at %08lx\n", | |
7565 | + virt_to_phys(mpf)); | |
7566 | + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); | |
7567 | + if (mpf->mpf_physptr) { | |
7568 | + /* | |
7569 | + * We cannot access to MPC table to compute | |
7570 | + * table size yet, as only few megabytes from | |
7571 | + * the bottom is mapped now. | |
7572 | + * PC-9800's MPC table places on the very last | |
7573 | + * of physical memory; so that simply reserving | |
7574 | + * PAGE_SIZE from mpg->mpf_physptr yields BUG() | |
7575 | + * in reserve_bootmem. | |
7576 | + */ | |
7577 | + unsigned long size = PAGE_SIZE; | |
7578 | + unsigned long end = max_low_pfn * PAGE_SIZE; | |
7579 | + if (mpf->mpf_physptr + size > end) | |
7580 | + size = end - mpf->mpf_physptr; | |
7581 | + reserve_bootmem(mpf->mpf_physptr, size); | |
7582 | + } | |
7583 | +#else | |
7584 | + printk(KERN_INFO "found SMP MP-table at %08lx\n", | |
7585 | + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); | |
7586 | +#endif | |
7587 | + | |
7588 | + mpf_found = mpf; | |
7589 | + return 1; | |
7590 | + } | |
7591 | + bp += 4; | |
7592 | + length -= 16; | |
7593 | + } | |
7594 | + return 0; | |
7595 | +} | |
7596 | + | |
7597 | +void __init find_smp_config (void) | |
7598 | +{ | |
7599 | +#ifndef CONFIG_XEN | |
7600 | + unsigned int address; | |
7601 | +#endif | |
7602 | + | |
7603 | + /* | |
7604 | + * FIXME: Linux assumes you have 640K of base ram.. | |
7605 | + * this continues the error... | |
7606 | + * | |
7607 | + * 1) Scan the bottom 1K for a signature | |
7608 | + * 2) Scan the top 1K of base RAM | |
7609 | + * 3) Scan the 64K of bios | |
7610 | + */ | |
7611 | + if (smp_scan_config(0x0,0x400) || | |
7612 | + smp_scan_config(639*0x400,0x400) || | |
7613 | + smp_scan_config(0xF0000,0x10000)) | |
7614 | + return; | |
7615 | + /* | |
7616 | + * If it is an SMP machine we should know now, unless the | |
7617 | + * configuration is in an EISA/MCA bus machine with an | |
7618 | + * extended bios data area. | |
7619 | + * | |
7620 | + * there is a real-mode segmented pointer pointing to the | |
7621 | + * 4K EBDA area at 0x40E, calculate and scan it here. | |
7622 | + * | |
7623 | + * NOTE! There are Linux loaders that will corrupt the EBDA | |
7624 | + * area, and as such this kind of SMP config may be less | |
7625 | + * trustworthy, simply because the SMP table may have been | |
7626 | + * stomped on during early boot. These loaders are buggy and | |
7627 | + * should be fixed. | |
7628 | + * | |
7629 | + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. | |
7630 | + */ | |
7631 | + | |
7632 | +#ifndef CONFIG_XEN | |
7633 | + address = get_bios_ebda(); | |
7634 | + if (address) | |
7635 | + smp_scan_config(address, 0x400); | |
7636 | +#endif | |
7637 | +} | |
7638 | + | |
7639 | +int es7000_plat; | |
7640 | + | |
7641 | +/* -------------------------------------------------------------------------- | |
7642 | + ACPI-based MP Configuration | |
7643 | + -------------------------------------------------------------------------- */ | |
7644 | + | |
7645 | +#ifdef CONFIG_ACPI | |
7646 | + | |
7647 | +void __init mp_register_lapic_address ( | |
7648 | + u64 address) | |
7649 | +{ | |
7650 | +#ifndef CONFIG_XEN | |
7651 | + mp_lapic_addr = (unsigned long) address; | |
7652 | + | |
7653 | + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | |
7654 | + | |
7655 | + if (boot_cpu_physical_apicid == -1U) | |
7656 | + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | |
7657 | + | |
7658 | + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); | |
7659 | +#endif | |
7660 | +} | |
7661 | + | |
7662 | + | |
7663 | +void __devinit mp_register_lapic ( | |
7664 | + u8 id, | |
7665 | + u8 enabled) | |
7666 | +{ | |
7667 | + struct mpc_config_processor processor; | |
7668 | + int boot_cpu = 0; | |
7669 | + | |
7670 | + if (MAX_APICS - id <= 0) { | |
7671 | + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", | |
7672 | + id, MAX_APICS); | |
7673 | + return; | |
7674 | + } | |
7675 | + | |
7676 | + if (id == boot_cpu_physical_apicid) | |
7677 | + boot_cpu = 1; | |
7678 | + | |
7679 | +#ifndef CONFIG_XEN | |
7680 | + processor.mpc_type = MP_PROCESSOR; | |
7681 | + processor.mpc_apicid = id; | |
7682 | + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); | |
7683 | + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | |
7684 | + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | |
7685 | + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | |
7686 | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | |
7687 | + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | |
7688 | + processor.mpc_reserved[0] = 0; | |
7689 | + processor.mpc_reserved[1] = 0; | |
7690 | +#endif | |
7691 | + | |
7692 | + MP_processor_info(&processor); | |
7693 | +} | |
7694 | + | |
7695 | +#ifdef CONFIG_X86_IO_APIC | |
7696 | + | |
7697 | +#define MP_ISA_BUS 0 | |
7698 | +#define MP_MAX_IOAPIC_PIN 127 | |
7699 | + | |
7700 | +static struct mp_ioapic_routing { | |
7701 | + int apic_id; | |
7702 | + int gsi_base; | |
7703 | + int gsi_end; | |
7704 | + u32 pin_programmed[4]; | |
7705 | +} mp_ioapic_routing[MAX_IO_APICS]; | |
7706 | + | |
7707 | + | |
7708 | +static int mp_find_ioapic ( | |
7709 | + int gsi) | |
7710 | +{ | |
7711 | + int i = 0; | |
7712 | + | |
7713 | + /* Find the IOAPIC that manages this GSI. */ | |
7714 | + for (i = 0; i < nr_ioapics; i++) { | |
7715 | + if ((gsi >= mp_ioapic_routing[i].gsi_base) | |
7716 | + && (gsi <= mp_ioapic_routing[i].gsi_end)) | |
7717 | + return i; | |
7718 | + } | |
7719 | + | |
7720 | + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | |
7721 | + | |
7722 | + return -1; | |
7723 | +} | |
7724 | + | |
7725 | + | |
7726 | +void __init mp_register_ioapic ( | |
7727 | + u8 id, | |
7728 | + u32 address, | |
7729 | + u32 gsi_base) | |
7730 | +{ | |
7731 | + int idx = 0; | |
7732 | + int tmpid; | |
7733 | + | |
7734 | + if (nr_ioapics >= MAX_IO_APICS) { | |
7735 | + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | |
7736 | + "(found %d)\n", MAX_IO_APICS, nr_ioapics); | |
7737 | + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | |
7738 | + } | |
7739 | + if (!address) { | |
7740 | + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | |
7741 | + " found in MADT table, skipping!\n"); | |
7742 | + return; | |
7743 | + } | |
7744 | + | |
7745 | + idx = nr_ioapics++; | |
7746 | + | |
7747 | + mp_ioapics[idx].mpc_type = MP_IOAPIC; | |
7748 | + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | |
7749 | + mp_ioapics[idx].mpc_apicaddr = address; | |
7750 | + | |
7751 | +#ifndef CONFIG_XEN | |
7752 | + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | |
7753 | +#endif | |
7754 | + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | |
7755 | + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | |
7756 | + tmpid = io_apic_get_unique_id(idx, id); | |
7757 | + else | |
7758 | + tmpid = id; | |
7759 | + if (tmpid == -1) { | |
7760 | + nr_ioapics--; | |
7761 | + return; | |
7762 | + } | |
7763 | + mp_ioapics[idx].mpc_apicid = tmpid; | |
7764 | + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | |
7765 | + | |
7766 | + /* | |
7767 | + * Build basic GSI lookup table to facilitate gsi->io_apic lookups | |
7768 | + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | |
7769 | + */ | |
7770 | + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | |
7771 | + mp_ioapic_routing[idx].gsi_base = gsi_base; | |
7772 | + mp_ioapic_routing[idx].gsi_end = gsi_base + | |
7773 | + io_apic_get_redir_entries(idx); | |
7774 | + | |
7775 | + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " | |
7776 | + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | |
7777 | + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | |
7778 | + mp_ioapic_routing[idx].gsi_base, | |
7779 | + mp_ioapic_routing[idx].gsi_end); | |
7780 | + | |
7781 | + return; | |
7782 | +} | |
7783 | + | |
7784 | + | |
7785 | +void __init mp_override_legacy_irq ( | |
7786 | + u8 bus_irq, | |
7787 | + u8 polarity, | |
7788 | + u8 trigger, | |
7789 | + u32 gsi) | |
7790 | +{ | |
7791 | + struct mpc_config_intsrc intsrc; | |
7792 | + int ioapic = -1; | |
7793 | + int pin = -1; | |
7794 | + | |
7795 | + /* | |
7796 | + * Convert 'gsi' to 'ioapic.pin'. | |
7797 | + */ | |
7798 | + ioapic = mp_find_ioapic(gsi); | |
7799 | + if (ioapic < 0) | |
7800 | + return; | |
7801 | + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | |
7802 | + | |
7803 | + /* | |
7804 | + * TBD: This check is for faulty timer entries, where the override | |
7805 | + * erroneously sets the trigger to level, resulting in a HUGE | |
7806 | + * increase of timer interrupts! | |
7807 | + */ | |
7808 | + if ((bus_irq == 0) && (trigger == 3)) | |
7809 | + trigger = 1; | |
7810 | + | |
7811 | + intsrc.mpc_type = MP_INTSRC; | |
7812 | + intsrc.mpc_irqtype = mp_INT; | |
7813 | + intsrc.mpc_irqflag = (trigger << 2) | polarity; | |
7814 | + intsrc.mpc_srcbus = MP_ISA_BUS; | |
7815 | + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | |
7816 | + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | |
7817 | + intsrc.mpc_dstirq = pin; /* INTIN# */ | |
7818 | + | |
7819 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | |
7820 | + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | |
7821 | + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | |
7822 | + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | |
7823 | + | |
7824 | + mp_irqs[mp_irq_entries] = intsrc; | |
7825 | + if (++mp_irq_entries == MAX_IRQ_SOURCES) | |
7826 | + panic("Max # of irq sources exceeded!\n"); | |
7827 | + | |
7828 | + return; | |
7829 | +} | |
7830 | + | |
7831 | +void __init mp_config_acpi_legacy_irqs (void) | |
7832 | +{ | |
7833 | + struct mpc_config_intsrc intsrc; | |
7834 | + int i = 0; | |
7835 | + int ioapic = -1; | |
7836 | + | |
7837 | + /* | |
7838 | + * Fabricate the legacy ISA bus (bus #31). | |
7839 | + */ | |
7840 | + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | |
7841 | + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | |
7842 | + | |
7843 | + /* | |
7844 | + * Older generations of ES7000 have no legacy identity mappings | |
7845 | + */ | |
7846 | + if (es7000_plat == 1) | |
7847 | + return; | |
7848 | + | |
7849 | + /* | |
7850 | + * Locate the IOAPIC that manages the ISA IRQs (0-15). | |
7851 | + */ | |
7852 | + ioapic = mp_find_ioapic(0); | |
7853 | + if (ioapic < 0) | |
7854 | + return; | |
7855 | + | |
7856 | + intsrc.mpc_type = MP_INTSRC; | |
7857 | + intsrc.mpc_irqflag = 0; /* Conforming */ | |
7858 | + intsrc.mpc_srcbus = MP_ISA_BUS; | |
7859 | + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | |
7860 | + | |
7861 | + /* | |
7862 | + * Use the default configuration for the IRQs 0-15. Unless | |
7863 | + * overriden by (MADT) interrupt source override entries. | |
7864 | + */ | |
7865 | + for (i = 0; i < 16; i++) { | |
7866 | + int idx; | |
7867 | + | |
7868 | + for (idx = 0; idx < mp_irq_entries; idx++) { | |
7869 | + struct mpc_config_intsrc *irq = mp_irqs + idx; | |
7870 | + | |
7871 | + /* Do we already have a mapping for this ISA IRQ? */ | |
7872 | + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | |
7873 | + break; | |
7874 | + | |
7875 | + /* Do we already have a mapping for this IOAPIC pin */ | |
7876 | + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | |
7877 | + (irq->mpc_dstirq == i)) | |
7878 | + break; | |
7879 | + } | |
7880 | + | |
7881 | + if (idx != mp_irq_entries) { | |
7882 | + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | |
7883 | + continue; /* IRQ already used */ | |
7884 | + } | |
7885 | + | |
7886 | + intsrc.mpc_irqtype = mp_INT; | |
7887 | + intsrc.mpc_srcbusirq = i; /* Identity mapped */ | |
7888 | + intsrc.mpc_dstirq = i; | |
7889 | + | |
7890 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | |
7891 | + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | |
7892 | + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | |
7893 | + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | |
7894 | + intsrc.mpc_dstirq); | |
7895 | + | |
7896 | + mp_irqs[mp_irq_entries] = intsrc; | |
7897 | + if (++mp_irq_entries == MAX_IRQ_SOURCES) | |
7898 | + panic("Max # of irq sources exceeded!\n"); | |
7899 | + } | |
7900 | +} | |
7901 | + | |
7902 | +#define MAX_GSI_NUM 4096 | |
7903 | + | |
7904 | +int mp_register_gsi (u32 gsi, int triggering, int polarity) | |
7905 | +{ | |
7906 | + int ioapic = -1; | |
7907 | + int ioapic_pin = 0; | |
7908 | + int idx, bit = 0; | |
7909 | + static int pci_irq = 16; | |
7910 | + /* | |
7911 | + * Mapping between Global System Interrups, which | |
7912 | + * represent all possible interrupts, and IRQs | |
7913 | + * assigned to actual devices. | |
7914 | + */ | |
7915 | + static int gsi_to_irq[MAX_GSI_NUM]; | |
7916 | + | |
7917 | + /* Don't set up the ACPI SCI because it's already set up */ | |
7918 | + if (acpi_fadt.sci_int == gsi) | |
7919 | + return gsi; | |
7920 | + | |
7921 | + ioapic = mp_find_ioapic(gsi); | |
7922 | + if (ioapic < 0) { | |
7923 | + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | |
7924 | + return gsi; | |
7925 | + } | |
7926 | + | |
7927 | + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | |
7928 | + | |
7929 | + if (ioapic_renumber_irq) | |
7930 | + gsi = ioapic_renumber_irq(ioapic, gsi); | |
7931 | + | |
7932 | + /* | |
7933 | + * Avoid pin reprogramming. PRTs typically include entries | |
7934 | + * with redundant pin->gsi mappings (but unique PCI devices); | |
7935 | + * we only program the IOAPIC on the first. | |
7936 | + */ | |
7937 | + bit = ioapic_pin % 32; | |
7938 | + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | |
7939 | + if (idx > 3) { | |
7940 | + printk(KERN_ERR "Invalid reference to IOAPIC pin " | |
7941 | + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | |
7942 | + ioapic_pin); | |
7943 | + return gsi; | |
7944 | + } | |
7945 | + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | |
7946 | + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | |
7947 | + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | |
7948 | + return gsi_to_irq[gsi]; | |
7949 | + } | |
7950 | + | |
7951 | + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | |
7952 | + | |
7953 | + if (triggering == ACPI_LEVEL_SENSITIVE) { | |
7954 | + /* | |
7955 | + * For PCI devices assign IRQs in order, avoiding gaps | |
7956 | + * due to unused I/O APIC pins. | |
7957 | + */ | |
7958 | + int irq = gsi; | |
7959 | + if (gsi < MAX_GSI_NUM) { | |
7960 | + /* | |
7961 | + * Retain the VIA chipset work-around (gsi > 15), but | |
7962 | + * avoid a problem where the 8254 timer (IRQ0) is setup | |
7963 | + * via an override (so it's not on pin 0 of the ioapic), | |
7964 | + * and at the same time, the pin 0 interrupt is a PCI | |
7965 | + * type. The gsi > 15 test could cause these two pins | |
7966 | + * to be shared as IRQ0, and they are not shareable. | |
7967 | + * So test for this condition, and if necessary, avoid | |
7968 | + * the pin collision. | |
7969 | + */ | |
7970 | + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) | |
7971 | + gsi = pci_irq++; | |
7972 | + /* | |
7973 | + * Don't assign IRQ used by ACPI SCI | |
7974 | + */ | |
7975 | + if (gsi == acpi_fadt.sci_int) | |
7976 | + gsi = pci_irq++; | |
7977 | + gsi_to_irq[irq] = gsi; | |
7978 | + } else { | |
7979 | + printk(KERN_ERR "GSI %u is too high\n", gsi); | |
7980 | + return gsi; | |
7981 | + } | |
7982 | + } | |
7983 | + | |
7984 | + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | |
7985 | + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | |
7986 | + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | |
7987 | + return gsi; | |
7988 | +} | |
7989 | + | |
7990 | +#endif /* CONFIG_X86_IO_APIC */ | |
7991 | +#endif /* CONFIG_ACPI */ | |
7992 | Index: head-2008-11-25/arch/x86/kernel/pci-dma-xen.c | |
7993 | =================================================================== | |
7994 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
7995 | +++ head-2008-11-25/arch/x86/kernel/pci-dma-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
7996 | @@ -0,0 +1,409 @@ | |
7997 | +/* | |
7998 | + * Dynamic DMA mapping support. | |
7999 | + * | |
8000 | + * On i386 there is no hardware dynamic DMA address translation, | |
8001 | + * so consistent alloc/free are merely page allocation/freeing. | |
8002 | + * The rest of the dynamic DMA mapping interface is implemented | |
8003 | + * in asm/pci.h. | |
8004 | + */ | |
8005 | + | |
8006 | +#include <linux/types.h> | |
8007 | +#include <linux/mm.h> | |
8008 | +#include <linux/string.h> | |
8009 | +#include <linux/pci.h> | |
8010 | +#include <linux/module.h> | |
8011 | +#include <linux/version.h> | |
8012 | +#include <asm/io.h> | |
8013 | +#include <xen/balloon.h> | |
8014 | +#include <xen/gnttab.h> | |
8015 | +#include <asm/swiotlb.h> | |
8016 | +#include <asm/tlbflush.h> | |
8017 | +#include <asm-i386/mach-xen/asm/swiotlb.h> | |
8018 | +#include <asm-i386/mach-xen/asm/gnttab_dma.h> | |
8019 | +#include <asm/bug.h> | |
8020 | + | |
8021 | +#ifdef __x86_64__ | |
8022 | +#include <asm/proto.h> | |
8023 | + | |
8024 | +int iommu_merge __read_mostly = 0; | |
8025 | +EXPORT_SYMBOL(iommu_merge); | |
8026 | + | |
8027 | +dma_addr_t bad_dma_address __read_mostly; | |
8028 | +EXPORT_SYMBOL(bad_dma_address); | |
8029 | + | |
8030 | +/* This tells the BIO block layer to assume merging. Default to off | |
8031 | + because we cannot guarantee merging later. */ | |
8032 | +int iommu_bio_merge __read_mostly = 0; | |
8033 | +EXPORT_SYMBOL(iommu_bio_merge); | |
8034 | + | |
8035 | +int force_iommu __read_mostly= 0; | |
8036 | + | |
8037 | +__init int iommu_setup(char *p) | |
8038 | +{ | |
8039 | + return 1; | |
8040 | +} | |
8041 | + | |
8042 | +void __init pci_iommu_alloc(void) | |
8043 | +{ | |
8044 | +#ifdef CONFIG_SWIOTLB | |
8045 | + pci_swiotlb_init(); | |
8046 | +#endif | |
8047 | +} | |
8048 | + | |
8049 | +static int __init pci_iommu_init(void) | |
8050 | +{ | |
8051 | + no_iommu_init(); | |
8052 | + return 0; | |
8053 | +} | |
8054 | + | |
8055 | +/* Must execute after PCI subsystem */ | |
8056 | +fs_initcall(pci_iommu_init); | |
8057 | +#endif | |
8058 | + | |
8059 | +struct dma_coherent_mem { | |
8060 | + void *virt_base; | |
8061 | + u32 device_base; | |
8062 | + int size; | |
8063 | + int flags; | |
8064 | + unsigned long *bitmap; | |
8065 | +}; | |
8066 | + | |
8067 | +#define IOMMU_BUG_ON(test) \ | |
8068 | +do { \ | |
8069 | + if (unlikely(test)) { \ | |
8070 | + printk(KERN_ALERT "Fatal DMA error! " \ | |
8071 | + "Please use 'swiotlb=force'\n"); \ | |
8072 | + BUG(); \ | |
8073 | + } \ | |
8074 | +} while (0) | |
8075 | + | |
8076 | +static int check_pages_physically_contiguous(unsigned long pfn, | |
8077 | + unsigned int offset, | |
8078 | + size_t length) | |
8079 | +{ | |
8080 | + unsigned long next_mfn; | |
8081 | + int i; | |
8082 | + int nr_pages; | |
8083 | + | |
8084 | + next_mfn = pfn_to_mfn(pfn); | |
8085 | + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; | |
8086 | + | |
8087 | + for (i = 1; i < nr_pages; i++) { | |
8088 | + if (pfn_to_mfn(++pfn) != ++next_mfn) | |
8089 | + return 0; | |
8090 | + } | |
8091 | + return 1; | |
8092 | +} | |
8093 | + | |
8094 | +int range_straddles_page_boundary(paddr_t p, size_t size) | |
8095 | +{ | |
8096 | + unsigned long pfn = p >> PAGE_SHIFT; | |
8097 | + unsigned int offset = p & ~PAGE_MASK; | |
8098 | + | |
8099 | + return ((offset + size > PAGE_SIZE) && | |
8100 | + !check_pages_physically_contiguous(pfn, offset, size)); | |
8101 | +} | |
8102 | + | |
8103 | +int | |
8104 | +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, | |
8105 | + enum dma_data_direction direction) | |
8106 | +{ | |
8107 | + int i, rc; | |
8108 | + | |
8109 | + if (direction == DMA_NONE) | |
8110 | + BUG(); | |
8111 | + WARN_ON(nents == 0 || sg[0].length == 0); | |
8112 | + | |
8113 | + if (swiotlb) { | |
8114 | + rc = swiotlb_map_sg(hwdev, sg, nents, direction); | |
8115 | + } else { | |
8116 | + for (i = 0; i < nents; i++ ) { | |
8117 | + BUG_ON(!sg[i].page); | |
8118 | + sg[i].dma_address = | |
8119 | + gnttab_dma_map_page(sg[i].page) + sg[i].offset; | |
8120 | + sg[i].dma_length = sg[i].length; | |
8121 | + IOMMU_BUG_ON(address_needs_mapping( | |
8122 | + hwdev, sg[i].dma_address)); | |
8123 | + IOMMU_BUG_ON(range_straddles_page_boundary( | |
8124 | + page_to_pseudophys(sg[i].page) + sg[i].offset, | |
8125 | + sg[i].length)); | |
8126 | + } | |
8127 | + rc = nents; | |
8128 | + } | |
8129 | + | |
8130 | + flush_write_buffers(); | |
8131 | + return rc; | |
8132 | +} | |
8133 | +EXPORT_SYMBOL(dma_map_sg); | |
8134 | + | |
8135 | +void | |
8136 | +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, | |
8137 | + enum dma_data_direction direction) | |
8138 | +{ | |
8139 | + int i; | |
8140 | + | |
8141 | + BUG_ON(direction == DMA_NONE); | |
8142 | + if (swiotlb) | |
8143 | + swiotlb_unmap_sg(hwdev, sg, nents, direction); | |
8144 | + else { | |
8145 | + for (i = 0; i < nents; i++ ) | |
8146 | + gnttab_dma_unmap_page(sg[i].dma_address); | |
8147 | + } | |
8148 | +} | |
8149 | +EXPORT_SYMBOL(dma_unmap_sg); | |
8150 | + | |
8151 | +#ifdef CONFIG_HIGHMEM | |
8152 | +dma_addr_t | |
8153 | +dma_map_page(struct device *dev, struct page *page, unsigned long offset, | |
8154 | + size_t size, enum dma_data_direction direction) | |
8155 | +{ | |
8156 | + dma_addr_t dma_addr; | |
8157 | + | |
8158 | + BUG_ON(direction == DMA_NONE); | |
8159 | + | |
8160 | + if (swiotlb) { | |
8161 | + dma_addr = swiotlb_map_page( | |
8162 | + dev, page, offset, size, direction); | |
8163 | + } else { | |
8164 | + dma_addr = gnttab_dma_map_page(page) + offset; | |
8165 | + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); | |
8166 | + } | |
8167 | + | |
8168 | + return dma_addr; | |
8169 | +} | |
8170 | +EXPORT_SYMBOL(dma_map_page); | |
8171 | + | |
8172 | +void | |
8173 | +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, | |
8174 | + enum dma_data_direction direction) | |
8175 | +{ | |
8176 | + BUG_ON(direction == DMA_NONE); | |
8177 | + if (swiotlb) | |
8178 | + swiotlb_unmap_page(dev, dma_address, size, direction); | |
8179 | + else | |
8180 | + gnttab_dma_unmap_page(dma_address); | |
8181 | +} | |
8182 | +EXPORT_SYMBOL(dma_unmap_page); | |
8183 | +#endif /* CONFIG_HIGHMEM */ | |
8184 | + | |
8185 | +int | |
8186 | +dma_mapping_error(dma_addr_t dma_addr) | |
8187 | +{ | |
8188 | + if (swiotlb) | |
8189 | + return swiotlb_dma_mapping_error(dma_addr); | |
8190 | + return 0; | |
8191 | +} | |
8192 | +EXPORT_SYMBOL(dma_mapping_error); | |
8193 | + | |
8194 | +int | |
8195 | +dma_supported(struct device *dev, u64 mask) | |
8196 | +{ | |
8197 | + if (swiotlb) | |
8198 | + return swiotlb_dma_supported(dev, mask); | |
8199 | + /* | |
8200 | + * By default we'll BUG when an infeasible DMA is requested, and | |
8201 | + * request swiotlb=force (see IOMMU_BUG_ON). | |
8202 | + */ | |
8203 | + return 1; | |
8204 | +} | |
8205 | +EXPORT_SYMBOL(dma_supported); | |
8206 | + | |
8207 | +void *dma_alloc_coherent(struct device *dev, size_t size, | |
8208 | + dma_addr_t *dma_handle, gfp_t gfp) | |
8209 | +{ | |
8210 | + void *ret; | |
8211 | + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | |
8212 | + unsigned int order = get_order(size); | |
8213 | + unsigned long vstart; | |
8214 | + u64 mask; | |
8215 | + | |
8216 | + /* ignore region specifiers */ | |
8217 | + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); | |
8218 | + | |
8219 | + if (mem) { | |
8220 | + int page = bitmap_find_free_region(mem->bitmap, mem->size, | |
8221 | + order); | |
8222 | + if (page >= 0) { | |
8223 | + *dma_handle = mem->device_base + (page << PAGE_SHIFT); | |
8224 | + ret = mem->virt_base + (page << PAGE_SHIFT); | |
8225 | + memset(ret, 0, size); | |
8226 | + return ret; | |
8227 | + } | |
8228 | + if (mem->flags & DMA_MEMORY_EXCLUSIVE) | |
8229 | + return NULL; | |
8230 | + } | |
8231 | + | |
8232 | + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) | |
8233 | + gfp |= GFP_DMA; | |
8234 | + | |
8235 | + vstart = __get_free_pages(gfp, order); | |
8236 | + ret = (void *)vstart; | |
8237 | + | |
8238 | + if (dev != NULL && dev->coherent_dma_mask) | |
8239 | + mask = dev->coherent_dma_mask; | |
8240 | + else | |
8241 | + mask = 0xffffffff; | |
8242 | + | |
8243 | + if (ret != NULL) { | |
8244 | + if (xen_create_contiguous_region(vstart, order, | |
8245 | + fls64(mask)) != 0) { | |
8246 | + free_pages(vstart, order); | |
8247 | + return NULL; | |
8248 | + } | |
8249 | + memset(ret, 0, size); | |
8250 | + *dma_handle = virt_to_bus(ret); | |
8251 | + } | |
8252 | + return ret; | |
8253 | +} | |
8254 | +EXPORT_SYMBOL(dma_alloc_coherent); | |
8255 | + | |
8256 | +void dma_free_coherent(struct device *dev, size_t size, | |
8257 | + void *vaddr, dma_addr_t dma_handle) | |
8258 | +{ | |
8259 | + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | |
8260 | + int order = get_order(size); | |
8261 | + | |
8262 | + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { | |
8263 | + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; | |
8264 | + | |
8265 | + bitmap_release_region(mem->bitmap, page, order); | |
8266 | + } else { | |
8267 | + xen_destroy_contiguous_region((unsigned long)vaddr, order); | |
8268 | + free_pages((unsigned long)vaddr, order); | |
8269 | + } | |
8270 | +} | |
8271 | +EXPORT_SYMBOL(dma_free_coherent); | |
8272 | + | |
8273 | +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY | |
8274 | +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | |
8275 | + dma_addr_t device_addr, size_t size, int flags) | |
8276 | +{ | |
8277 | + void __iomem *mem_base; | |
8278 | + int pages = size >> PAGE_SHIFT; | |
8279 | + int bitmap_size = (pages + 31)/32; | |
8280 | + | |
8281 | + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) | |
8282 | + goto out; | |
8283 | + if (!size) | |
8284 | + goto out; | |
8285 | + if (dev->dma_mem) | |
8286 | + goto out; | |
8287 | + | |
8288 | + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ | |
8289 | + | |
8290 | + mem_base = ioremap(bus_addr, size); | |
8291 | + if (!mem_base) | |
8292 | + goto out; | |
8293 | + | |
8294 | + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); | |
8295 | + if (!dev->dma_mem) | |
8296 | + goto out; | |
8297 | + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); | |
8298 | + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); | |
8299 | + if (!dev->dma_mem->bitmap) | |
8300 | + goto free1_out; | |
8301 | + memset(dev->dma_mem->bitmap, 0, bitmap_size); | |
8302 | + | |
8303 | + dev->dma_mem->virt_base = mem_base; | |
8304 | + dev->dma_mem->device_base = device_addr; | |
8305 | + dev->dma_mem->size = pages; | |
8306 | + dev->dma_mem->flags = flags; | |
8307 | + | |
8308 | + if (flags & DMA_MEMORY_MAP) | |
8309 | + return DMA_MEMORY_MAP; | |
8310 | + | |
8311 | + return DMA_MEMORY_IO; | |
8312 | + | |
8313 | + free1_out: | |
8314 | + kfree(dev->dma_mem->bitmap); | |
8315 | + out: | |
8316 | + return 0; | |
8317 | +} | |
8318 | +EXPORT_SYMBOL(dma_declare_coherent_memory); | |
8319 | + | |
8320 | +void dma_release_declared_memory(struct device *dev) | |
8321 | +{ | |
8322 | + struct dma_coherent_mem *mem = dev->dma_mem; | |
8323 | + | |
8324 | + if(!mem) | |
8325 | + return; | |
8326 | + dev->dma_mem = NULL; | |
8327 | + iounmap(mem->virt_base); | |
8328 | + kfree(mem->bitmap); | |
8329 | + kfree(mem); | |
8330 | +} | |
8331 | +EXPORT_SYMBOL(dma_release_declared_memory); | |
8332 | + | |
8333 | +void *dma_mark_declared_memory_occupied(struct device *dev, | |
8334 | + dma_addr_t device_addr, size_t size) | |
8335 | +{ | |
8336 | + struct dma_coherent_mem *mem = dev->dma_mem; | |
8337 | + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; | |
8338 | + int pos, err; | |
8339 | + | |
8340 | + if (!mem) | |
8341 | + return ERR_PTR(-EINVAL); | |
8342 | + | |
8343 | + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; | |
8344 | + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); | |
8345 | + if (err != 0) | |
8346 | + return ERR_PTR(err); | |
8347 | + return mem->virt_base + (pos << PAGE_SHIFT); | |
8348 | +} | |
8349 | +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); | |
8350 | +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ | |
8351 | + | |
8352 | +dma_addr_t | |
8353 | +dma_map_single(struct device *dev, void *ptr, size_t size, | |
8354 | + enum dma_data_direction direction) | |
8355 | +{ | |
8356 | + dma_addr_t dma; | |
8357 | + | |
8358 | + if (direction == DMA_NONE) | |
8359 | + BUG(); | |
8360 | + WARN_ON(size == 0); | |
8361 | + | |
8362 | + if (swiotlb) { | |
8363 | + dma = swiotlb_map_single(dev, ptr, size, direction); | |
8364 | + } else { | |
8365 | + dma = gnttab_dma_map_page(virt_to_page(ptr)) + | |
8366 | + offset_in_page(ptr); | |
8367 | + IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size)); | |
8368 | + IOMMU_BUG_ON(address_needs_mapping(dev, dma)); | |
8369 | + } | |
8370 | + | |
8371 | + flush_write_buffers(); | |
8372 | + return dma; | |
8373 | +} | |
8374 | +EXPORT_SYMBOL(dma_map_single); | |
8375 | + | |
8376 | +void | |
8377 | +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | |
8378 | + enum dma_data_direction direction) | |
8379 | +{ | |
8380 | + if (direction == DMA_NONE) | |
8381 | + BUG(); | |
8382 | + if (swiotlb) | |
8383 | + swiotlb_unmap_single(dev, dma_addr, size, direction); | |
8384 | + else | |
8385 | + gnttab_dma_unmap_page(dma_addr); | |
8386 | +} | |
8387 | +EXPORT_SYMBOL(dma_unmap_single); | |
8388 | + | |
8389 | +void | |
8390 | +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, | |
8391 | + enum dma_data_direction direction) | |
8392 | +{ | |
8393 | + if (swiotlb) | |
8394 | + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); | |
8395 | +} | |
8396 | +EXPORT_SYMBOL(dma_sync_single_for_cpu); | |
8397 | + | |
8398 | +void | |
8399 | +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, | |
8400 | + enum dma_data_direction direction) | |
8401 | +{ | |
8402 | + if (swiotlb) | |
8403 | + swiotlb_sync_single_for_device(dev, dma_handle, size, direction); | |
8404 | +} | |
8405 | +EXPORT_SYMBOL(dma_sync_single_for_device); | |
8406 | Index: head-2008-11-25/arch/x86/kernel/process_32-xen.c | |
8407 | =================================================================== | |
8408 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
8409 | +++ head-2008-11-25/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200 | |
8410 | @@ -0,0 +1,877 @@ | |
8411 | +/* | |
8412 | + * linux/arch/i386/kernel/process.c | |
8413 | + * | |
8414 | + * Copyright (C) 1995 Linus Torvalds | |
8415 | + * | |
8416 | + * Pentium III FXSR, SSE support | |
8417 | + * Gareth Hughes <gareth@valinux.com>, May 2000 | |
8418 | + */ | |
8419 | + | |
8420 | +/* | |
8421 | + * This file handles the architecture-dependent parts of process handling.. | |
8422 | + */ | |
8423 | + | |
8424 | +#include <stdarg.h> | |
8425 | + | |
8426 | +#include <linux/cpu.h> | |
8427 | +#include <linux/errno.h> | |
8428 | +#include <linux/sched.h> | |
8429 | +#include <linux/fs.h> | |
8430 | +#include <linux/kernel.h> | |
8431 | +#include <linux/mm.h> | |
8432 | +#include <linux/elfcore.h> | |
8433 | +#include <linux/smp.h> | |
8434 | +#include <linux/smp_lock.h> | |
8435 | +#include <linux/stddef.h> | |
8436 | +#include <linux/slab.h> | |
8437 | +#include <linux/vmalloc.h> | |
8438 | +#include <linux/user.h> | |
8439 | +#include <linux/a.out.h> | |
8440 | +#include <linux/interrupt.h> | |
8441 | +#include <linux/utsname.h> | |
8442 | +#include <linux/delay.h> | |
8443 | +#include <linux/reboot.h> | |
8444 | +#include <linux/init.h> | |
8445 | +#include <linux/mc146818rtc.h> | |
8446 | +#include <linux/module.h> | |
8447 | +#include <linux/kallsyms.h> | |
8448 | +#include <linux/ptrace.h> | |
8449 | +#include <linux/random.h> | |
8450 | + | |
8451 | +#include <asm/uaccess.h> | |
8452 | +#include <asm/pgtable.h> | |
8453 | +#include <asm/system.h> | |
8454 | +#include <asm/io.h> | |
8455 | +#include <asm/ldt.h> | |
8456 | +#include <asm/processor.h> | |
8457 | +#include <asm/i387.h> | |
8458 | +#include <asm/desc.h> | |
8459 | +#include <asm/vm86.h> | |
8460 | +#ifdef CONFIG_MATH_EMULATION | |
8461 | +#include <asm/math_emu.h> | |
8462 | +#endif | |
8463 | + | |
8464 | +#include <xen/interface/physdev.h> | |
8465 | +#include <xen/interface/vcpu.h> | |
8466 | +#include <xen/cpu_hotplug.h> | |
8467 | + | |
8468 | +#include <linux/err.h> | |
8469 | + | |
8470 | +#include <asm/tlbflush.h> | |
8471 | +#include <asm/cpu.h> | |
8472 | + | |
8473 | +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | |
8474 | + | |
8475 | +static int hlt_counter; | |
8476 | + | |
8477 | +unsigned long boot_option_idle_override = 0; | |
8478 | +EXPORT_SYMBOL(boot_option_idle_override); | |
8479 | + | |
8480 | +/* | |
8481 | + * Return saved PC of a blocked thread. | |
8482 | + */ | |
8483 | +unsigned long thread_saved_pc(struct task_struct *tsk) | |
8484 | +{ | |
8485 | + return ((unsigned long *)tsk->thread.esp)[3]; | |
8486 | +} | |
8487 | + | |
8488 | +/* | |
8489 | + * Powermanagement idle function, if any.. | |
8490 | + */ | |
8491 | +void (*pm_idle)(void); | |
8492 | +EXPORT_SYMBOL(pm_idle); | |
8493 | +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | |
8494 | + | |
8495 | +void disable_hlt(void) | |
8496 | +{ | |
8497 | + hlt_counter++; | |
8498 | +} | |
8499 | + | |
8500 | +EXPORT_SYMBOL(disable_hlt); | |
8501 | + | |
8502 | +void enable_hlt(void) | |
8503 | +{ | |
8504 | + hlt_counter--; | |
8505 | +} | |
8506 | + | |
8507 | +EXPORT_SYMBOL(enable_hlt); | |
8508 | + | |
8509 | +/* | |
8510 | + * On SMP it's slightly faster (but much more power-consuming!) | |
8511 | + * to poll the ->work.need_resched flag instead of waiting for the | |
8512 | + * cross-CPU IPI to arrive. Use this option with caution. | |
8513 | + */ | |
8514 | +static void poll_idle (void) | |
8515 | +{ | |
8516 | + local_irq_enable(); | |
8517 | + | |
8518 | + asm volatile( | |
8519 | + "2:" | |
8520 | + "testl %0, %1;" | |
8521 | + "rep; nop;" | |
8522 | + "je 2b;" | |
8523 | + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); | |
8524 | +} | |
8525 | + | |
8526 | +static void xen_idle(void) | |
8527 | +{ | |
8528 | + local_irq_disable(); | |
8529 | + | |
8530 | + if (need_resched()) | |
8531 | + local_irq_enable(); | |
8532 | + else { | |
8533 | + current_thread_info()->status &= ~TS_POLLING; | |
8534 | + smp_mb__after_clear_bit(); | |
8535 | + safe_halt(); | |
8536 | + current_thread_info()->status |= TS_POLLING; | |
8537 | + } | |
8538 | +} | |
8539 | +#ifdef CONFIG_APM_MODULE | |
8540 | +EXPORT_SYMBOL(default_idle); | |
8541 | +#endif | |
8542 | + | |
8543 | +#ifdef CONFIG_HOTPLUG_CPU | |
8544 | +extern cpumask_t cpu_initialized; | |
8545 | +static inline void play_dead(void) | |
8546 | +{ | |
8547 | + idle_task_exit(); | |
8548 | + local_irq_disable(); | |
8549 | + cpu_clear(smp_processor_id(), cpu_initialized); | |
8550 | + preempt_enable_no_resched(); | |
8551 | + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); | |
8552 | + cpu_bringup(); | |
8553 | +} | |
8554 | +#else | |
8555 | +static inline void play_dead(void) | |
8556 | +{ | |
8557 | + BUG(); | |
8558 | +} | |
8559 | +#endif /* CONFIG_HOTPLUG_CPU */ | |
8560 | + | |
8561 | +/* | |
8562 | + * The idle thread. There's no useful work to be | |
8563 | + * done, so just try to conserve power and have a | |
8564 | + * low exit latency (ie sit in a loop waiting for | |
8565 | + * somebody to say that they'd like to reschedule) | |
8566 | + */ | |
8567 | +void cpu_idle(void) | |
8568 | +{ | |
8569 | + int cpu = smp_processor_id(); | |
8570 | + | |
8571 | + current_thread_info()->status |= TS_POLLING; | |
8572 | + | |
8573 | + /* endless idle loop with no priority at all */ | |
8574 | + while (1) { | |
8575 | + while (!need_resched()) { | |
8576 | + void (*idle)(void); | |
8577 | + | |
8578 | + if (__get_cpu_var(cpu_idle_state)) | |
8579 | + __get_cpu_var(cpu_idle_state) = 0; | |
8580 | + | |
8581 | + rmb(); | |
8582 | + idle = xen_idle; /* no alternatives */ | |
8583 | + | |
8584 | + if (cpu_is_offline(cpu)) | |
8585 | + play_dead(); | |
8586 | + | |
8587 | + __get_cpu_var(irq_stat).idle_timestamp = jiffies; | |
8588 | + idle(); | |
8589 | + } | |
8590 | + preempt_enable_no_resched(); | |
8591 | + schedule(); | |
8592 | + preempt_disable(); | |
8593 | + } | |
8594 | +} | |
8595 | + | |
8596 | +void cpu_idle_wait(void) | |
8597 | +{ | |
8598 | + unsigned int cpu, this_cpu = get_cpu(); | |
8599 | + cpumask_t map; | |
8600 | + | |
8601 | + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | |
8602 | + put_cpu(); | |
8603 | + | |
8604 | + cpus_clear(map); | |
8605 | + for_each_online_cpu(cpu) { | |
8606 | + per_cpu(cpu_idle_state, cpu) = 1; | |
8607 | + cpu_set(cpu, map); | |
8608 | + } | |
8609 | + | |
8610 | + __get_cpu_var(cpu_idle_state) = 0; | |
8611 | + | |
8612 | + wmb(); | |
8613 | + do { | |
8614 | + ssleep(1); | |
8615 | + for_each_online_cpu(cpu) { | |
8616 | + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | |
8617 | + cpu_clear(cpu, map); | |
8618 | + } | |
8619 | + cpus_and(map, map, cpu_online_map); | |
8620 | + } while (!cpus_empty(map)); | |
8621 | +} | |
8622 | +EXPORT_SYMBOL_GPL(cpu_idle_wait); | |
8623 | + | |
8624 | +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) | |
8625 | +{ | |
8626 | +} | |
8627 | + | |
8628 | +static int __init idle_setup (char *str) | |
8629 | +{ | |
8630 | + if (!strncmp(str, "poll", 4)) { | |
8631 | + printk("using polling idle threads.\n"); | |
8632 | + pm_idle = poll_idle; | |
8633 | + } | |
8634 | + | |
8635 | + boot_option_idle_override = 1; | |
8636 | + return 1; | |
8637 | +} | |
8638 | + | |
8639 | +__setup("idle=", idle_setup); | |
8640 | + | |
8641 | +void show_regs(struct pt_regs * regs) | |
8642 | +{ | |
8643 | + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | |
8644 | + | |
8645 | + printk("\n"); | |
8646 | + printk("Pid: %d, comm: %20s\n", current->pid, current->comm); | |
8647 | + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); | |
8648 | + print_symbol("EIP is at %s\n", regs->eip); | |
8649 | + | |
8650 | + if (user_mode_vm(regs)) | |
8651 | + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); | |
8652 | + printk(" EFLAGS: %08lx %s (%s %.*s)\n", | |
8653 | + regs->eflags, print_tainted(), system_utsname.release, | |
8654 | + (int)strcspn(system_utsname.version, " "), | |
8655 | + system_utsname.version); | |
8656 | + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", | |
8657 | + regs->eax,regs->ebx,regs->ecx,regs->edx); | |
8658 | + printk("ESI: %08lx EDI: %08lx EBP: %08lx", | |
8659 | + regs->esi, regs->edi, regs->ebp); | |
8660 | + printk(" DS: %04x ES: %04x\n", | |
8661 | + 0xffff & regs->xds,0xffff & regs->xes); | |
8662 | + | |
8663 | + cr0 = read_cr0(); | |
8664 | + cr2 = read_cr2(); | |
8665 | + cr3 = read_cr3(); | |
8666 | + cr4 = read_cr4_safe(); | |
8667 | + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); | |
8668 | + show_trace(NULL, regs, ®s->esp); | |
8669 | +} | |
8670 | + | |
8671 | +/* | |
8672 | + * This gets run with %ebx containing the | |
8673 | + * function to call, and %edx containing | |
8674 | + * the "args". | |
8675 | + */ | |
8676 | +extern void kernel_thread_helper(void); | |
8677 | +__asm__(".section .text\n" | |
8678 | + ".align 4\n" | |
8679 | + "kernel_thread_helper:\n\t" | |
8680 | + "movl %edx,%eax\n\t" | |
8681 | + "pushl %edx\n\t" | |
8682 | + "call *%ebx\n\t" | |
8683 | + "pushl %eax\n\t" | |
8684 | + "call do_exit\n" | |
8685 | + ".previous"); | |
8686 | + | |
8687 | +/* | |
8688 | + * Create a kernel thread | |
8689 | + */ | |
8690 | +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | |
8691 | +{ | |
8692 | + struct pt_regs regs; | |
8693 | + | |
8694 | + memset(®s, 0, sizeof(regs)); | |
8695 | + | |
8696 | + regs.ebx = (unsigned long) fn; | |
8697 | + regs.edx = (unsigned long) arg; | |
8698 | + | |
8699 | + regs.xds = __USER_DS; | |
8700 | + regs.xes = __USER_DS; | |
8701 | + regs.orig_eax = -1; | |
8702 | + regs.eip = (unsigned long) kernel_thread_helper; | |
8703 | + regs.xcs = GET_KERNEL_CS(); | |
8704 | + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; | |
8705 | + | |
8706 | + /* Ok, create the new process.. */ | |
8707 | + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); | |
8708 | +} | |
8709 | +EXPORT_SYMBOL(kernel_thread); | |
8710 | + | |
8711 | +/* | |
8712 | + * Free current thread data structures etc.. | |
8713 | + */ | |
8714 | +void exit_thread(void) | |
8715 | +{ | |
8716 | + /* The process may have allocated an io port bitmap... nuke it. */ | |
8717 | + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { | |
8718 | + struct task_struct *tsk = current; | |
8719 | + struct thread_struct *t = &tsk->thread; | |
8720 | + struct physdev_set_iobitmap set_iobitmap; | |
8721 | + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); | |
8722 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
8723 | + &set_iobitmap)); | |
8724 | + kfree(t->io_bitmap_ptr); | |
8725 | + t->io_bitmap_ptr = NULL; | |
8726 | + clear_thread_flag(TIF_IO_BITMAP); | |
8727 | + } | |
8728 | +} | |
8729 | + | |
8730 | +void flush_thread(void) | |
8731 | +{ | |
8732 | + struct task_struct *tsk = current; | |
8733 | + | |
8734 | + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); | |
8735 | + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | |
8736 | + clear_tsk_thread_flag(tsk, TIF_DEBUG); | |
8737 | + /* | |
8738 | + * Forget coprocessor state.. | |
8739 | + */ | |
8740 | + clear_fpu(tsk); | |
8741 | + clear_used_math(); | |
8742 | +} | |
8743 | + | |
8744 | +void release_thread(struct task_struct *dead_task) | |
8745 | +{ | |
8746 | + BUG_ON(dead_task->mm); | |
8747 | + release_vm86_irqs(dead_task); | |
8748 | +} | |
8749 | + | |
8750 | +/* | |
8751 | + * This gets called before we allocate a new thread and copy | |
8752 | + * the current task into it. | |
8753 | + */ | |
8754 | +void prepare_to_copy(struct task_struct *tsk) | |
8755 | +{ | |
8756 | + unlazy_fpu(tsk); | |
8757 | +} | |
8758 | + | |
8759 | +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, | |
8760 | + unsigned long unused, | |
8761 | + struct task_struct * p, struct pt_regs * regs) | |
8762 | +{ | |
8763 | + struct pt_regs * childregs; | |
8764 | + struct task_struct *tsk; | |
8765 | + int err; | |
8766 | + | |
8767 | + childregs = task_pt_regs(p); | |
8768 | + *childregs = *regs; | |
8769 | + childregs->eax = 0; | |
8770 | + childregs->esp = esp; | |
8771 | + | |
8772 | + p->thread.esp = (unsigned long) childregs; | |
8773 | + p->thread.esp0 = (unsigned long) (childregs+1); | |
8774 | + | |
8775 | + p->thread.eip = (unsigned long) ret_from_fork; | |
8776 | + | |
8777 | + savesegment(fs,p->thread.fs); | |
8778 | + savesegment(gs,p->thread.gs); | |
8779 | + | |
8780 | + tsk = current; | |
8781 | + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { | |
8782 | + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
8783 | + if (!p->thread.io_bitmap_ptr) { | |
8784 | + p->thread.io_bitmap_max = 0; | |
8785 | + return -ENOMEM; | |
8786 | + } | |
8787 | + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, | |
8788 | + IO_BITMAP_BYTES); | |
8789 | + set_tsk_thread_flag(p, TIF_IO_BITMAP); | |
8790 | + } | |
8791 | + | |
8792 | + /* | |
8793 | + * Set a new TLS for the child thread? | |
8794 | + */ | |
8795 | + if (clone_flags & CLONE_SETTLS) { | |
8796 | + struct desc_struct *desc; | |
8797 | + struct user_desc info; | |
8798 | + int idx; | |
8799 | + | |
8800 | + err = -EFAULT; | |
8801 | + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) | |
8802 | + goto out; | |
8803 | + err = -EINVAL; | |
8804 | + if (LDT_empty(&info)) | |
8805 | + goto out; | |
8806 | + | |
8807 | + idx = info.entry_number; | |
8808 | + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
8809 | + goto out; | |
8810 | + | |
8811 | + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | |
8812 | + desc->a = LDT_entry_a(&info); | |
8813 | + desc->b = LDT_entry_b(&info); | |
8814 | + } | |
8815 | + | |
8816 | + p->thread.iopl = current->thread.iopl; | |
8817 | + | |
8818 | + err = 0; | |
8819 | + out: | |
8820 | + if (err && p->thread.io_bitmap_ptr) { | |
8821 | + kfree(p->thread.io_bitmap_ptr); | |
8822 | + p->thread.io_bitmap_max = 0; | |
8823 | + } | |
8824 | + return err; | |
8825 | +} | |
8826 | + | |
8827 | +/* | |
8828 | + * fill in the user structure for a core dump.. | |
8829 | + */ | |
8830 | +void dump_thread(struct pt_regs * regs, struct user * dump) | |
8831 | +{ | |
8832 | + int i; | |
8833 | + | |
8834 | +/* changed the size calculations - should hopefully work better. lbt */ | |
8835 | + dump->magic = CMAGIC; | |
8836 | + dump->start_code = 0; | |
8837 | + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); | |
8838 | + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | |
8839 | + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | |
8840 | + dump->u_dsize -= dump->u_tsize; | |
8841 | + dump->u_ssize = 0; | |
8842 | + for (i = 0; i < 8; i++) | |
8843 | + dump->u_debugreg[i] = current->thread.debugreg[i]; | |
8844 | + | |
8845 | + if (dump->start_stack < TASK_SIZE) | |
8846 | + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; | |
8847 | + | |
8848 | + dump->regs.ebx = regs->ebx; | |
8849 | + dump->regs.ecx = regs->ecx; | |
8850 | + dump->regs.edx = regs->edx; | |
8851 | + dump->regs.esi = regs->esi; | |
8852 | + dump->regs.edi = regs->edi; | |
8853 | + dump->regs.ebp = regs->ebp; | |
8854 | + dump->regs.eax = regs->eax; | |
8855 | + dump->regs.ds = regs->xds; | |
8856 | + dump->regs.es = regs->xes; | |
8857 | + savesegment(fs,dump->regs.fs); | |
8858 | + savesegment(gs,dump->regs.gs); | |
8859 | + dump->regs.orig_eax = regs->orig_eax; | |
8860 | + dump->regs.eip = regs->eip; | |
8861 | + dump->regs.cs = regs->xcs; | |
8862 | + dump->regs.eflags = regs->eflags; | |
8863 | + dump->regs.esp = regs->esp; | |
8864 | + dump->regs.ss = regs->xss; | |
8865 | + | |
8866 | + dump->u_fpvalid = dump_fpu (regs, &dump->i387); | |
8867 | +} | |
8868 | +EXPORT_SYMBOL(dump_thread); | |
8869 | + | |
8870 | +/* | |
8871 | + * Capture the user space registers if the task is not running (in user space) | |
8872 | + */ | |
8873 | +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | |
8874 | +{ | |
8875 | + struct pt_regs ptregs = *task_pt_regs(tsk); | |
8876 | + ptregs.xcs &= 0xffff; | |
8877 | + ptregs.xds &= 0xffff; | |
8878 | + ptregs.xes &= 0xffff; | |
8879 | + ptregs.xss &= 0xffff; | |
8880 | + | |
8881 | + elf_core_copy_regs(regs, &ptregs); | |
8882 | + | |
8883 | + return 1; | |
8884 | +} | |
8885 | + | |
8886 | +static noinline void __switch_to_xtra(struct task_struct *next_p) | |
8887 | +{ | |
8888 | + struct thread_struct *next; | |
8889 | + | |
8890 | + next = &next_p->thread; | |
8891 | + | |
8892 | + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | |
8893 | + set_debugreg(next->debugreg[0], 0); | |
8894 | + set_debugreg(next->debugreg[1], 1); | |
8895 | + set_debugreg(next->debugreg[2], 2); | |
8896 | + set_debugreg(next->debugreg[3], 3); | |
8897 | + /* no 4 and 5 */ | |
8898 | + set_debugreg(next->debugreg[6], 6); | |
8899 | + set_debugreg(next->debugreg[7], 7); | |
8900 | + } | |
8901 | +} | |
8902 | + | |
8903 | +/* | |
8904 | + * This function selects if the context switch from prev to next | |
8905 | + * has to tweak the TSC disable bit in the cr4. | |
8906 | + */ | |
8907 | +static inline void disable_tsc(struct task_struct *prev_p, | |
8908 | + struct task_struct *next_p) | |
8909 | +{ | |
8910 | + struct thread_info *prev, *next; | |
8911 | + | |
8912 | + /* | |
8913 | + * gcc should eliminate the ->thread_info dereference if | |
8914 | + * has_secure_computing returns 0 at compile time (SECCOMP=n). | |
8915 | + */ | |
8916 | + prev = task_thread_info(prev_p); | |
8917 | + next = task_thread_info(next_p); | |
8918 | + | |
8919 | + if (has_secure_computing(prev) || has_secure_computing(next)) { | |
8920 | + /* slow path here */ | |
8921 | + if (has_secure_computing(prev) && | |
8922 | + !has_secure_computing(next)) { | |
8923 | + write_cr4(read_cr4() & ~X86_CR4_TSD); | |
8924 | + } else if (!has_secure_computing(prev) && | |
8925 | + has_secure_computing(next)) | |
8926 | + write_cr4(read_cr4() | X86_CR4_TSD); | |
8927 | + } | |
8928 | +} | |
8929 | + | |
8930 | +/* | |
8931 | + * switch_to(x,yn) should switch tasks from x to y. | |
8932 | + * | |
8933 | + * We fsave/fwait so that an exception goes off at the right time | |
8934 | + * (as a call from the fsave or fwait in effect) rather than to | |
8935 | + * the wrong process. Lazy FP saving no longer makes any sense | |
8936 | + * with modern CPU's, and this simplifies a lot of things (SMP | |
8937 | + * and UP become the same). | |
8938 | + * | |
8939 | + * NOTE! We used to use the x86 hardware context switching. The | |
8940 | + * reason for not using it any more becomes apparent when you | |
8941 | + * try to recover gracefully from saved state that is no longer | |
8942 | + * valid (stale segment register values in particular). With the | |
8943 | + * hardware task-switch, there is no way to fix up bad state in | |
8944 | + * a reasonable manner. | |
8945 | + * | |
8946 | + * The fact that Intel documents the hardware task-switching to | |
8947 | + * be slow is a fairly red herring - this code is not noticeably | |
8948 | + * faster. However, there _is_ some room for improvement here, | |
8949 | + * so the performance issues may eventually be a valid point. | |
8950 | + * More important, however, is the fact that this allows us much | |
8951 | + * more flexibility. | |
8952 | + * | |
8953 | + * The return value (in %eax) will be the "prev" task after | |
8954 | + * the task-switch, and shows up in ret_from_fork in entry.S, | |
8955 | + * for example. | |
8956 | + */ | |
8957 | +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |
8958 | +{ | |
8959 | + struct thread_struct *prev = &prev_p->thread, | |
8960 | + *next = &next_p->thread; | |
8961 | + int cpu = smp_processor_id(); | |
8962 | +#ifndef CONFIG_X86_NO_TSS | |
8963 | + struct tss_struct *tss = &per_cpu(init_tss, cpu); | |
8964 | +#endif | |
8965 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
8966 | + struct physdev_set_iopl iopl_op; | |
8967 | + struct physdev_set_iobitmap iobmp_op; | |
8968 | +#else | |
8969 | + struct physdev_op _pdo[2], *pdo = _pdo; | |
8970 | +#define iopl_op pdo->u.set_iopl | |
8971 | +#define iobmp_op pdo->u.set_iobitmap | |
8972 | +#endif | |
8973 | + multicall_entry_t _mcl[8], *mcl = _mcl; | |
8974 | + | |
8975 | + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ | |
8976 | + | |
8977 | + /* | |
8978 | + * This is basically '__unlazy_fpu', except that we queue a | |
8979 | + * multicall to indicate FPU task switch, rather than | |
8980 | + * synchronously trapping to Xen. | |
8981 | + */ | |
8982 | + if (prev_p->thread_info->status & TS_USEDFPU) { | |
8983 | + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ | |
8984 | + mcl->op = __HYPERVISOR_fpu_taskswitch; | |
8985 | + mcl->args[0] = 1; | |
8986 | + mcl++; | |
8987 | + } | |
8988 | +#if 0 /* lazy fpu sanity check */ | |
8989 | + else BUG_ON(!(read_cr0() & 8)); | |
8990 | +#endif | |
8991 | + | |
8992 | + /* | |
8993 | + * Reload esp0. | |
8994 | + * This is load_esp0(tss, next) with a multicall. | |
8995 | + */ | |
8996 | + mcl->op = __HYPERVISOR_stack_switch; | |
8997 | + mcl->args[0] = __KERNEL_DS; | |
8998 | + mcl->args[1] = next->esp0; | |
8999 | + mcl++; | |
9000 | + | |
9001 | + /* | |
9002 | + * Load the per-thread Thread-Local Storage descriptor. | |
9003 | + * This is load_TLS(next, cpu) with multicalls. | |
9004 | + */ | |
9005 | +#define C(i) do { \ | |
9006 | + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ | |
9007 | + next->tls_array[i].b != prev->tls_array[i].b)) { \ | |
9008 | + mcl->op = __HYPERVISOR_update_descriptor; \ | |
9009 | + *(u64 *)&mcl->args[0] = virt_to_machine( \ | |
9010 | + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ | |
9011 | + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ | |
9012 | + mcl++; \ | |
9013 | + } \ | |
9014 | +} while (0) | |
9015 | + C(0); C(1); C(2); | |
9016 | +#undef C | |
9017 | + | |
9018 | + if (unlikely(prev->iopl != next->iopl)) { | |
9019 | + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; | |
9020 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
9021 | + mcl->op = __HYPERVISOR_physdev_op; | |
9022 | + mcl->args[0] = PHYSDEVOP_set_iopl; | |
9023 | + mcl->args[1] = (unsigned long)&iopl_op; | |
9024 | +#else | |
9025 | + mcl->op = __HYPERVISOR_physdev_op_compat; | |
9026 | + pdo->cmd = PHYSDEVOP_set_iopl; | |
9027 | + mcl->args[0] = (unsigned long)pdo++; | |
9028 | +#endif | |
9029 | + mcl++; | |
9030 | + } | |
9031 | + | |
9032 | + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { | |
9033 | + set_xen_guest_handle(iobmp_op.bitmap, | |
9034 | + (char *)next->io_bitmap_ptr); | |
9035 | + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; | |
9036 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
9037 | + mcl->op = __HYPERVISOR_physdev_op; | |
9038 | + mcl->args[0] = PHYSDEVOP_set_iobitmap; | |
9039 | + mcl->args[1] = (unsigned long)&iobmp_op; | |
9040 | +#else | |
9041 | + mcl->op = __HYPERVISOR_physdev_op_compat; | |
9042 | + pdo->cmd = PHYSDEVOP_set_iobitmap; | |
9043 | + mcl->args[0] = (unsigned long)pdo++; | |
9044 | +#endif | |
9045 | + mcl++; | |
9046 | + } | |
9047 | + | |
9048 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
9049 | + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); | |
9050 | +#endif | |
9051 | + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); | |
9052 | + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) | |
9053 | + BUG(); | |
9054 | + | |
9055 | + /* | |
9056 | + * Restore %fs and %gs if needed. | |
9057 | + * | |
9058 | + * Glibc normally makes %fs be zero, and %gs is one of | |
9059 | + * the TLS segments. | |
9060 | + */ | |
9061 | + if (unlikely(next->fs)) | |
9062 | + loadsegment(fs, next->fs); | |
9063 | + | |
9064 | + if (next->gs) | |
9065 | + loadsegment(gs, next->gs); | |
9066 | + | |
9067 | + /* | |
9068 | + * Now maybe handle debug registers | |
9069 | + */ | |
9070 | + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) | |
9071 | + __switch_to_xtra(next_p); | |
9072 | + | |
9073 | + disable_tsc(prev_p, next_p); | |
9074 | + | |
9075 | + return prev_p; | |
9076 | +} | |
9077 | + | |
9078 | +asmlinkage int sys_fork(struct pt_regs regs) | |
9079 | +{ | |
9080 | + return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); | |
9081 | +} | |
9082 | + | |
9083 | +asmlinkage int sys_clone(struct pt_regs regs) | |
9084 | +{ | |
9085 | + unsigned long clone_flags; | |
9086 | + unsigned long newsp; | |
9087 | + int __user *parent_tidptr, *child_tidptr; | |
9088 | + | |
9089 | + clone_flags = regs.ebx; | |
9090 | + newsp = regs.ecx; | |
9091 | + parent_tidptr = (int __user *)regs.edx; | |
9092 | + child_tidptr = (int __user *)regs.edi; | |
9093 | + if (!newsp) | |
9094 | + newsp = regs.esp; | |
9095 | + return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); | |
9096 | +} | |
9097 | + | |
9098 | +/* | |
9099 | + * This is trivial, and on the face of it looks like it | |
9100 | + * could equally well be done in user mode. | |
9101 | + * | |
9102 | + * Not so, for quite unobvious reasons - register pressure. | |
9103 | + * In user mode vfork() cannot have a stack frame, and if | |
9104 | + * done by calling the "clone()" system call directly, you | |
9105 | + * do not have enough call-clobbered registers to hold all | |
9106 | + * the information you need. | |
9107 | + */ | |
9108 | +asmlinkage int sys_vfork(struct pt_regs regs) | |
9109 | +{ | |
9110 | + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); | |
9111 | +} | |
9112 | + | |
9113 | +/* | |
9114 | + * sys_execve() executes a new program. | |
9115 | + */ | |
9116 | +asmlinkage int sys_execve(struct pt_regs regs) | |
9117 | +{ | |
9118 | + int error; | |
9119 | + char * filename; | |
9120 | + | |
9121 | + filename = getname((char __user *) regs.ebx); | |
9122 | + error = PTR_ERR(filename); | |
9123 | + if (IS_ERR(filename)) | |
9124 | + goto out; | |
9125 | + error = do_execve(filename, | |
9126 | + (char __user * __user *) regs.ecx, | |
9127 | + (char __user * __user *) regs.edx, | |
9128 | + ®s); | |
9129 | + if (error == 0) { | |
9130 | + task_lock(current); | |
9131 | + current->ptrace &= ~PT_DTRACE; | |
9132 | + task_unlock(current); | |
9133 | + /* Make sure we don't return using sysenter.. */ | |
9134 | + set_thread_flag(TIF_IRET); | |
9135 | + } | |
9136 | + putname(filename); | |
9137 | +out: | |
9138 | + return error; | |
9139 | +} | |
9140 | + | |
9141 | +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) | |
9142 | +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) | |
9143 | + | |
9144 | +unsigned long get_wchan(struct task_struct *p) | |
9145 | +{ | |
9146 | + unsigned long ebp, esp, eip; | |
9147 | + unsigned long stack_page; | |
9148 | + int count = 0; | |
9149 | + if (!p || p == current || p->state == TASK_RUNNING) | |
9150 | + return 0; | |
9151 | + stack_page = (unsigned long)task_stack_page(p); | |
9152 | + esp = p->thread.esp; | |
9153 | + if (!stack_page || esp < stack_page || esp > top_esp+stack_page) | |
9154 | + return 0; | |
9155 | + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ | |
9156 | + ebp = *(unsigned long *) esp; | |
9157 | + do { | |
9158 | + if (ebp < stack_page || ebp > top_ebp+stack_page) | |
9159 | + return 0; | |
9160 | + eip = *(unsigned long *) (ebp+4); | |
9161 | + if (!in_sched_functions(eip)) | |
9162 | + return eip; | |
9163 | + ebp = *(unsigned long *) ebp; | |
9164 | + } while (count++ < 16); | |
9165 | + return 0; | |
9166 | +} | |
9167 | + | |
9168 | +/* | |
9169 | + * sys_alloc_thread_area: get a yet unused TLS descriptor index. | |
9170 | + */ | |
9171 | +static int get_free_idx(void) | |
9172 | +{ | |
9173 | + struct thread_struct *t = ¤t->thread; | |
9174 | + int idx; | |
9175 | + | |
9176 | + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | |
9177 | + if (desc_empty(t->tls_array + idx)) | |
9178 | + return idx + GDT_ENTRY_TLS_MIN; | |
9179 | + return -ESRCH; | |
9180 | +} | |
9181 | + | |
9182 | +/* | |
9183 | + * Set a given TLS descriptor: | |
9184 | + */ | |
9185 | +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) | |
9186 | +{ | |
9187 | + struct thread_struct *t = ¤t->thread; | |
9188 | + struct user_desc info; | |
9189 | + struct desc_struct *desc; | |
9190 | + int cpu, idx; | |
9191 | + | |
9192 | + if (copy_from_user(&info, u_info, sizeof(info))) | |
9193 | + return -EFAULT; | |
9194 | + idx = info.entry_number; | |
9195 | + | |
9196 | + /* | |
9197 | + * index -1 means the kernel should try to find and | |
9198 | + * allocate an empty descriptor: | |
9199 | + */ | |
9200 | + if (idx == -1) { | |
9201 | + idx = get_free_idx(); | |
9202 | + if (idx < 0) | |
9203 | + return idx; | |
9204 | + if (put_user(idx, &u_info->entry_number)) | |
9205 | + return -EFAULT; | |
9206 | + } | |
9207 | + | |
9208 | + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
9209 | + return -EINVAL; | |
9210 | + | |
9211 | + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; | |
9212 | + | |
9213 | + /* | |
9214 | + * We must not get preempted while modifying the TLS. | |
9215 | + */ | |
9216 | + cpu = get_cpu(); | |
9217 | + | |
9218 | + if (LDT_empty(&info)) { | |
9219 | + desc->a = 0; | |
9220 | + desc->b = 0; | |
9221 | + } else { | |
9222 | + desc->a = LDT_entry_a(&info); | |
9223 | + desc->b = LDT_entry_b(&info); | |
9224 | + } | |
9225 | + load_TLS(t, cpu); | |
9226 | + | |
9227 | + put_cpu(); | |
9228 | + | |
9229 | + return 0; | |
9230 | +} | |
9231 | + | |
9232 | +/* | |
9233 | + * Get the current Thread-Local Storage area: | |
9234 | + */ | |
9235 | + | |
9236 | +#define GET_BASE(desc) ( \ | |
9237 | + (((desc)->a >> 16) & 0x0000ffff) | \ | |
9238 | + (((desc)->b << 16) & 0x00ff0000) | \ | |
9239 | + ( (desc)->b & 0xff000000) ) | |
9240 | + | |
9241 | +#define GET_LIMIT(desc) ( \ | |
9242 | + ((desc)->a & 0x0ffff) | \ | |
9243 | + ((desc)->b & 0xf0000) ) | |
9244 | + | |
9245 | +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) | |
9246 | +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | |
9247 | +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | |
9248 | +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | |
9249 | +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | |
9250 | +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | |
9251 | + | |
9252 | +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) | |
9253 | +{ | |
9254 | + struct user_desc info; | |
9255 | + struct desc_struct *desc; | |
9256 | + int idx; | |
9257 | + | |
9258 | + if (get_user(idx, &u_info->entry_number)) | |
9259 | + return -EFAULT; | |
9260 | + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | |
9261 | + return -EINVAL; | |
9262 | + | |
9263 | + memset(&info, 0, sizeof(info)); | |
9264 | + | |
9265 | + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | |
9266 | + | |
9267 | + info.entry_number = idx; | |
9268 | + info.base_addr = GET_BASE(desc); | |
9269 | + info.limit = GET_LIMIT(desc); | |
9270 | + info.seg_32bit = GET_32BIT(desc); | |
9271 | + info.contents = GET_CONTENTS(desc); | |
9272 | + info.read_exec_only = !GET_WRITABLE(desc); | |
9273 | + info.limit_in_pages = GET_LIMIT_PAGES(desc); | |
9274 | + info.seg_not_present = !GET_PRESENT(desc); | |
9275 | + info.useable = GET_USEABLE(desc); | |
9276 | + | |
9277 | + if (copy_to_user(u_info, &info, sizeof(info))) | |
9278 | + return -EFAULT; | |
9279 | + return 0; | |
9280 | +} | |
9281 | + | |
9282 | +unsigned long arch_align_stack(unsigned long sp) | |
9283 | +{ | |
9284 | + if (randomize_va_space) | |
9285 | + sp -= get_random_int() % 8192; | |
9286 | + return sp & ~0xf; | |
9287 | +} | |
9288 | Index: head-2008-11-25/arch/x86/kernel/quirks-xen.c | |
9289 | =================================================================== | |
9290 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
9291 | +++ head-2008-11-25/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100 | |
9292 | @@ -0,0 +1,47 @@ | |
9293 | +/* | |
9294 | + * This file contains work-arounds for x86 and x86_64 platform bugs. | |
9295 | + */ | |
9296 | +#include <linux/pci.h> | |
9297 | +#include <linux/irq.h> | |
9298 | + | |
9299 | +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) | |
9300 | + | |
9301 | +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) | |
9302 | +{ | |
9303 | + u8 config, rev; | |
9304 | + u32 word; | |
9305 | + | |
9306 | + /* BIOS may enable hardware IRQ balancing for | |
9307 | + * E7520/E7320/E7525(revision ID 0x9 and below) | |
9308 | + * based platforms. | |
9309 | + * Disable SW irqbalance/affinity on those platforms. | |
9310 | + */ | |
9311 | + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); | |
9312 | + if (rev > 0x9) | |
9313 | + return; | |
9314 | + | |
9315 | + printk(KERN_INFO "Intel E7520/7320/7525 detected."); | |
9316 | + | |
9317 | + /* enable access to config space*/ | |
9318 | + pci_read_config_byte(dev, 0xf4, &config); | |
9319 | + pci_write_config_byte(dev, 0xf4, config|0x2); | |
9320 | + | |
9321 | + /* read xTPR register */ | |
9322 | + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); | |
9323 | + | |
9324 | + if (!(word & (1 << 13))) { | |
9325 | + struct xen_platform_op op; | |
9326 | + printk(KERN_INFO "Disabling irq balancing and affinity\n"); | |
9327 | + op.cmd = XENPF_platform_quirk; | |
9328 | + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; | |
9329 | + WARN_ON(HYPERVISOR_platform_op(&op)); | |
9330 | + } | |
9331 | + | |
9332 | + /* put back the original value for config space*/ | |
9333 | + if (!(config & 0x2)) | |
9334 | + pci_write_config_byte(dev, 0xf4, config); | |
9335 | +} | |
9336 | +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); | |
9337 | +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); | |
9338 | +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); | |
9339 | +#endif | |
9340 | Index: head-2008-11-25/arch/x86/kernel/setup_32-xen.c | |
9341 | =================================================================== | |
9342 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
9343 | +++ head-2008-11-25/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200 | |
9344 | @@ -0,0 +1,1919 @@ | |
9345 | +/* | |
9346 | + * linux/arch/i386/kernel/setup.c | |
9347 | + * | |
9348 | + * Copyright (C) 1995 Linus Torvalds | |
9349 | + * | |
9350 | + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | |
9351 | + * | |
9352 | + * Memory region support | |
9353 | + * David Parsons <orc@pell.chi.il.us>, July-August 1999 | |
9354 | + * | |
9355 | + * Added E820 sanitization routine (removes overlapping memory regions); | |
9356 | + * Brian Moyle <bmoyle@mvista.com>, February 2001 | |
9357 | + * | |
9358 | + * Moved CPU detection code to cpu/${cpu}.c | |
9359 | + * Patrick Mochel <mochel@osdl.org>, March 2002 | |
9360 | + * | |
9361 | + * Provisions for empty E820 memory regions (reported by certain BIOSes). | |
9362 | + * Alex Achenbach <xela@slit.de>, December 2002. | |
9363 | + * | |
9364 | + */ | |
9365 | + | |
9366 | +/* | |
9367 | + * This file handles the architecture-dependent parts of initialization | |
9368 | + */ | |
9369 | + | |
9370 | +#include <linux/sched.h> | |
9371 | +#include <linux/mm.h> | |
9372 | +#include <linux/mmzone.h> | |
9373 | +#include <linux/screen_info.h> | |
9374 | +#include <linux/ioport.h> | |
9375 | +#include <linux/acpi.h> | |
9376 | +#include <linux/apm_bios.h> | |
9377 | +#include <linux/initrd.h> | |
9378 | +#include <linux/bootmem.h> | |
9379 | +#include <linux/seq_file.h> | |
9380 | +#include <linux/platform_device.h> | |
9381 | +#include <linux/console.h> | |
9382 | +#include <linux/mca.h> | |
9383 | +#include <linux/root_dev.h> | |
9384 | +#include <linux/highmem.h> | |
9385 | +#include <linux/module.h> | |
9386 | +#include <linux/efi.h> | |
9387 | +#include <linux/init.h> | |
9388 | +#include <linux/edd.h> | |
9389 | +#include <linux/nodemask.h> | |
9390 | +#include <linux/kernel.h> | |
9391 | +#include <linux/percpu.h> | |
9392 | +#include <linux/notifier.h> | |
9393 | +#include <linux/kexec.h> | |
9394 | +#include <linux/crash_dump.h> | |
9395 | +#include <linux/dmi.h> | |
9396 | +#include <linux/pfn.h> | |
9397 | + | |
9398 | +#include <video/edid.h> | |
9399 | + | |
9400 | +#include <asm/apic.h> | |
9401 | +#include <asm/e820.h> | |
9402 | +#include <asm/mpspec.h> | |
9403 | +#include <asm/setup.h> | |
9404 | +#include <asm/arch_hooks.h> | |
9405 | +#include <asm/sections.h> | |
9406 | +#include <asm/io_apic.h> | |
9407 | +#include <asm/ist.h> | |
9408 | +#include <asm/io.h> | |
9409 | +#include <asm/hypervisor.h> | |
9410 | +#include <xen/interface/physdev.h> | |
9411 | +#include <xen/interface/memory.h> | |
9412 | +#include <xen/features.h> | |
9413 | +#include <xen/firmware.h> | |
9414 | +#include <xen/xencons.h> | |
9415 | +#include <setup_arch.h> | |
9416 | +#include <bios_ebda.h> | |
9417 | + | |
9418 | +#ifdef CONFIG_XEN | |
9419 | +#include <xen/interface/kexec.h> | |
9420 | +#endif | |
9421 | + | |
9422 | +/* Forward Declaration. */ | |
9423 | +void __init find_max_pfn(void); | |
9424 | + | |
9425 | +static int xen_panic_event(struct notifier_block *, unsigned long, void *); | |
9426 | +static struct notifier_block xen_panic_block = { | |
9427 | + xen_panic_event, NULL, 0 /* try to go last */ | |
9428 | +}; | |
9429 | + | |
9430 | +extern char hypercall_page[PAGE_SIZE]; | |
9431 | +EXPORT_SYMBOL(hypercall_page); | |
9432 | + | |
9433 | +int disable_pse __devinitdata = 0; | |
9434 | + | |
9435 | +/* | |
9436 | + * Machine setup.. | |
9437 | + */ | |
9438 | + | |
9439 | +#ifdef CONFIG_EFI | |
9440 | +int efi_enabled = 0; | |
9441 | +EXPORT_SYMBOL(efi_enabled); | |
9442 | +#endif | |
9443 | + | |
9444 | +/* cpu data as detected by the assembly code in head.S */ | |
9445 | +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | |
9446 | +/* common cpu data for all cpus */ | |
9447 | +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | |
9448 | +EXPORT_SYMBOL(boot_cpu_data); | |
9449 | + | |
9450 | +unsigned long mmu_cr4_features; | |
9451 | + | |
9452 | +#ifdef CONFIG_ACPI | |
9453 | + int acpi_disabled = 0; | |
9454 | +#else | |
9455 | + int acpi_disabled = 1; | |
9456 | +#endif | |
9457 | +EXPORT_SYMBOL(acpi_disabled); | |
9458 | + | |
9459 | +#ifdef CONFIG_ACPI | |
9460 | +int __initdata acpi_force = 0; | |
9461 | +extern acpi_interrupt_flags acpi_sci_flags; | |
9462 | +#endif | |
9463 | + | |
9464 | +/* for MCA, but anyone else can use it if they want */ | |
9465 | +unsigned int machine_id; | |
9466 | +#ifdef CONFIG_MCA | |
9467 | +EXPORT_SYMBOL(machine_id); | |
9468 | +#endif | |
9469 | +unsigned int machine_submodel_id; | |
9470 | +unsigned int BIOS_revision; | |
9471 | +unsigned int mca_pentium_flag; | |
9472 | + | |
9473 | +/* For PCI or other memory-mapped resources */ | |
9474 | +unsigned long pci_mem_start = 0x10000000; | |
9475 | +#ifdef CONFIG_PCI | |
9476 | +EXPORT_SYMBOL(pci_mem_start); | |
9477 | +#endif | |
9478 | + | |
9479 | +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ | |
9480 | +int bootloader_type; | |
9481 | + | |
9482 | +/* user-defined highmem size */ | |
9483 | +static unsigned int highmem_pages = -1; | |
9484 | + | |
9485 | +/* | |
9486 | + * Setup options | |
9487 | + */ | |
9488 | +struct drive_info_struct { char dummy[32]; } drive_info; | |
9489 | +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \ | |
9490 | + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) | |
9491 | +EXPORT_SYMBOL(drive_info); | |
9492 | +#endif | |
9493 | +struct screen_info screen_info; | |
9494 | +EXPORT_SYMBOL(screen_info); | |
9495 | +struct apm_info apm_info; | |
9496 | +EXPORT_SYMBOL(apm_info); | |
9497 | +struct sys_desc_table_struct { | |
9498 | + unsigned short length; | |
9499 | + unsigned char table[0]; | |
9500 | +}; | |
9501 | +struct edid_info edid_info; | |
9502 | +EXPORT_SYMBOL_GPL(edid_info); | |
9503 | +#ifndef CONFIG_XEN | |
9504 | +#define copy_edid() (edid_info = EDID_INFO) | |
9505 | +#endif | |
9506 | +struct ist_info ist_info; | |
9507 | +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ | |
9508 | + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) | |
9509 | +EXPORT_SYMBOL(ist_info); | |
9510 | +#endif | |
9511 | +struct e820map e820; | |
9512 | +#ifdef CONFIG_XEN | |
9513 | +struct e820map machine_e820; | |
9514 | +#endif | |
9515 | + | |
9516 | +extern void early_cpu_init(void); | |
9517 | +extern void generic_apic_probe(char *); | |
9518 | +extern int root_mountflags; | |
9519 | + | |
9520 | +unsigned long saved_videomode; | |
9521 | + | |
9522 | +#define RAMDISK_IMAGE_START_MASK 0x07FF | |
9523 | +#define RAMDISK_PROMPT_FLAG 0x8000 | |
9524 | +#define RAMDISK_LOAD_FLAG 0x4000 | |
9525 | + | |
9526 | +static char command_line[COMMAND_LINE_SIZE]; | |
9527 | + | |
9528 | +unsigned char __initdata boot_params[PARAM_SIZE]; | |
9529 | + | |
9530 | +static struct resource data_resource = { | |
9531 | + .name = "Kernel data", | |
9532 | + .start = 0, | |
9533 | + .end = 0, | |
9534 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
9535 | +}; | |
9536 | + | |
9537 | +static struct resource code_resource = { | |
9538 | + .name = "Kernel code", | |
9539 | + .start = 0, | |
9540 | + .end = 0, | |
9541 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
9542 | +}; | |
9543 | + | |
9544 | +static struct resource system_rom_resource = { | |
9545 | + .name = "System ROM", | |
9546 | + .start = 0xf0000, | |
9547 | + .end = 0xfffff, | |
9548 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9549 | +}; | |
9550 | + | |
9551 | +static struct resource extension_rom_resource = { | |
9552 | + .name = "Extension ROM", | |
9553 | + .start = 0xe0000, | |
9554 | + .end = 0xeffff, | |
9555 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9556 | +}; | |
9557 | + | |
9558 | +static struct resource adapter_rom_resources[] = { { | |
9559 | + .name = "Adapter ROM", | |
9560 | + .start = 0xc8000, | |
9561 | + .end = 0, | |
9562 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9563 | +}, { | |
9564 | + .name = "Adapter ROM", | |
9565 | + .start = 0, | |
9566 | + .end = 0, | |
9567 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9568 | +}, { | |
9569 | + .name = "Adapter ROM", | |
9570 | + .start = 0, | |
9571 | + .end = 0, | |
9572 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9573 | +}, { | |
9574 | + .name = "Adapter ROM", | |
9575 | + .start = 0, | |
9576 | + .end = 0, | |
9577 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9578 | +}, { | |
9579 | + .name = "Adapter ROM", | |
9580 | + .start = 0, | |
9581 | + .end = 0, | |
9582 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9583 | +}, { | |
9584 | + .name = "Adapter ROM", | |
9585 | + .start = 0, | |
9586 | + .end = 0, | |
9587 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9588 | +} }; | |
9589 | + | |
9590 | +#define ADAPTER_ROM_RESOURCES \ | |
9591 | + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) | |
9592 | + | |
9593 | +static struct resource video_rom_resource = { | |
9594 | + .name = "Video ROM", | |
9595 | + .start = 0xc0000, | |
9596 | + .end = 0xc7fff, | |
9597 | + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | |
9598 | +}; | |
9599 | + | |
9600 | +static struct resource video_ram_resource = { | |
9601 | + .name = "Video RAM area", | |
9602 | + .start = 0xa0000, | |
9603 | + .end = 0xbffff, | |
9604 | + .flags = IORESOURCE_BUSY | IORESOURCE_MEM | |
9605 | +}; | |
9606 | + | |
9607 | +static struct resource standard_io_resources[] = { { | |
9608 | + .name = "dma1", | |
9609 | + .start = 0x0000, | |
9610 | + .end = 0x001f, | |
9611 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9612 | +}, { | |
9613 | + .name = "pic1", | |
9614 | + .start = 0x0020, | |
9615 | + .end = 0x0021, | |
9616 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9617 | +}, { | |
9618 | + .name = "timer0", | |
9619 | + .start = 0x0040, | |
9620 | + .end = 0x0043, | |
9621 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9622 | +}, { | |
9623 | + .name = "timer1", | |
9624 | + .start = 0x0050, | |
9625 | + .end = 0x0053, | |
9626 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9627 | +}, { | |
9628 | + .name = "keyboard", | |
9629 | + .start = 0x0060, | |
9630 | + .end = 0x006f, | |
9631 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9632 | +}, { | |
9633 | + .name = "dma page reg", | |
9634 | + .start = 0x0080, | |
9635 | + .end = 0x008f, | |
9636 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9637 | +}, { | |
9638 | + .name = "pic2", | |
9639 | + .start = 0x00a0, | |
9640 | + .end = 0x00a1, | |
9641 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9642 | +}, { | |
9643 | + .name = "dma2", | |
9644 | + .start = 0x00c0, | |
9645 | + .end = 0x00df, | |
9646 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9647 | +}, { | |
9648 | + .name = "fpu", | |
9649 | + .start = 0x00f0, | |
9650 | + .end = 0x00ff, | |
9651 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO | |
9652 | +} }; | |
9653 | + | |
9654 | +#define STANDARD_IO_RESOURCES \ | |
9655 | + (sizeof standard_io_resources / sizeof standard_io_resources[0]) | |
9656 | + | |
9657 | +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) | |
9658 | + | |
9659 | +static int __init romchecksum(unsigned char *rom, unsigned long length) | |
9660 | +{ | |
9661 | + unsigned char *p, sum = 0; | |
9662 | + | |
9663 | + for (p = rom; p < rom + length; p++) | |
9664 | + sum += *p; | |
9665 | + return sum == 0; | |
9666 | +} | |
9667 | + | |
9668 | +static void __init probe_roms(void) | |
9669 | +{ | |
9670 | + unsigned long start, length, upper; | |
9671 | + unsigned char *rom; | |
9672 | + int i; | |
9673 | + | |
9674 | +#ifdef CONFIG_XEN | |
9675 | + /* Nothing to do if not running in dom0. */ | |
9676 | + if (!is_initial_xendomain()) | |
9677 | + return; | |
9678 | +#endif | |
9679 | + | |
9680 | + /* video rom */ | |
9681 | + upper = adapter_rom_resources[0].start; | |
9682 | + for (start = video_rom_resource.start; start < upper; start += 2048) { | |
9683 | + rom = isa_bus_to_virt(start); | |
9684 | + if (!romsignature(rom)) | |
9685 | + continue; | |
9686 | + | |
9687 | + video_rom_resource.start = start; | |
9688 | + | |
9689 | + /* 0 < length <= 0x7f * 512, historically */ | |
9690 | + length = rom[2] * 512; | |
9691 | + | |
9692 | + /* if checksum okay, trust length byte */ | |
9693 | + if (length && romchecksum(rom, length)) | |
9694 | + video_rom_resource.end = start + length - 1; | |
9695 | + | |
9696 | + request_resource(&iomem_resource, &video_rom_resource); | |
9697 | + break; | |
9698 | + } | |
9699 | + | |
9700 | + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | |
9701 | + if (start < upper) | |
9702 | + start = upper; | |
9703 | + | |
9704 | + /* system rom */ | |
9705 | + request_resource(&iomem_resource, &system_rom_resource); | |
9706 | + upper = system_rom_resource.start; | |
9707 | + | |
9708 | + /* check for extension rom (ignore length byte!) */ | |
9709 | + rom = isa_bus_to_virt(extension_rom_resource.start); | |
9710 | + if (romsignature(rom)) { | |
9711 | + length = extension_rom_resource.end - extension_rom_resource.start + 1; | |
9712 | + if (romchecksum(rom, length)) { | |
9713 | + request_resource(&iomem_resource, &extension_rom_resource); | |
9714 | + upper = extension_rom_resource.start; | |
9715 | + } | |
9716 | + } | |
9717 | + | |
9718 | + /* check for adapter roms on 2k boundaries */ | |
9719 | + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { | |
9720 | + rom = isa_bus_to_virt(start); | |
9721 | + if (!romsignature(rom)) | |
9722 | + continue; | |
9723 | + | |
9724 | + /* 0 < length <= 0x7f * 512, historically */ | |
9725 | + length = rom[2] * 512; | |
9726 | + | |
9727 | + /* but accept any length that fits if checksum okay */ | |
9728 | + if (!length || start + length > upper || !romchecksum(rom, length)) | |
9729 | + continue; | |
9730 | + | |
9731 | + adapter_rom_resources[i].start = start; | |
9732 | + adapter_rom_resources[i].end = start + length - 1; | |
9733 | + request_resource(&iomem_resource, &adapter_rom_resources[i]); | |
9734 | + | |
9735 | + start = adapter_rom_resources[i++].end & ~2047UL; | |
9736 | + } | |
9737 | +} | |
9738 | + | |
9739 | +/* | |
9740 | + * Point at the empty zero page to start with. We map the real shared_info | |
9741 | + * page as soon as fixmap is up and running. | |
9742 | + */ | |
9743 | +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; | |
9744 | +EXPORT_SYMBOL(HYPERVISOR_shared_info); | |
9745 | + | |
9746 | +unsigned long *phys_to_machine_mapping; | |
9747 | +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16]; | |
9748 | +EXPORT_SYMBOL(phys_to_machine_mapping); | |
9749 | + | |
9750 | +/* Raw start-of-day parameters from the hypervisor. */ | |
9751 | +start_info_t *xen_start_info; | |
9752 | +EXPORT_SYMBOL(xen_start_info); | |
9753 | + | |
9754 | +void __init add_memory_region(unsigned long long start, | |
9755 | + unsigned long long size, int type) | |
9756 | +{ | |
9757 | + int x; | |
9758 | + | |
9759 | + if (!efi_enabled) { | |
9760 | + x = e820.nr_map; | |
9761 | + | |
9762 | + if (x == E820MAX) { | |
9763 | + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | |
9764 | + return; | |
9765 | + } | |
9766 | + | |
9767 | + e820.map[x].addr = start; | |
9768 | + e820.map[x].size = size; | |
9769 | + e820.map[x].type = type; | |
9770 | + e820.nr_map++; | |
9771 | + } | |
9772 | +} /* add_memory_region */ | |
9773 | + | |
9774 | +static void __init limit_regions(unsigned long long size) | |
9775 | +{ | |
9776 | + unsigned long long current_addr = 0; | |
9777 | + int i; | |
9778 | + | |
9779 | + if (efi_enabled) { | |
9780 | + efi_memory_desc_t *md; | |
9781 | + void *p; | |
9782 | + | |
9783 | + for (p = memmap.map, i = 0; p < memmap.map_end; | |
9784 | + p += memmap.desc_size, i++) { | |
9785 | + md = p; | |
9786 | + current_addr = md->phys_addr + (md->num_pages << 12); | |
9787 | + if (md->type == EFI_CONVENTIONAL_MEMORY) { | |
9788 | + if (current_addr >= size) { | |
9789 | + md->num_pages -= | |
9790 | + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); | |
9791 | + memmap.nr_map = i + 1; | |
9792 | + return; | |
9793 | + } | |
9794 | + } | |
9795 | + } | |
9796 | + } | |
9797 | + for (i = 0; i < e820.nr_map; i++) { | |
9798 | + current_addr = e820.map[i].addr + e820.map[i].size; | |
9799 | + if (current_addr < size) | |
9800 | + continue; | |
9801 | + | |
9802 | + if (e820.map[i].type != E820_RAM) | |
9803 | + continue; | |
9804 | + | |
9805 | + if (e820.map[i].addr >= size) { | |
9806 | + /* | |
9807 | + * This region starts past the end of the | |
9808 | + * requested size, skip it completely. | |
9809 | + */ | |
9810 | + e820.nr_map = i; | |
9811 | + } else { | |
9812 | + e820.nr_map = i + 1; | |
9813 | + e820.map[i].size -= current_addr - size; | |
9814 | + } | |
9815 | + return; | |
9816 | + } | |
9817 | +#ifdef CONFIG_XEN | |
9818 | + if (i==e820.nr_map && current_addr < size) { | |
9819 | + /* | |
9820 | + * The e820 map finished before our requested size so | |
9821 | + * extend the final entry to the requested address. | |
9822 | + */ | |
9823 | + --i; | |
9824 | + if (e820.map[i].type == E820_RAM) | |
9825 | + e820.map[i].size -= current_addr - size; | |
9826 | + else | |
9827 | + add_memory_region(current_addr, size - current_addr, E820_RAM); | |
9828 | + } | |
9829 | +#endif | |
9830 | +} | |
9831 | + | |
9832 | +#define E820_DEBUG 1 | |
9833 | + | |
9834 | +static void __init print_memory_map(char *who) | |
9835 | +{ | |
9836 | + int i; | |
9837 | + | |
9838 | + for (i = 0; i < e820.nr_map; i++) { | |
9839 | + printk(" %s: %016Lx - %016Lx ", who, | |
9840 | + e820.map[i].addr, | |
9841 | + e820.map[i].addr + e820.map[i].size); | |
9842 | + switch (e820.map[i].type) { | |
9843 | + case E820_RAM: printk("(usable)\n"); | |
9844 | + break; | |
9845 | + case E820_RESERVED: | |
9846 | + printk("(reserved)\n"); | |
9847 | + break; | |
9848 | + case E820_ACPI: | |
9849 | + printk("(ACPI data)\n"); | |
9850 | + break; | |
9851 | + case E820_NVS: | |
9852 | + printk("(ACPI NVS)\n"); | |
9853 | + break; | |
9854 | + default: printk("type %lu\n", e820.map[i].type); | |
9855 | + break; | |
9856 | + } | |
9857 | + } | |
9858 | +} | |
9859 | + | |
9860 | +/* | |
9861 | + * Sanitize the BIOS e820 map. | |
9862 | + * | |
9863 | + * Some e820 responses include overlapping entries. The following | |
9864 | + * replaces the original e820 map with a new one, removing overlaps. | |
9865 | + * | |
9866 | + */ | |
9867 | +struct change_member { | |
9868 | + struct e820entry *pbios; /* pointer to original bios entry */ | |
9869 | + unsigned long long addr; /* address for this change point */ | |
9870 | +}; | |
9871 | +static struct change_member change_point_list[2*E820MAX] __initdata; | |
9872 | +static struct change_member *change_point[2*E820MAX] __initdata; | |
9873 | +static struct e820entry *overlap_list[E820MAX] __initdata; | |
9874 | +static struct e820entry new_bios[E820MAX] __initdata; | |
9875 | + | |
9876 | +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | |
9877 | +{ | |
9878 | + struct change_member *change_tmp; | |
9879 | + unsigned long current_type, last_type; | |
9880 | + unsigned long long last_addr; | |
9881 | + int chgidx, still_changing; | |
9882 | + int overlap_entries; | |
9883 | + int new_bios_entry; | |
9884 | + int old_nr, new_nr, chg_nr; | |
9885 | + int i; | |
9886 | + | |
9887 | + /* | |
9888 | + Visually we're performing the following (1,2,3,4 = memory types)... | |
9889 | + | |
9890 | + Sample memory map (w/overlaps): | |
9891 | + ____22__________________ | |
9892 | + ______________________4_ | |
9893 | + ____1111________________ | |
9894 | + _44_____________________ | |
9895 | + 11111111________________ | |
9896 | + ____________________33__ | |
9897 | + ___________44___________ | |
9898 | + __________33333_________ | |
9899 | + ______________22________ | |
9900 | + ___________________2222_ | |
9901 | + _________111111111______ | |
9902 | + _____________________11_ | |
9903 | + _________________4______ | |
9904 | + | |
9905 | + Sanitized equivalent (no overlap): | |
9906 | + 1_______________________ | |
9907 | + _44_____________________ | |
9908 | + ___1____________________ | |
9909 | + ____22__________________ | |
9910 | + ______11________________ | |
9911 | + _________1______________ | |
9912 | + __________3_____________ | |
9913 | + ___________44___________ | |
9914 | + _____________33_________ | |
9915 | + _______________2________ | |
9916 | + ________________1_______ | |
9917 | + _________________4______ | |
9918 | + ___________________2____ | |
9919 | + ____________________33__ | |
9920 | + ______________________4_ | |
9921 | + */ | |
9922 | + | |
9923 | + /* if there's only one memory region, don't bother */ | |
9924 | + if (*pnr_map < 2) | |
9925 | + return -1; | |
9926 | + | |
9927 | + old_nr = *pnr_map; | |
9928 | + | |
9929 | + /* bail out if we find any unreasonable addresses in bios map */ | |
9930 | + for (i=0; i<old_nr; i++) | |
9931 | + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | |
9932 | + return -1; | |
9933 | + | |
9934 | + /* create pointers for initial change-point information (for sorting) */ | |
9935 | + for (i=0; i < 2*old_nr; i++) | |
9936 | + change_point[i] = &change_point_list[i]; | |
9937 | + | |
9938 | + /* record all known change-points (starting and ending addresses), | |
9939 | + omitting those that are for empty memory regions */ | |
9940 | + chgidx = 0; | |
9941 | + for (i=0; i < old_nr; i++) { | |
9942 | + if (biosmap[i].size != 0) { | |
9943 | + change_point[chgidx]->addr = biosmap[i].addr; | |
9944 | + change_point[chgidx++]->pbios = &biosmap[i]; | |
9945 | + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | |
9946 | + change_point[chgidx++]->pbios = &biosmap[i]; | |
9947 | + } | |
9948 | + } | |
9949 | + chg_nr = chgidx; /* true number of change-points */ | |
9950 | + | |
9951 | + /* sort change-point list by memory addresses (low -> high) */ | |
9952 | + still_changing = 1; | |
9953 | + while (still_changing) { | |
9954 | + still_changing = 0; | |
9955 | + for (i=1; i < chg_nr; i++) { | |
9956 | + /* if <current_addr> > <last_addr>, swap */ | |
9957 | + /* or, if current=<start_addr> & last=<end_addr>, swap */ | |
9958 | + if ((change_point[i]->addr < change_point[i-1]->addr) || | |
9959 | + ((change_point[i]->addr == change_point[i-1]->addr) && | |
9960 | + (change_point[i]->addr == change_point[i]->pbios->addr) && | |
9961 | + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | |
9962 | + ) | |
9963 | + { | |
9964 | + change_tmp = change_point[i]; | |
9965 | + change_point[i] = change_point[i-1]; | |
9966 | + change_point[i-1] = change_tmp; | |
9967 | + still_changing=1; | |
9968 | + } | |
9969 | + } | |
9970 | + } | |
9971 | + | |
9972 | + /* create a new bios memory map, removing overlaps */ | |
9973 | + overlap_entries=0; /* number of entries in the overlap table */ | |
9974 | + new_bios_entry=0; /* index for creating new bios map entries */ | |
9975 | + last_type = 0; /* start with undefined memory type */ | |
9976 | + last_addr = 0; /* start with 0 as last starting address */ | |
9977 | + /* loop through change-points, determining affect on the new bios map */ | |
9978 | + for (chgidx=0; chgidx < chg_nr; chgidx++) | |
9979 | + { | |
9980 | + /* keep track of all overlapping bios entries */ | |
9981 | + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | |
9982 | + { | |
9983 | + /* add map entry to overlap list (> 1 entry implies an overlap) */ | |
9984 | + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | |
9985 | + } | |
9986 | + else | |
9987 | + { | |
9988 | + /* remove entry from list (order independent, so swap with last) */ | |
9989 | + for (i=0; i<overlap_entries; i++) | |
9990 | + { | |
9991 | + if (overlap_list[i] == change_point[chgidx]->pbios) | |
9992 | + overlap_list[i] = overlap_list[overlap_entries-1]; | |
9993 | + } | |
9994 | + overlap_entries--; | |
9995 | + } | |
9996 | + /* if there are overlapping entries, decide which "type" to use */ | |
9997 | + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | |
9998 | + current_type = 0; | |
9999 | + for (i=0; i<overlap_entries; i++) | |
10000 | + if (overlap_list[i]->type > current_type) | |
10001 | + current_type = overlap_list[i]->type; | |
10002 | + /* continue building up new bios map based on this information */ | |
10003 | + if (current_type != last_type) { | |
10004 | + if (last_type != 0) { | |
10005 | + new_bios[new_bios_entry].size = | |
10006 | + change_point[chgidx]->addr - last_addr; | |
10007 | + /* move forward only if the new size was non-zero */ | |
10008 | + if (new_bios[new_bios_entry].size != 0) | |
10009 | + if (++new_bios_entry >= E820MAX) | |
10010 | + break; /* no more space left for new bios entries */ | |
10011 | + } | |
10012 | + if (current_type != 0) { | |
10013 | + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | |
10014 | + new_bios[new_bios_entry].type = current_type; | |
10015 | + last_addr=change_point[chgidx]->addr; | |
10016 | + } | |
10017 | + last_type = current_type; | |
10018 | + } | |
10019 | + } | |
10020 | + new_nr = new_bios_entry; /* retain count for new bios entries */ | |
10021 | + | |
10022 | + /* copy new bios mapping into original location */ | |
10023 | + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | |
10024 | + *pnr_map = new_nr; | |
10025 | + | |
10026 | + return 0; | |
10027 | +} | |
10028 | + | |
10029 | +/* | |
10030 | + * Copy the BIOS e820 map into a safe place. | |
10031 | + * | |
10032 | + * Sanity-check it while we're at it.. | |
10033 | + * | |
10034 | + * If we're lucky and live on a modern system, the setup code | |
10035 | + * will have given us a memory map that we can use to properly | |
10036 | + * set up memory. If we aren't, we'll fake a memory map. | |
10037 | + * | |
10038 | + * We check to see that the memory map contains at least 2 elements | |
10039 | + * before we'll use it, because the detection code in setup.S may | |
10040 | + * not be perfect and most every PC known to man has two memory | |
10041 | + * regions: one from 0 to 640k, and one from 1mb up. (The IBM | |
10042 | + * thinkpad 560x, for example, does not cooperate with the memory | |
10043 | + * detection code.) | |
10044 | + */ | |
10045 | +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | |
10046 | +{ | |
10047 | +#ifndef CONFIG_XEN | |
10048 | + /* Only one memory region (or negative)? Ignore it */ | |
10049 | + if (nr_map < 2) | |
10050 | + return -1; | |
10051 | +#else | |
10052 | + BUG_ON(nr_map < 1); | |
10053 | +#endif | |
10054 | + | |
10055 | + do { | |
10056 | + unsigned long long start = biosmap->addr; | |
10057 | + unsigned long long size = biosmap->size; | |
10058 | + unsigned long long end = start + size; | |
10059 | + unsigned long type = biosmap->type; | |
10060 | + | |
10061 | + /* Overflow in 64 bits? Ignore the memory map. */ | |
10062 | + if (start > end) | |
10063 | + return -1; | |
10064 | + | |
10065 | +#ifndef CONFIG_XEN | |
10066 | + /* | |
10067 | + * Some BIOSes claim RAM in the 640k - 1M region. | |
10068 | + * Not right. Fix it up. | |
10069 | + */ | |
10070 | + if (type == E820_RAM) { | |
10071 | + if (start < 0x100000ULL && end > 0xA0000ULL) { | |
10072 | + if (start < 0xA0000ULL) | |
10073 | + add_memory_region(start, 0xA0000ULL-start, type); | |
10074 | + if (end <= 0x100000ULL) | |
10075 | + continue; | |
10076 | + start = 0x100000ULL; | |
10077 | + size = end - start; | |
10078 | + } | |
10079 | + } | |
10080 | +#endif | |
10081 | + add_memory_region(start, size, type); | |
10082 | + } while (biosmap++,--nr_map); | |
10083 | + | |
10084 | +#ifdef CONFIG_XEN | |
10085 | + if (is_initial_xendomain()) { | |
10086 | + struct xen_memory_map memmap; | |
10087 | + | |
10088 | + memmap.nr_entries = E820MAX; | |
10089 | + set_xen_guest_handle(memmap.buffer, machine_e820.map); | |
10090 | + | |
10091 | + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) | |
10092 | + BUG(); | |
10093 | + machine_e820.nr_map = memmap.nr_entries; | |
10094 | + } else | |
10095 | + machine_e820 = e820; | |
10096 | +#endif | |
10097 | + | |
10098 | + return 0; | |
10099 | +} | |
10100 | + | |
10101 | +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | |
10102 | +struct edd edd; | |
10103 | +#ifdef CONFIG_EDD_MODULE | |
10104 | +EXPORT_SYMBOL(edd); | |
10105 | +#endif | |
10106 | +#ifndef CONFIG_XEN | |
10107 | +/** | |
10108 | + * copy_edd() - Copy the BIOS EDD information | |
10109 | + * from boot_params into a safe place. | |
10110 | + * | |
10111 | + */ | |
10112 | +static inline void copy_edd(void) | |
10113 | +{ | |
10114 | + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); | |
10115 | + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); | |
10116 | + edd.mbr_signature_nr = EDD_MBR_SIG_NR; | |
10117 | + edd.edd_info_nr = EDD_NR; | |
10118 | +} | |
10119 | +#endif | |
10120 | +#else | |
10121 | +static inline void copy_edd(void) | |
10122 | +{ | |
10123 | +} | |
10124 | +#endif | |
10125 | + | |
10126 | +static void __init parse_cmdline_early (char ** cmdline_p) | |
10127 | +{ | |
10128 | + char c = ' ', *to = command_line, *from = saved_command_line; | |
10129 | + int len = 0, max_cmdline; | |
10130 | + int userdef = 0; | |
10131 | + | |
10132 | + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) | |
10133 | + max_cmdline = COMMAND_LINE_SIZE; | |
10134 | + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline); | |
10135 | + /* Save unparsed command line copy for /proc/cmdline */ | |
10136 | + saved_command_line[max_cmdline-1] = '\0'; | |
10137 | + | |
10138 | + for (;;) { | |
10139 | + if (c != ' ') | |
10140 | + goto next_char; | |
10141 | + /* | |
10142 | + * "mem=nopentium" disables the 4MB page tables. | |
10143 | + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM | |
10144 | + * to <mem>, overriding the bios size. | |
10145 | + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from | |
10146 | + * <start> to <start>+<mem>, overriding the bios size. | |
10147 | + * | |
10148 | + * HPA tells me bootloaders need to parse mem=, so no new | |
10149 | + * option should be mem= [also see Documentation/i386/boot.txt] | |
10150 | + */ | |
10151 | + if (!memcmp(from, "mem=", 4)) { | |
10152 | + if (to != command_line) | |
10153 | + to--; | |
10154 | + if (!memcmp(from+4, "nopentium", 9)) { | |
10155 | + from += 9+4; | |
10156 | + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); | |
10157 | + disable_pse = 1; | |
10158 | + } else { | |
10159 | + /* If the user specifies memory size, we | |
10160 | + * limit the BIOS-provided memory map to | |
10161 | + * that size. exactmap can be used to specify | |
10162 | + * the exact map. mem=number can be used to | |
10163 | + * trim the existing memory map. | |
10164 | + */ | |
10165 | + unsigned long long mem_size; | |
10166 | + | |
10167 | + mem_size = memparse(from+4, &from); | |
10168 | + limit_regions(mem_size); | |
10169 | + userdef=1; | |
10170 | + } | |
10171 | + } | |
10172 | + | |
10173 | + else if (!memcmp(from, "memmap=", 7)) { | |
10174 | + if (to != command_line) | |
10175 | + to--; | |
10176 | + if (!memcmp(from+7, "exactmap", 8)) { | |
10177 | +#ifdef CONFIG_CRASH_DUMP | |
10178 | + /* If we are doing a crash dump, we | |
10179 | + * still need to know the real mem | |
10180 | + * size before original memory map is | |
10181 | + * reset. | |
10182 | + */ | |
10183 | + find_max_pfn(); | |
10184 | + saved_max_pfn = max_pfn; | |
10185 | +#endif | |
10186 | + from += 8+7; | |
10187 | + e820.nr_map = 0; | |
10188 | + userdef = 1; | |
10189 | + } else { | |
10190 | + /* If the user specifies memory size, we | |
10191 | + * limit the BIOS-provided memory map to | |
10192 | + * that size. exactmap can be used to specify | |
10193 | + * the exact map. mem=number can be used to | |
10194 | + * trim the existing memory map. | |
10195 | + */ | |
10196 | + unsigned long long start_at, mem_size; | |
10197 | + | |
10198 | + mem_size = memparse(from+7, &from); | |
10199 | + if (*from == '@') { | |
10200 | + start_at = memparse(from+1, &from); | |
10201 | + add_memory_region(start_at, mem_size, E820_RAM); | |
10202 | + } else if (*from == '#') { | |
10203 | + start_at = memparse(from+1, &from); | |
10204 | + add_memory_region(start_at, mem_size, E820_ACPI); | |
10205 | + } else if (*from == '$') { | |
10206 | + start_at = memparse(from+1, &from); | |
10207 | + add_memory_region(start_at, mem_size, E820_RESERVED); | |
10208 | + } else { | |
10209 | + limit_regions(mem_size); | |
10210 | + userdef=1; | |
10211 | + } | |
10212 | + } | |
10213 | + } | |
10214 | + | |
10215 | + else if (!memcmp(from, "noexec=", 7)) | |
10216 | + noexec_setup(from + 7); | |
10217 | + | |
10218 | + | |
10219 | +#ifdef CONFIG_X86_MPPARSE | |
10220 | + /* | |
10221 | + * If the BIOS enumerates physical processors before logical, | |
10222 | + * maxcpus=N at enumeration-time can be used to disable HT. | |
10223 | + */ | |
10224 | + else if (!memcmp(from, "maxcpus=", 8)) { | |
10225 | + extern unsigned int maxcpus; | |
10226 | + | |
10227 | + maxcpus = simple_strtoul(from + 8, NULL, 0); | |
10228 | + } | |
10229 | +#endif | |
10230 | + | |
10231 | +#ifdef CONFIG_ACPI | |
10232 | + /* "acpi=off" disables both ACPI table parsing and interpreter */ | |
10233 | + else if (!memcmp(from, "acpi=off", 8)) { | |
10234 | + disable_acpi(); | |
10235 | + } | |
10236 | + | |
10237 | + /* acpi=force to over-ride black-list */ | |
10238 | + else if (!memcmp(from, "acpi=force", 10)) { | |
10239 | + acpi_force = 1; | |
10240 | + acpi_ht = 1; | |
10241 | + acpi_disabled = 0; | |
10242 | + } | |
10243 | + | |
10244 | + /* acpi=strict disables out-of-spec workarounds */ | |
10245 | + else if (!memcmp(from, "acpi=strict", 11)) { | |
10246 | + acpi_strict = 1; | |
10247 | + } | |
10248 | + | |
10249 | + /* Limit ACPI just to boot-time to enable HT */ | |
10250 | + else if (!memcmp(from, "acpi=ht", 7)) { | |
10251 | + if (!acpi_force) | |
10252 | + disable_acpi(); | |
10253 | + acpi_ht = 1; | |
10254 | + } | |
10255 | + | |
10256 | + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ | |
10257 | + else if (!memcmp(from, "pci=noacpi", 10)) { | |
10258 | + acpi_disable_pci(); | |
10259 | + } | |
10260 | + /* "acpi=noirq" disables ACPI interrupt routing */ | |
10261 | + else if (!memcmp(from, "acpi=noirq", 10)) { | |
10262 | + acpi_noirq_set(); | |
10263 | + } | |
10264 | + | |
10265 | + else if (!memcmp(from, "acpi_sci=edge", 13)) | |
10266 | + acpi_sci_flags.trigger = 1; | |
10267 | + | |
10268 | + else if (!memcmp(from, "acpi_sci=level", 14)) | |
10269 | + acpi_sci_flags.trigger = 3; | |
10270 | + | |
10271 | + else if (!memcmp(from, "acpi_sci=high", 13)) | |
10272 | + acpi_sci_flags.polarity = 1; | |
10273 | + | |
10274 | + else if (!memcmp(from, "acpi_sci=low", 12)) | |
10275 | + acpi_sci_flags.polarity = 3; | |
10276 | + | |
10277 | +#ifdef CONFIG_X86_IO_APIC | |
10278 | + else if (!memcmp(from, "acpi_skip_timer_override", 24)) | |
10279 | + acpi_skip_timer_override = 1; | |
10280 | + | |
10281 | + if (!memcmp(from, "disable_timer_pin_1", 19)) | |
10282 | + disable_timer_pin_1 = 1; | |
10283 | + if (!memcmp(from, "enable_timer_pin_1", 18)) | |
10284 | + disable_timer_pin_1 = -1; | |
10285 | + | |
10286 | + /* disable IO-APIC */ | |
10287 | + else if (!memcmp(from, "noapic", 6)) | |
10288 | + disable_ioapic_setup(); | |
10289 | +#endif /* CONFIG_X86_IO_APIC */ | |
10290 | +#endif /* CONFIG_ACPI */ | |
10291 | + | |
10292 | +#ifdef CONFIG_X86_LOCAL_APIC | |
10293 | + /* enable local APIC */ | |
10294 | + else if (!memcmp(from, "lapic", 5)) | |
10295 | + lapic_enable(); | |
10296 | + | |
10297 | + /* disable local APIC */ | |
10298 | + else if (!memcmp(from, "nolapic", 6)) | |
10299 | + lapic_disable(); | |
10300 | +#endif /* CONFIG_X86_LOCAL_APIC */ | |
10301 | + | |
10302 | +#ifdef CONFIG_KEXEC | |
10303 | + /* crashkernel=size@addr specifies the location to reserve for | |
10304 | + * a crash kernel. By reserving this memory we guarantee | |
10305 | + * that linux never set's it up as a DMA target. | |
10306 | + * Useful for holding code to do something appropriate | |
10307 | + * after a kernel panic. | |
10308 | + */ | |
10309 | + else if (!memcmp(from, "crashkernel=", 12)) { | |
10310 | +#ifndef CONFIG_XEN | |
10311 | + unsigned long size, base; | |
10312 | + size = memparse(from+12, &from); | |
10313 | + if (*from == '@') { | |
10314 | + base = memparse(from+1, &from); | |
10315 | + /* FIXME: Do I want a sanity check | |
10316 | + * to validate the memory range? | |
10317 | + */ | |
10318 | + crashk_res.start = base; | |
10319 | + crashk_res.end = base + size - 1; | |
10320 | + } | |
10321 | +#else | |
10322 | + printk("Ignoring crashkernel command line, " | |
10323 | + "parameter will be supplied by xen\n"); | |
10324 | +#endif | |
10325 | + } | |
10326 | +#endif | |
10327 | +#ifdef CONFIG_PROC_VMCORE | |
10328 | + /* elfcorehdr= specifies the location of elf core header | |
10329 | + * stored by the crashed kernel. | |
10330 | + */ | |
10331 | + else if (!memcmp(from, "elfcorehdr=", 11)) | |
10332 | + elfcorehdr_addr = memparse(from+11, &from); | |
10333 | +#endif | |
10334 | + | |
10335 | + /* | |
10336 | + * highmem=size forces highmem to be exactly 'size' bytes. | |
10337 | + * This works even on boxes that have no highmem otherwise. | |
10338 | + * This also works to reduce highmem size on bigger boxes. | |
10339 | + */ | |
10340 | + else if (!memcmp(from, "highmem=", 8)) | |
10341 | + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; | |
10342 | + | |
10343 | + /* | |
10344 | + * vmalloc=size forces the vmalloc area to be exactly 'size' | |
10345 | + * bytes. This can be used to increase (or decrease) the | |
10346 | + * vmalloc area - the default is 128m. | |
10347 | + */ | |
10348 | + else if (!memcmp(from, "vmalloc=", 8)) | |
10349 | + __VMALLOC_RESERVE = memparse(from+8, &from); | |
10350 | + | |
10351 | + next_char: | |
10352 | + c = *(from++); | |
10353 | + if (!c) | |
10354 | + break; | |
10355 | + if (COMMAND_LINE_SIZE <= ++len) | |
10356 | + break; | |
10357 | + *(to++) = c; | |
10358 | + } | |
10359 | + *to = '\0'; | |
10360 | + *cmdline_p = command_line; | |
10361 | + if (userdef) { | |
10362 | + printk(KERN_INFO "user-defined physical RAM map:\n"); | |
10363 | + print_memory_map("user"); | |
10364 | + } | |
10365 | +} | |
10366 | + | |
10367 | +/* | |
10368 | + * Callback for efi_memory_walk. | |
10369 | + */ | |
10370 | +static int __init | |
10371 | +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) | |
10372 | +{ | |
10373 | + unsigned long *max_pfn = arg, pfn; | |
10374 | + | |
10375 | + if (start < end) { | |
10376 | + pfn = PFN_UP(end -1); | |
10377 | + if (pfn > *max_pfn) | |
10378 | + *max_pfn = pfn; | |
10379 | + } | |
10380 | + return 0; | |
10381 | +} | |
10382 | + | |
10383 | +static int __init | |
10384 | +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) | |
10385 | +{ | |
10386 | + memory_present(0, start, end); | |
10387 | + return 0; | |
10388 | +} | |
10389 | + | |
10390 | +/* | |
10391 | + * This function checks if any part of the range <start,end> is mapped | |
10392 | + * with type. | |
10393 | + */ | |
10394 | +int | |
10395 | +e820_any_mapped(u64 start, u64 end, unsigned type) | |
10396 | +{ | |
10397 | + int i; | |
10398 | + | |
10399 | +#ifndef CONFIG_XEN | |
10400 | + for (i = 0; i < e820.nr_map; i++) { | |
10401 | + const struct e820entry *ei = &e820.map[i]; | |
10402 | +#else | |
10403 | + if (!is_initial_xendomain()) | |
10404 | + return 0; | |
10405 | + for (i = 0; i < machine_e820.nr_map; ++i) { | |
10406 | + const struct e820entry *ei = &machine_e820.map[i]; | |
10407 | +#endif | |
10408 | + | |
10409 | + if (type && ei->type != type) | |
10410 | + continue; | |
10411 | + if (ei->addr >= end || ei->addr + ei->size <= start) | |
10412 | + continue; | |
10413 | + return 1; | |
10414 | + } | |
10415 | + return 0; | |
10416 | +} | |
10417 | +EXPORT_SYMBOL_GPL(e820_any_mapped); | |
10418 | + | |
10419 | + /* | |
10420 | + * This function checks if the entire range <start,end> is mapped with type. | |
10421 | + * | |
10422 | + * Note: this function only works correct if the e820 table is sorted and | |
10423 | + * not-overlapping, which is the case | |
10424 | + */ | |
10425 | +int __init | |
10426 | +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) | |
10427 | +{ | |
10428 | + u64 start = s; | |
10429 | + u64 end = e; | |
10430 | + int i; | |
10431 | + | |
10432 | +#ifndef CONFIG_XEN | |
10433 | + for (i = 0; i < e820.nr_map; i++) { | |
10434 | + struct e820entry *ei = &e820.map[i]; | |
10435 | +#else | |
10436 | + if (!is_initial_xendomain()) | |
10437 | + return 0; | |
10438 | + for (i = 0; i < machine_e820.nr_map; ++i) { | |
10439 | + const struct e820entry *ei = &machine_e820.map[i]; | |
10440 | +#endif | |
10441 | + if (type && ei->type != type) | |
10442 | + continue; | |
10443 | + /* is the region (part) in overlap with the current region ?*/ | |
10444 | + if (ei->addr >= end || ei->addr + ei->size <= start) | |
10445 | + continue; | |
10446 | + /* if the region is at the beginning of <start,end> we move | |
10447 | + * start to the end of the region since it's ok until there | |
10448 | + */ | |
10449 | + if (ei->addr <= start) | |
10450 | + start = ei->addr + ei->size; | |
10451 | + /* if start is now at or beyond end, we're done, full | |
10452 | + * coverage */ | |
10453 | + if (start >= end) | |
10454 | + return 1; /* we're done */ | |
10455 | + } | |
10456 | + return 0; | |
10457 | +} | |
10458 | + | |
10459 | +/* | |
10460 | + * Find the highest page frame number we have available | |
10461 | + */ | |
10462 | +void __init find_max_pfn(void) | |
10463 | +{ | |
10464 | + int i; | |
10465 | + | |
10466 | + max_pfn = 0; | |
10467 | + if (efi_enabled) { | |
10468 | + efi_memmap_walk(efi_find_max_pfn, &max_pfn); | |
10469 | + efi_memmap_walk(efi_memory_present_wrapper, NULL); | |
10470 | + return; | |
10471 | + } | |
10472 | + | |
10473 | + for (i = 0; i < e820.nr_map; i++) { | |
10474 | + unsigned long start, end; | |
10475 | + /* RAM? */ | |
10476 | + if (e820.map[i].type != E820_RAM) | |
10477 | + continue; | |
10478 | + start = PFN_UP(e820.map[i].addr); | |
10479 | + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | |
10480 | + if (start >= end) | |
10481 | + continue; | |
10482 | + if (end > max_pfn) | |
10483 | + max_pfn = end; | |
10484 | + memory_present(0, start, end); | |
10485 | + } | |
10486 | +} | |
10487 | + | |
10488 | +/* | |
10489 | + * Determine low and high memory ranges: | |
10490 | + */ | |
10491 | +unsigned long __init find_max_low_pfn(void) | |
10492 | +{ | |
10493 | + unsigned long max_low_pfn; | |
10494 | + | |
10495 | + max_low_pfn = max_pfn; | |
10496 | + if (max_low_pfn > MAXMEM_PFN) { | |
10497 | + if (highmem_pages == -1) | |
10498 | + highmem_pages = max_pfn - MAXMEM_PFN; | |
10499 | + if (highmem_pages + MAXMEM_PFN < max_pfn) | |
10500 | + max_pfn = MAXMEM_PFN + highmem_pages; | |
10501 | + if (highmem_pages + MAXMEM_PFN > max_pfn) { | |
10502 | + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); | |
10503 | + highmem_pages = 0; | |
10504 | + } | |
10505 | + max_low_pfn = MAXMEM_PFN; | |
10506 | +#ifndef CONFIG_HIGHMEM | |
10507 | + /* Maximum memory usable is what is directly addressable */ | |
10508 | + printk(KERN_WARNING "Warning only %ldMB will be used.\n", | |
10509 | + MAXMEM>>20); | |
10510 | + if (max_pfn > MAX_NONPAE_PFN) | |
10511 | + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); | |
10512 | + else | |
10513 | + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); | |
10514 | + max_pfn = MAXMEM_PFN; | |
10515 | +#else /* !CONFIG_HIGHMEM */ | |
10516 | +#ifndef CONFIG_X86_PAE | |
10517 | + if (max_pfn > MAX_NONPAE_PFN) { | |
10518 | + max_pfn = MAX_NONPAE_PFN; | |
10519 | + printk(KERN_WARNING "Warning only 4GB will be used.\n"); | |
10520 | + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); | |
10521 | + } | |
10522 | +#endif /* !CONFIG_X86_PAE */ | |
10523 | +#endif /* !CONFIG_HIGHMEM */ | |
10524 | + } else { | |
10525 | + if (highmem_pages == -1) | |
10526 | + highmem_pages = 0; | |
10527 | +#ifdef CONFIG_HIGHMEM | |
10528 | + if (highmem_pages >= max_pfn) { | |
10529 | + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); | |
10530 | + highmem_pages = 0; | |
10531 | + } | |
10532 | + if (highmem_pages) { | |
10533 | + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ | |
10534 | + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); | |
10535 | + highmem_pages = 0; | |
10536 | + } | |
10537 | + max_low_pfn -= highmem_pages; | |
10538 | + } | |
10539 | +#else | |
10540 | + if (highmem_pages) | |
10541 | + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); | |
10542 | +#endif | |
10543 | + } | |
10544 | + return max_low_pfn; | |
10545 | +} | |
10546 | + | |
10547 | +/* | |
10548 | + * Free all available memory for boot time allocation. Used | |
10549 | + * as a callback function by efi_memory_walk() | |
10550 | + */ | |
10551 | + | |
10552 | +static int __init | |
10553 | +free_available_memory(unsigned long start, unsigned long end, void *arg) | |
10554 | +{ | |
10555 | + /* check max_low_pfn */ | |
10556 | + if (start >= (max_low_pfn << PAGE_SHIFT)) | |
10557 | + return 0; | |
10558 | + if (end >= (max_low_pfn << PAGE_SHIFT)) | |
10559 | + end = max_low_pfn << PAGE_SHIFT; | |
10560 | + if (start < end) | |
10561 | + free_bootmem(start, end - start); | |
10562 | + | |
10563 | + return 0; | |
10564 | +} | |
10565 | +/* | |
10566 | + * Register fully available low RAM pages with the bootmem allocator. | |
10567 | + */ | |
10568 | +static void __init register_bootmem_low_pages(unsigned long max_low_pfn) | |
10569 | +{ | |
10570 | + int i; | |
10571 | + | |
10572 | + if (efi_enabled) { | |
10573 | + efi_memmap_walk(free_available_memory, NULL); | |
10574 | + return; | |
10575 | + } | |
10576 | + for (i = 0; i < e820.nr_map; i++) { | |
10577 | + unsigned long curr_pfn, last_pfn, size; | |
10578 | + /* | |
10579 | + * Reserve usable low memory | |
10580 | + */ | |
10581 | + if (e820.map[i].type != E820_RAM) | |
10582 | + continue; | |
10583 | + /* | |
10584 | + * We are rounding up the start address of usable memory: | |
10585 | + */ | |
10586 | + curr_pfn = PFN_UP(e820.map[i].addr); | |
10587 | + if (curr_pfn >= max_low_pfn) | |
10588 | + continue; | |
10589 | + /* | |
10590 | + * ... and at the end of the usable range downwards: | |
10591 | + */ | |
10592 | + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | |
10593 | + | |
10594 | +#ifdef CONFIG_XEN | |
10595 | + /* | |
10596 | + * Truncate to the number of actual pages currently | |
10597 | + * present. | |
10598 | + */ | |
10599 | + if (last_pfn > xen_start_info->nr_pages) | |
10600 | + last_pfn = xen_start_info->nr_pages; | |
10601 | +#endif | |
10602 | + | |
10603 | + if (last_pfn > max_low_pfn) | |
10604 | + last_pfn = max_low_pfn; | |
10605 | + | |
10606 | + /* | |
10607 | + * .. finally, did all the rounding and playing | |
10608 | + * around just make the area go away? | |
10609 | + */ | |
10610 | + if (last_pfn <= curr_pfn) | |
10611 | + continue; | |
10612 | + | |
10613 | + size = last_pfn - curr_pfn; | |
10614 | + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); | |
10615 | + } | |
10616 | +} | |
10617 | + | |
10618 | +#ifndef CONFIG_XEN | |
10619 | +/* | |
10620 | + * workaround for Dell systems that neglect to reserve EBDA | |
10621 | + */ | |
10622 | +static void __init reserve_ebda_region(void) | |
10623 | +{ | |
10624 | + unsigned int addr; | |
10625 | + addr = get_bios_ebda(); | |
10626 | + if (addr) | |
10627 | + reserve_bootmem(addr, PAGE_SIZE); | |
10628 | +} | |
10629 | +#endif | |
10630 | + | |
10631 | +#ifndef CONFIG_NEED_MULTIPLE_NODES | |
10632 | +void __init setup_bootmem_allocator(void); | |
10633 | +static unsigned long __init setup_memory(void) | |
10634 | +{ | |
10635 | + /* | |
10636 | + * partially used pages are not usable - thus | |
10637 | + * we are rounding upwards: | |
10638 | + */ | |
10639 | + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) + | |
10640 | + xen_start_info->nr_pt_frames; | |
10641 | + | |
10642 | + find_max_pfn(); | |
10643 | + | |
10644 | + max_low_pfn = find_max_low_pfn(); | |
10645 | + | |
10646 | +#ifdef CONFIG_HIGHMEM | |
10647 | + highstart_pfn = highend_pfn = max_pfn; | |
10648 | + if (max_pfn > max_low_pfn) { | |
10649 | + highstart_pfn = max_low_pfn; | |
10650 | + } | |
10651 | + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | |
10652 | + pages_to_mb(highend_pfn - highstart_pfn)); | |
10653 | +#endif | |
10654 | + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | |
10655 | + pages_to_mb(max_low_pfn)); | |
10656 | + | |
10657 | + setup_bootmem_allocator(); | |
10658 | + | |
10659 | + return max_low_pfn; | |
10660 | +} | |
10661 | + | |
10662 | +void __init zone_sizes_init(void) | |
10663 | +{ | |
10664 | + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; | |
10665 | + unsigned int max_dma, low; | |
10666 | + | |
10667 | + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | |
10668 | + low = max_low_pfn; | |
10669 | + | |
10670 | + if (low < max_dma) | |
10671 | + zones_size[ZONE_DMA] = low; | |
10672 | + else { | |
10673 | + zones_size[ZONE_DMA] = max_dma; | |
10674 | + zones_size[ZONE_NORMAL] = low - max_dma; | |
10675 | +#ifdef CONFIG_HIGHMEM | |
10676 | + zones_size[ZONE_HIGHMEM] = highend_pfn - low; | |
10677 | +#endif | |
10678 | + } | |
10679 | + free_area_init(zones_size); | |
10680 | +} | |
10681 | +#else | |
10682 | +extern unsigned long __init setup_memory(void); | |
10683 | +extern void zone_sizes_init(void); | |
10684 | +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ | |
10685 | + | |
10686 | +void __init setup_bootmem_allocator(void) | |
10687 | +{ | |
10688 | + unsigned long bootmap_size; | |
10689 | + /* | |
10690 | + * Initialize the boot-time allocator (with low memory only): | |
10691 | + */ | |
10692 | + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); | |
10693 | + | |
10694 | + register_bootmem_low_pages(max_low_pfn); | |
10695 | + | |
10696 | + /* | |
10697 | + * Reserve the bootmem bitmap itself as well. We do this in two | |
10698 | + * steps (first step was init_bootmem()) because this catches | |
10699 | + * the (very unlikely) case of us accidentally initializing the | |
10700 | + * bootmem allocator with an invalid RAM area. | |
10701 | + */ | |
10702 | + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + | |
10703 | + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); | |
10704 | + | |
10705 | +#ifndef CONFIG_XEN | |
10706 | + /* | |
10707 | + * reserve physical page 0 - it's a special BIOS page on many boxes, | |
10708 | + * enabling clean reboots, SMP operation, laptop functions. | |
10709 | + */ | |
10710 | + reserve_bootmem(0, PAGE_SIZE); | |
10711 | + | |
10712 | + /* reserve EBDA region, it's a 4K region */ | |
10713 | + reserve_ebda_region(); | |
10714 | + | |
10715 | + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent | |
10716 | + PCI prefetch into it (errata #56). Usually the page is reserved anyways, | |
10717 | + unless you have no PS/2 mouse plugged in. */ | |
10718 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | |
10719 | + boot_cpu_data.x86 == 6) | |
10720 | + reserve_bootmem(0xa0000 - 4096, 4096); | |
10721 | + | |
10722 | +#ifdef CONFIG_SMP | |
10723 | + /* | |
10724 | + * But first pinch a few for the stack/trampoline stuff | |
10725 | + * FIXME: Don't need the extra page at 4K, but need to fix | |
10726 | + * trampoline before removing it. (see the GDT stuff) | |
10727 | + */ | |
10728 | + reserve_bootmem(PAGE_SIZE, PAGE_SIZE); | |
10729 | +#endif | |
10730 | +#ifdef CONFIG_ACPI_SLEEP | |
10731 | + /* | |
10732 | + * Reserve low memory region for sleep support. | |
10733 | + */ | |
10734 | + acpi_reserve_bootmem(); | |
10735 | +#endif | |
10736 | +#endif /* !CONFIG_XEN */ | |
10737 | + | |
10738 | +#ifdef CONFIG_BLK_DEV_INITRD | |
10739 | + if (xen_start_info->mod_start) { | |
10740 | + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { | |
10741 | + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/ | |
10742 | + initrd_start = INITRD_START + PAGE_OFFSET; | |
10743 | + initrd_end = initrd_start+INITRD_SIZE; | |
10744 | + initrd_below_start_ok = 1; | |
10745 | + } | |
10746 | + else { | |
10747 | + printk(KERN_ERR "initrd extends beyond end of memory " | |
10748 | + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | |
10749 | + INITRD_START + INITRD_SIZE, | |
10750 | + max_low_pfn << PAGE_SHIFT); | |
10751 | + initrd_start = 0; | |
10752 | + } | |
10753 | + } | |
10754 | +#endif | |
10755 | +#ifdef CONFIG_KEXEC | |
10756 | +#ifdef CONFIG_XEN | |
10757 | + xen_machine_kexec_setup_resources(); | |
10758 | +#else | |
10759 | + if (crashk_res.start != crashk_res.end) | |
10760 | + reserve_bootmem(crashk_res.start, | |
10761 | + crashk_res.end - crashk_res.start + 1); | |
10762 | +#endif | |
10763 | +#endif | |
10764 | +} | |
10765 | + | |
10766 | +/* | |
10767 | + * The node 0 pgdat is initialized before all of these because | |
10768 | + * it's needed for bootmem. node>0 pgdats have their virtual | |
10769 | + * space allocated before the pagetables are in place to access | |
10770 | + * them, so they can't be cleared then. | |
10771 | + * | |
10772 | + * This should all compile down to nothing when NUMA is off. | |
10773 | + */ | |
10774 | +void __init remapped_pgdat_init(void) | |
10775 | +{ | |
10776 | + int nid; | |
10777 | + | |
10778 | + for_each_online_node(nid) { | |
10779 | + if (nid != 0) | |
10780 | + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | |
10781 | + } | |
10782 | +} | |
10783 | + | |
10784 | +/* | |
10785 | + * Request address space for all standard RAM and ROM resources | |
10786 | + * and also for regions reported as reserved by the e820. | |
10787 | + */ | |
10788 | +static void __init | |
10789 | +legacy_init_iomem_resources(struct e820entry *e820, int nr_map, | |
10790 | + struct resource *code_resource, | |
10791 | + struct resource *data_resource) | |
10792 | +{ | |
10793 | + int i; | |
10794 | + | |
10795 | + probe_roms(); | |
10796 | + | |
10797 | + for (i = 0; i < nr_map; i++) { | |
10798 | + struct resource *res; | |
10799 | +#ifndef CONFIG_RESOURCES_64BIT | |
10800 | + if (e820[i].addr + e820[i].size > 0x100000000ULL) | |
10801 | + continue; | |
10802 | +#endif | |
10803 | + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); | |
10804 | + switch (e820[i].type) { | |
10805 | + case E820_RAM: res->name = "System RAM"; break; | |
10806 | + case E820_ACPI: res->name = "ACPI Tables"; break; | |
10807 | + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | |
10808 | + default: res->name = "reserved"; | |
10809 | + } | |
10810 | + res->start = e820[i].addr; | |
10811 | + res->end = res->start + e820[i].size - 1; | |
10812 | + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | |
10813 | + if (request_resource(&iomem_resource, res)) { | |
10814 | + kfree(res); | |
10815 | + continue; | |
10816 | + } | |
10817 | + if (e820[i].type == E820_RAM) { | |
10818 | + /* | |
10819 | + * We don't know which RAM region contains kernel data, | |
10820 | + * so we try it repeatedly and let the resource manager | |
10821 | + * test it. | |
10822 | + */ | |
10823 | +#ifndef CONFIG_XEN | |
10824 | + request_resource(res, code_resource); | |
10825 | + request_resource(res, data_resource); | |
10826 | +#endif | |
10827 | +#ifdef CONFIG_KEXEC | |
10828 | + if (crashk_res.start != crashk_res.end) | |
10829 | + request_resource(res, &crashk_res); | |
10830 | +#ifdef CONFIG_XEN | |
10831 | + xen_machine_kexec_register_resources(res); | |
10832 | +#endif | |
10833 | +#endif | |
10834 | + } | |
10835 | + } | |
10836 | +} | |
10837 | + | |
10838 | +/* | |
10839 | + * Locate a unused range of the physical address space below 4G which | |
10840 | + * can be used for PCI mappings. | |
10841 | + */ | |
10842 | +static void __init | |
10843 | +e820_setup_gap(struct e820entry *e820, int nr_map) | |
10844 | +{ | |
10845 | + unsigned long gapstart, gapsize, round; | |
10846 | + unsigned long long last; | |
10847 | + int i; | |
10848 | + | |
10849 | + /* | |
10850 | + * Search for the bigest gap in the low 32 bits of the e820 | |
10851 | + * memory space. | |
10852 | + */ | |
10853 | + last = 0x100000000ull; | |
10854 | + gapstart = 0x10000000; | |
10855 | + gapsize = 0x400000; | |
10856 | + i = nr_map; | |
10857 | + while (--i >= 0) { | |
10858 | + unsigned long long start = e820[i].addr; | |
10859 | + unsigned long long end = start + e820[i].size; | |
10860 | + | |
10861 | + /* | |
10862 | + * Since "last" is at most 4GB, we know we'll | |
10863 | + * fit in 32 bits if this condition is true | |
10864 | + */ | |
10865 | + if (last > end) { | |
10866 | + unsigned long gap = last - end; | |
10867 | + | |
10868 | + if (gap > gapsize) { | |
10869 | + gapsize = gap; | |
10870 | + gapstart = end; | |
10871 | + } | |
10872 | + } | |
10873 | + if (start < last) | |
10874 | + last = start; | |
10875 | + } | |
10876 | + | |
10877 | + /* | |
10878 | + * See how much we want to round up: start off with | |
10879 | + * rounding to the next 1MB area. | |
10880 | + */ | |
10881 | + round = 0x100000; | |
10882 | + while ((gapsize >> 4) > round) | |
10883 | + round += round; | |
10884 | + /* Fun with two's complement */ | |
10885 | + pci_mem_start = (gapstart + round) & -round; | |
10886 | + | |
10887 | + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", | |
10888 | + pci_mem_start, gapstart, gapsize); | |
10889 | +} | |
10890 | + | |
10891 | +/* | |
10892 | + * Request address space for all standard resources | |
10893 | + * | |
10894 | + * This is called just before pcibios_init(), which is also a | |
10895 | + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). | |
10896 | + */ | |
10897 | +static int __init request_standard_resources(void) | |
10898 | +{ | |
10899 | + int i; | |
10900 | + | |
10901 | + /* Nothing to do if not running in dom0. */ | |
10902 | + if (!is_initial_xendomain()) | |
10903 | + return 0; | |
10904 | + | |
10905 | + printk("Setting up standard PCI resources\n"); | |
10906 | +#ifdef CONFIG_XEN | |
10907 | + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map, | |
10908 | + &code_resource, &data_resource); | |
10909 | +#else | |
10910 | + if (efi_enabled) | |
10911 | + efi_initialize_iomem_resources(&code_resource, &data_resource); | |
10912 | + else | |
10913 | + legacy_init_iomem_resources(e820.map, e820.nr_map, | |
10914 | + &code_resource, &data_resource); | |
10915 | +#endif | |
10916 | + | |
10917 | + /* EFI systems may still have VGA */ | |
10918 | + request_resource(&iomem_resource, &video_ram_resource); | |
10919 | + | |
10920 | + /* request I/O space for devices used on all i[345]86 PCs */ | |
10921 | + for (i = 0; i < STANDARD_IO_RESOURCES; i++) | |
10922 | + request_resource(&ioport_resource, &standard_io_resources[i]); | |
10923 | + return 0; | |
10924 | +} | |
10925 | + | |
10926 | +subsys_initcall(request_standard_resources); | |
10927 | + | |
10928 | +static void __init register_memory(void) | |
10929 | +{ | |
10930 | +#ifdef CONFIG_XEN | |
10931 | + if (is_initial_xendomain()) | |
10932 | + e820_setup_gap(machine_e820.map, machine_e820.nr_map); | |
10933 | + else | |
10934 | +#endif | |
10935 | + e820_setup_gap(e820.map, e820.nr_map); | |
10936 | +} | |
10937 | + | |
10938 | +#ifdef CONFIG_MCA | |
10939 | +static void set_mca_bus(int x) | |
10940 | +{ | |
10941 | + MCA_bus = x; | |
10942 | +} | |
10943 | +#else | |
10944 | +static void set_mca_bus(int x) { } | |
10945 | +#endif | |
10946 | + | |
10947 | +/* | |
10948 | + * Determine if we were loaded by an EFI loader. If so, then we have also been | |
10949 | + * passed the efi memmap, systab, etc., so we should use these data structures | |
10950 | + * for initialization. Note, the efi init code path is determined by the | |
10951 | + * global efi_enabled. This allows the same kernel image to be used on existing | |
10952 | + * systems (with a traditional BIOS) as well as on EFI systems. | |
10953 | + */ | |
10954 | +void __init setup_arch(char **cmdline_p) | |
10955 | +{ | |
10956 | + int i, j, k, fpp; | |
10957 | + struct physdev_set_iopl set_iopl; | |
10958 | + unsigned long max_low_pfn; | |
10959 | + unsigned long p2m_pages; | |
10960 | + | |
10961 | + /* Force a quick death if the kernel panics (not domain 0). */ | |
10962 | + extern int panic_timeout; | |
10963 | + if (!panic_timeout && !is_initial_xendomain()) | |
10964 | + panic_timeout = 1; | |
10965 | + | |
10966 | + /* Register a call for panic conditions. */ | |
10967 | + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); | |
10968 | + | |
10969 | + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
10970 | + VMASST_TYPE_4gb_segments)); | |
10971 | + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
10972 | + VMASST_TYPE_writable_pagetables)); | |
10973 | + | |
10974 | + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); | |
10975 | + pre_setup_arch_hook(); | |
10976 | + early_cpu_init(); | |
10977 | +#ifdef CONFIG_SMP | |
10978 | + prefill_possible_map(); | |
10979 | +#endif | |
10980 | + | |
10981 | + /* | |
10982 | + * FIXME: This isn't an official loader_type right | |
10983 | + * now but does currently work with elilo. | |
10984 | + * If we were configured as an EFI kernel, check to make | |
10985 | + * sure that we were loaded correctly from elilo and that | |
10986 | + * the system table is valid. If not, then initialize normally. | |
10987 | + */ | |
10988 | +#ifdef CONFIG_EFI | |
10989 | + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) | |
10990 | + efi_enabled = 1; | |
10991 | +#endif | |
10992 | + | |
10993 | + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work | |
10994 | + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd. | |
10995 | + */ | |
10996 | + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0); | |
10997 | + drive_info = DRIVE_INFO; | |
10998 | + screen_info = SCREEN_INFO; | |
10999 | + copy_edid(); | |
11000 | + apm_info.bios = APM_BIOS_INFO; | |
11001 | + ist_info = IST_INFO; | |
11002 | + saved_videomode = VIDEO_MODE; | |
11003 | + if( SYS_DESC_TABLE.length != 0 ) { | |
11004 | + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); | |
11005 | + machine_id = SYS_DESC_TABLE.table[0]; | |
11006 | + machine_submodel_id = SYS_DESC_TABLE.table[1]; | |
11007 | + BIOS_revision = SYS_DESC_TABLE.table[2]; | |
11008 | + } | |
11009 | + bootloader_type = LOADER_TYPE; | |
11010 | + | |
11011 | + if (is_initial_xendomain()) { | |
11012 | + const struct dom0_vga_console_info *info = | |
11013 | + (void *)((char *)xen_start_info + | |
11014 | + xen_start_info->console.dom0.info_off); | |
11015 | + | |
11016 | + dom0_init_screen_info(info, | |
11017 | + xen_start_info->console.dom0.info_size); | |
11018 | + xen_start_info->console.domU.mfn = 0; | |
11019 | + xen_start_info->console.domU.evtchn = 0; | |
11020 | + } else | |
11021 | + screen_info.orig_video_isVGA = 0; | |
11022 | + | |
11023 | +#ifdef CONFIG_BLK_DEV_RAM | |
11024 | + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | |
11025 | + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | |
11026 | + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | |
11027 | +#endif | |
11028 | + | |
11029 | + ARCH_SETUP | |
11030 | + if (efi_enabled) | |
11031 | + efi_init(); | |
11032 | + else { | |
11033 | + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
11034 | + print_memory_map(machine_specific_memory_setup()); | |
11035 | + } | |
11036 | + | |
11037 | + copy_edd(); | |
11038 | + | |
11039 | + if (!MOUNT_ROOT_RDONLY) | |
11040 | + root_mountflags &= ~MS_RDONLY; | |
11041 | + init_mm.start_code = (unsigned long) _text; | |
11042 | + init_mm.end_code = (unsigned long) _etext; | |
11043 | + init_mm.end_data = (unsigned long) _edata; | |
11044 | + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) + | |
11045 | + xen_start_info->nr_pt_frames) << PAGE_SHIFT; | |
11046 | + | |
11047 | + code_resource.start = virt_to_phys(_text); | |
11048 | + code_resource.end = virt_to_phys(_etext)-1; | |
11049 | + data_resource.start = virt_to_phys(_etext); | |
11050 | + data_resource.end = virt_to_phys(_edata)-1; | |
11051 | + | |
11052 | + parse_cmdline_early(cmdline_p); | |
11053 | + | |
11054 | +#ifdef CONFIG_EARLY_PRINTK | |
11055 | + { | |
11056 | + char *s = strstr(*cmdline_p, "earlyprintk="); | |
11057 | + if (s) { | |
11058 | + setup_early_printk(strchr(s, '=') + 1); | |
11059 | + printk("early console enabled\n"); | |
11060 | + } | |
11061 | + } | |
11062 | +#endif | |
11063 | + | |
11064 | + max_low_pfn = setup_memory(); | |
11065 | + | |
11066 | + /* | |
11067 | + * NOTE: before this point _nobody_ is allowed to allocate | |
11068 | + * any memory using the bootmem allocator. Although the | |
11069 | + * alloctor is now initialised only the first 8Mb of the kernel | |
11070 | + * virtual address space has been mapped. All allocations before | |
11071 | + * paging_init() has completed must use the alloc_bootmem_low_pages() | |
11072 | + * variant (which allocates DMA'able memory) and care must be taken | |
11073 | + * not to exceed the 8Mb limit. | |
11074 | + */ | |
11075 | + | |
11076 | +#ifdef CONFIG_SMP | |
11077 | + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ | |
11078 | +#endif | |
11079 | + paging_init(); | |
11080 | + remapped_pgdat_init(); | |
11081 | + sparse_init(); | |
11082 | + zone_sizes_init(); | |
11083 | + | |
11084 | +#ifdef CONFIG_X86_FIND_SMP_CONFIG | |
11085 | + /* | |
11086 | + * Find and reserve possible boot-time SMP configuration: | |
11087 | + */ | |
11088 | + find_smp_config(); | |
11089 | +#endif | |
11090 | + | |
11091 | + p2m_pages = max_pfn; | |
11092 | + if (xen_start_info->nr_pages > max_pfn) { | |
11093 | + /* | |
11094 | + * the max_pfn was shrunk (probably by mem= or highmem= | |
11095 | + * kernel parameter); shrink reservation with the HV | |
11096 | + */ | |
11097 | + struct xen_memory_reservation reservation = { | |
11098 | + .address_bits = 0, | |
11099 | + .extent_order = 0, | |
11100 | + .domid = DOMID_SELF | |
11101 | + }; | |
11102 | + unsigned int difference; | |
11103 | + int ret; | |
11104 | + | |
11105 | + difference = xen_start_info->nr_pages - max_pfn; | |
11106 | + | |
11107 | + set_xen_guest_handle(reservation.extent_start, | |
11108 | + ((unsigned long *)xen_start_info->mfn_list) + max_pfn); | |
11109 | + reservation.nr_extents = difference; | |
11110 | + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
11111 | + &reservation); | |
11112 | + BUG_ON (ret != difference); | |
11113 | + } | |
11114 | + else if (max_pfn > xen_start_info->nr_pages) | |
11115 | + p2m_pages = xen_start_info->nr_pages; | |
11116 | + | |
11117 | + /* Make sure we have a correctly sized P->M table. */ | |
11118 | + if (!xen_feature(XENFEAT_auto_translated_physmap)) { | |
11119 | + phys_to_machine_mapping = alloc_bootmem_low_pages( | |
11120 | + max_pfn * sizeof(unsigned long)); | |
11121 | + memset(phys_to_machine_mapping, ~0, | |
11122 | + max_pfn * sizeof(unsigned long)); | |
11123 | + memcpy(phys_to_machine_mapping, | |
11124 | + (unsigned long *)xen_start_info->mfn_list, | |
11125 | + p2m_pages * sizeof(unsigned long)); | |
11126 | + free_bootmem( | |
11127 | + __pa(xen_start_info->mfn_list), | |
11128 | + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * | |
11129 | + sizeof(unsigned long)))); | |
11130 | + | |
11131 | + /* | |
11132 | + * Initialise the list of the frames that specify the list of | |
11133 | + * frames that make up the p2m table. Used by save/restore | |
11134 | + */ | |
11135 | + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE); | |
11136 | + | |
11137 | + fpp = PAGE_SIZE/sizeof(unsigned long); | |
11138 | + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) { | |
11139 | + if ((j % fpp) == 0) { | |
11140 | + k++; | |
11141 | + BUG_ON(k>=16); | |
11142 | + pfn_to_mfn_frame_list[k] = | |
11143 | + alloc_bootmem_low_pages(PAGE_SIZE); | |
11144 | + pfn_to_mfn_frame_list_list[k] = | |
11145 | + virt_to_mfn(pfn_to_mfn_frame_list[k]); | |
11146 | + j=0; | |
11147 | + } | |
11148 | + pfn_to_mfn_frame_list[k][j] = | |
11149 | + virt_to_mfn(&phys_to_machine_mapping[i]); | |
11150 | + } | |
11151 | + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; | |
11152 | + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | |
11153 | + virt_to_mfn(pfn_to_mfn_frame_list_list); | |
11154 | + } | |
11155 | + | |
11156 | + /* Mark all ISA DMA channels in-use - using them wouldn't work. */ | |
11157 | + for (i = 0; i < MAX_DMA_CHANNELS; ++i) | |
11158 | + if (i != 4 && request_dma(i, "xen") != 0) | |
11159 | + BUG(); | |
11160 | + | |
11161 | + /* | |
11162 | + * NOTE: at this point the bootmem allocator is fully available. | |
11163 | + */ | |
11164 | + | |
11165 | + if (is_initial_xendomain()) | |
11166 | + dmi_scan_machine(); | |
11167 | + | |
11168 | +#ifdef CONFIG_X86_GENERICARCH | |
11169 | + generic_apic_probe(*cmdline_p); | |
11170 | +#endif | |
11171 | + if (efi_enabled) | |
11172 | + efi_map_memmap(); | |
11173 | + | |
11174 | + set_iopl.iopl = 1; | |
11175 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
11176 | + | |
11177 | +#ifdef CONFIG_ACPI | |
11178 | + if (!is_initial_xendomain()) { | |
11179 | + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | |
11180 | + acpi_disabled = 1; | |
11181 | + acpi_ht = 0; | |
11182 | + } | |
11183 | + | |
11184 | + /* | |
11185 | + * Parse the ACPI tables for possible boot-time SMP configuration. | |
11186 | + */ | |
11187 | + acpi_boot_table_init(); | |
11188 | +#endif | |
11189 | + | |
11190 | +#ifdef CONFIG_X86_IO_APIC | |
11191 | + check_acpi_pci(); /* Checks more than just ACPI actually */ | |
11192 | +#endif | |
11193 | + | |
11194 | +#ifdef CONFIG_ACPI | |
11195 | + acpi_boot_init(); | |
11196 | + | |
11197 | +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) | |
11198 | + if (def_to_bigsmp) | |
11199 | + printk(KERN_WARNING "More than 8 CPUs detected and " | |
11200 | + "CONFIG_X86_PC cannot handle it.\nUse " | |
11201 | + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); | |
11202 | +#endif | |
11203 | +#endif | |
11204 | +#ifdef CONFIG_X86_LOCAL_APIC | |
11205 | + if (smp_found_config) | |
11206 | + get_smp_config(); | |
11207 | +#endif | |
11208 | + | |
11209 | + register_memory(); | |
11210 | + | |
11211 | + if (is_initial_xendomain()) { | |
11212 | +#ifdef CONFIG_VT | |
11213 | +#if defined(CONFIG_VGA_CONSOLE) | |
11214 | + if (!efi_enabled || | |
11215 | + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | |
11216 | + conswitchp = &vga_con; | |
11217 | +#elif defined(CONFIG_DUMMY_CONSOLE) | |
11218 | + conswitchp = &dummy_con; | |
11219 | +#endif | |
11220 | +#endif | |
11221 | + } else { | |
11222 | +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE) | |
11223 | + conswitchp = &dummy_con; | |
11224 | +#endif | |
11225 | + } | |
11226 | + tsc_init(); | |
11227 | +} | |
11228 | + | |
11229 | +static int | |
11230 | +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) | |
11231 | +{ | |
11232 | + HYPERVISOR_shutdown(SHUTDOWN_crash); | |
11233 | + /* we're never actually going to get here... */ | |
11234 | + return NOTIFY_DONE; | |
11235 | +} | |
11236 | + | |
11237 | +static __init int add_pcspkr(void) | |
11238 | +{ | |
11239 | + struct platform_device *pd; | |
11240 | + int ret; | |
11241 | + | |
11242 | + if (!is_initial_xendomain()) | |
11243 | + return 0; | |
11244 | + | |
11245 | + pd = platform_device_alloc("pcspkr", -1); | |
11246 | + if (!pd) | |
11247 | + return -ENOMEM; | |
11248 | + | |
11249 | + ret = platform_device_add(pd); | |
11250 | + if (ret) | |
11251 | + platform_device_put(pd); | |
11252 | + | |
11253 | + return ret; | |
11254 | +} | |
11255 | +device_initcall(add_pcspkr); | |
11256 | + | |
11257 | +/* | |
11258 | + * Local Variables: | |
11259 | + * mode:c | |
11260 | + * c-file-style:"k&r" | |
11261 | + * c-basic-offset:8 | |
11262 | + * End: | |
11263 | + */ | |
11264 | Index: head-2008-11-25/arch/x86/kernel/smp_32-xen.c | |
11265 | =================================================================== | |
11266 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
11267 | +++ head-2008-11-25/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100 | |
11268 | @@ -0,0 +1,605 @@ | |
11269 | +/* | |
11270 | + * Intel SMP support routines. | |
11271 | + * | |
11272 | + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | |
11273 | + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | |
11274 | + * | |
11275 | + * This code is released under the GNU General Public License version 2 or | |
11276 | + * later. | |
11277 | + */ | |
11278 | + | |
11279 | +#include <linux/init.h> | |
11280 | + | |
11281 | +#include <linux/mm.h> | |
11282 | +#include <linux/delay.h> | |
11283 | +#include <linux/spinlock.h> | |
11284 | +#include <linux/smp_lock.h> | |
11285 | +#include <linux/kernel_stat.h> | |
11286 | +#include <linux/mc146818rtc.h> | |
11287 | +#include <linux/cache.h> | |
11288 | +#include <linux/interrupt.h> | |
11289 | +#include <linux/cpu.h> | |
11290 | +#include <linux/module.h> | |
11291 | + | |
11292 | +#include <asm/mtrr.h> | |
11293 | +#include <asm/tlbflush.h> | |
11294 | +#if 0 | |
11295 | +#include <mach_apic.h> | |
11296 | +#endif | |
11297 | +#include <xen/evtchn.h> | |
11298 | + | |
11299 | +/* | |
11300 | + * Some notes on x86 processor bugs affecting SMP operation: | |
11301 | + * | |
11302 | + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. | |
11303 | + * The Linux implications for SMP are handled as follows: | |
11304 | + * | |
11305 | + * Pentium III / [Xeon] | |
11306 | + * None of the E1AP-E3AP errata are visible to the user. | |
11307 | + * | |
11308 | + * E1AP. see PII A1AP | |
11309 | + * E2AP. see PII A2AP | |
11310 | + * E3AP. see PII A3AP | |
11311 | + * | |
11312 | + * Pentium II / [Xeon] | |
11313 | + * None of the A1AP-A3AP errata are visible to the user. | |
11314 | + * | |
11315 | + * A1AP. see PPro 1AP | |
11316 | + * A2AP. see PPro 2AP | |
11317 | + * A3AP. see PPro 7AP | |
11318 | + * | |
11319 | + * Pentium Pro | |
11320 | + * None of 1AP-9AP errata are visible to the normal user, | |
11321 | + * except occasional delivery of 'spurious interrupt' as trap #15. | |
11322 | + * This is very rare and a non-problem. | |
11323 | + * | |
11324 | + * 1AP. Linux maps APIC as non-cacheable | |
11325 | + * 2AP. worked around in hardware | |
11326 | + * 3AP. fixed in C0 and above steppings microcode update. | |
11327 | + * Linux does not use excessive STARTUP_IPIs. | |
11328 | + * 4AP. worked around in hardware | |
11329 | + * 5AP. symmetric IO mode (normal Linux operation) not affected. | |
11330 | + * 'noapic' mode has vector 0xf filled out properly. | |
11331 | + * 6AP. 'noapic' mode might be affected - fixed in later steppings | |
11332 | + * 7AP. We do not assume writes to the LVT deassering IRQs | |
11333 | + * 8AP. We do not enable low power mode (deep sleep) during MP bootup | |
11334 | + * 9AP. We do not use mixed mode | |
11335 | + * | |
11336 | + * Pentium | |
11337 | + * There is a marginal case where REP MOVS on 100MHz SMP | |
11338 | + * machines with B stepping processors can fail. XXX should provide | |
11339 | + * an L1cache=Writethrough or L1cache=off option. | |
11340 | + * | |
11341 | + * B stepping CPUs may hang. There are hardware work arounds | |
11342 | + * for this. We warn about it in case your board doesn't have the work | |
11343 | + * arounds. Basically thats so I can tell anyone with a B stepping | |
11344 | + * CPU and SMP problems "tough". | |
11345 | + * | |
11346 | + * Specific items [From Pentium Processor Specification Update] | |
11347 | + * | |
11348 | + * 1AP. Linux doesn't use remote read | |
11349 | + * 2AP. Linux doesn't trust APIC errors | |
11350 | + * 3AP. We work around this | |
11351 | + * 4AP. Linux never generated 3 interrupts of the same priority | |
11352 | + * to cause a lost local interrupt. | |
11353 | + * 5AP. Remote read is never used | |
11354 | + * 6AP. not affected - worked around in hardware | |
11355 | + * 7AP. not affected - worked around in hardware | |
11356 | + * 8AP. worked around in hardware - we get explicit CS errors if not | |
11357 | + * 9AP. only 'noapic' mode affected. Might generate spurious | |
11358 | + * interrupts, we log only the first one and count the | |
11359 | + * rest silently. | |
11360 | + * 10AP. not affected - worked around in hardware | |
11361 | + * 11AP. Linux reads the APIC between writes to avoid this, as per | |
11362 | + * the documentation. Make sure you preserve this as it affects | |
11363 | + * the C stepping chips too. | |
11364 | + * 12AP. not affected - worked around in hardware | |
11365 | + * 13AP. not affected - worked around in hardware | |
11366 | + * 14AP. we always deassert INIT during bootup | |
11367 | + * 15AP. not affected - worked around in hardware | |
11368 | + * 16AP. not affected - worked around in hardware | |
11369 | + * 17AP. not affected - worked around in hardware | |
11370 | + * 18AP. not affected - worked around in hardware | |
11371 | + * 19AP. not affected - worked around in BIOS | |
11372 | + * | |
11373 | + * If this sounds worrying believe me these bugs are either ___RARE___, | |
11374 | + * or are signal timing bugs worked around in hardware and there's | |
11375 | + * about nothing of note with C stepping upwards. | |
11376 | + */ | |
11377 | + | |
11378 | +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; | |
11379 | + | |
11380 | +/* | |
11381 | + * the following functions deal with sending IPIs between CPUs. | |
11382 | + * | |
11383 | + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. | |
11384 | + */ | |
11385 | + | |
11386 | +static inline int __prepare_ICR (unsigned int shortcut, int vector) | |
11387 | +{ | |
11388 | + unsigned int icr = shortcut | APIC_DEST_LOGICAL; | |
11389 | + | |
11390 | + switch (vector) { | |
11391 | + default: | |
11392 | + icr |= APIC_DM_FIXED | vector; | |
11393 | + break; | |
11394 | + case NMI_VECTOR: | |
11395 | + icr |= APIC_DM_NMI; | |
11396 | + break; | |
11397 | + } | |
11398 | + return icr; | |
11399 | +} | |
11400 | + | |
11401 | +static inline int __prepare_ICR2 (unsigned int mask) | |
11402 | +{ | |
11403 | + return SET_APIC_DEST_FIELD(mask); | |
11404 | +} | |
11405 | + | |
11406 | +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); | |
11407 | + | |
11408 | +static inline void __send_IPI_one(unsigned int cpu, int vector) | |
11409 | +{ | |
11410 | + int irq = per_cpu(ipi_to_irq, cpu)[vector]; | |
11411 | + BUG_ON(irq < 0); | |
11412 | + notify_remote_via_irq(irq); | |
11413 | +} | |
11414 | + | |
11415 | +void __send_IPI_shortcut(unsigned int shortcut, int vector) | |
11416 | +{ | |
11417 | + int cpu; | |
11418 | + | |
11419 | + switch (shortcut) { | |
11420 | + case APIC_DEST_SELF: | |
11421 | + __send_IPI_one(smp_processor_id(), vector); | |
11422 | + break; | |
11423 | + case APIC_DEST_ALLBUT: | |
11424 | + for (cpu = 0; cpu < NR_CPUS; ++cpu) { | |
11425 | + if (cpu == smp_processor_id()) | |
11426 | + continue; | |
11427 | + if (cpu_isset(cpu, cpu_online_map)) { | |
11428 | + __send_IPI_one(cpu, vector); | |
11429 | + } | |
11430 | + } | |
11431 | + break; | |
11432 | + default: | |
11433 | + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, | |
11434 | + vector); | |
11435 | + break; | |
11436 | + } | |
11437 | +} | |
11438 | + | |
11439 | +void fastcall send_IPI_self(int vector) | |
11440 | +{ | |
11441 | + __send_IPI_shortcut(APIC_DEST_SELF, vector); | |
11442 | +} | |
11443 | + | |
11444 | +/* | |
11445 | + * This is only used on smaller machines. | |
11446 | + */ | |
11447 | +void send_IPI_mask_bitmask(cpumask_t mask, int vector) | |
11448 | +{ | |
11449 | + unsigned long flags; | |
11450 | + unsigned int cpu; | |
11451 | + | |
11452 | + local_irq_save(flags); | |
11453 | + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); | |
11454 | + | |
11455 | + for (cpu = 0; cpu < NR_CPUS; ++cpu) { | |
11456 | + if (cpu_isset(cpu, mask)) { | |
11457 | + __send_IPI_one(cpu, vector); | |
11458 | + } | |
11459 | + } | |
11460 | + | |
11461 | + local_irq_restore(flags); | |
11462 | +} | |
11463 | + | |
11464 | +void send_IPI_mask_sequence(cpumask_t mask, int vector) | |
11465 | +{ | |
11466 | + | |
11467 | + send_IPI_mask_bitmask(mask, vector); | |
11468 | +} | |
11469 | + | |
11470 | +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ | |
11471 | + | |
11472 | +#if 0 /* XEN */ | |
11473 | +/* | |
11474 | + * Smarter SMP flushing macros. | |
11475 | + * c/o Linus Torvalds. | |
11476 | + * | |
11477 | + * These mean you can really definitely utterly forget about | |
11478 | + * writing to user space from interrupts. (Its not allowed anyway). | |
11479 | + * | |
11480 | + * Optimizations Manfred Spraul <manfred@colorfullife.com> | |
11481 | + */ | |
11482 | + | |
11483 | +static cpumask_t flush_cpumask; | |
11484 | +static struct mm_struct * flush_mm; | |
11485 | +static unsigned long flush_va; | |
11486 | +static DEFINE_SPINLOCK(tlbstate_lock); | |
11487 | +#define FLUSH_ALL 0xffffffff | |
11488 | + | |
11489 | +/* | |
11490 | + * We cannot call mmdrop() because we are in interrupt context, | |
11491 | + * instead update mm->cpu_vm_mask. | |
11492 | + * | |
11493 | + * We need to reload %cr3 since the page tables may be going | |
11494 | + * away from under us.. | |
11495 | + */ | |
11496 | +static inline void leave_mm (unsigned long cpu) | |
11497 | +{ | |
11498 | + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) | |
11499 | + BUG(); | |
11500 | + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); | |
11501 | + load_cr3(swapper_pg_dir); | |
11502 | +} | |
11503 | + | |
11504 | +/* | |
11505 | + * | |
11506 | + * The flush IPI assumes that a thread switch happens in this order: | |
11507 | + * [cpu0: the cpu that switches] | |
11508 | + * 1) switch_mm() either 1a) or 1b) | |
11509 | + * 1a) thread switch to a different mm | |
11510 | + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | |
11511 | + * Stop ipi delivery for the old mm. This is not synchronized with | |
11512 | + * the other cpus, but smp_invalidate_interrupt ignore flush ipis | |
11513 | + * for the wrong mm, and in the worst case we perform a superflous | |
11514 | + * tlb flush. | |
11515 | + * 1a2) set cpu_tlbstate to TLBSTATE_OK | |
11516 | + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | |
11517 | + * was in lazy tlb mode. | |
11518 | + * 1a3) update cpu_tlbstate[].active_mm | |
11519 | + * Now cpu0 accepts tlb flushes for the new mm. | |
11520 | + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | |
11521 | + * Now the other cpus will send tlb flush ipis. | |
11522 | + * 1a4) change cr3. | |
11523 | + * 1b) thread switch without mm change | |
11524 | + * cpu_tlbstate[].active_mm is correct, cpu0 already handles | |
11525 | + * flush ipis. | |
11526 | + * 1b1) set cpu_tlbstate to TLBSTATE_OK | |
11527 | + * 1b2) test_and_set the cpu bit in cpu_vm_mask. | |
11528 | + * Atomically set the bit [other cpus will start sending flush ipis], | |
11529 | + * and test the bit. | |
11530 | + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | |
11531 | + * 2) switch %%esp, ie current | |
11532 | + * | |
11533 | + * The interrupt must handle 2 special cases: | |
11534 | + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | |
11535 | + * - the cpu performs speculative tlb reads, i.e. even if the cpu only | |
11536 | + * runs in kernel space, the cpu could load tlb entries for user space | |
11537 | + * pages. | |
11538 | + * | |
11539 | + * The good news is that cpu_tlbstate is local to each cpu, no | |
11540 | + * write/read ordering problems. | |
11541 | + */ | |
11542 | + | |
11543 | +/* | |
11544 | + * TLB flush IPI: | |
11545 | + * | |
11546 | + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | |
11547 | + * 2) Leave the mm if we are in the lazy tlb mode. | |
11548 | + */ | |
11549 | + | |
11550 | +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, | |
11551 | + struct pt_regs *regs) | |
11552 | +{ | |
11553 | + unsigned long cpu; | |
11554 | + | |
11555 | + cpu = get_cpu(); | |
11556 | + | |
11557 | + if (!cpu_isset(cpu, flush_cpumask)) | |
11558 | + goto out; | |
11559 | + /* | |
11560 | + * This was a BUG() but until someone can quote me the | |
11561 | + * line from the intel manual that guarantees an IPI to | |
11562 | + * multiple CPUs is retried _only_ on the erroring CPUs | |
11563 | + * its staying as a return | |
11564 | + * | |
11565 | + * BUG(); | |
11566 | + */ | |
11567 | + | |
11568 | + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { | |
11569 | + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { | |
11570 | + if (flush_va == FLUSH_ALL) | |
11571 | + local_flush_tlb(); | |
11572 | + else | |
11573 | + __flush_tlb_one(flush_va); | |
11574 | + } else | |
11575 | + leave_mm(cpu); | |
11576 | + } | |
11577 | + smp_mb__before_clear_bit(); | |
11578 | + cpu_clear(cpu, flush_cpumask); | |
11579 | + smp_mb__after_clear_bit(); | |
11580 | +out: | |
11581 | + put_cpu_no_resched(); | |
11582 | + | |
11583 | + return IRQ_HANDLED; | |
11584 | +} | |
11585 | + | |
11586 | +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | |
11587 | + unsigned long va) | |
11588 | +{ | |
11589 | + /* | |
11590 | + * A couple of (to be removed) sanity checks: | |
11591 | + * | |
11592 | + * - current CPU must not be in mask | |
11593 | + * - mask must exist :) | |
11594 | + */ | |
11595 | + BUG_ON(cpus_empty(cpumask)); | |
11596 | + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); | |
11597 | + BUG_ON(!mm); | |
11598 | + | |
11599 | + /* If a CPU which we ran on has gone down, OK. */ | |
11600 | + cpus_and(cpumask, cpumask, cpu_online_map); | |
11601 | + if (cpus_empty(cpumask)) | |
11602 | + return; | |
11603 | + | |
11604 | + /* | |
11605 | + * i'm not happy about this global shared spinlock in the | |
11606 | + * MM hot path, but we'll see how contended it is. | |
11607 | + * Temporarily this turns IRQs off, so that lockups are | |
11608 | + * detected by the NMI watchdog. | |
11609 | + */ | |
11610 | + spin_lock(&tlbstate_lock); | |
11611 | + | |
11612 | + flush_mm = mm; | |
11613 | + flush_va = va; | |
11614 | +#if NR_CPUS <= BITS_PER_LONG | |
11615 | + atomic_set_mask(cpumask, &flush_cpumask); | |
11616 | +#else | |
11617 | + { | |
11618 | + int k; | |
11619 | + unsigned long *flush_mask = (unsigned long *)&flush_cpumask; | |
11620 | + unsigned long *cpu_mask = (unsigned long *)&cpumask; | |
11621 | + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) | |
11622 | + atomic_set_mask(cpu_mask[k], &flush_mask[k]); | |
11623 | + } | |
11624 | +#endif | |
11625 | + /* | |
11626 | + * We have to send the IPI only to | |
11627 | + * CPUs affected. | |
11628 | + */ | |
11629 | + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); | |
11630 | + | |
11631 | + while (!cpus_empty(flush_cpumask)) | |
11632 | + /* nothing. lockup detection does not belong here */ | |
11633 | + mb(); | |
11634 | + | |
11635 | + flush_mm = NULL; | |
11636 | + flush_va = 0; | |
11637 | + spin_unlock(&tlbstate_lock); | |
11638 | +} | |
11639 | + | |
11640 | +void flush_tlb_current_task(void) | |
11641 | +{ | |
11642 | + struct mm_struct *mm = current->mm; | |
11643 | + cpumask_t cpu_mask; | |
11644 | + | |
11645 | + preempt_disable(); | |
11646 | + cpu_mask = mm->cpu_vm_mask; | |
11647 | + cpu_clear(smp_processor_id(), cpu_mask); | |
11648 | + | |
11649 | + local_flush_tlb(); | |
11650 | + if (!cpus_empty(cpu_mask)) | |
11651 | + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
11652 | + preempt_enable(); | |
11653 | +} | |
11654 | + | |
11655 | +void flush_tlb_mm (struct mm_struct * mm) | |
11656 | +{ | |
11657 | + cpumask_t cpu_mask; | |
11658 | + | |
11659 | + preempt_disable(); | |
11660 | + cpu_mask = mm->cpu_vm_mask; | |
11661 | + cpu_clear(smp_processor_id(), cpu_mask); | |
11662 | + | |
11663 | + if (current->active_mm == mm) { | |
11664 | + if (current->mm) | |
11665 | + local_flush_tlb(); | |
11666 | + else | |
11667 | + leave_mm(smp_processor_id()); | |
11668 | + } | |
11669 | + if (!cpus_empty(cpu_mask)) | |
11670 | + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
11671 | + | |
11672 | + preempt_enable(); | |
11673 | +} | |
11674 | + | |
11675 | +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | |
11676 | +{ | |
11677 | + struct mm_struct *mm = vma->vm_mm; | |
11678 | + cpumask_t cpu_mask; | |
11679 | + | |
11680 | + preempt_disable(); | |
11681 | + cpu_mask = mm->cpu_vm_mask; | |
11682 | + cpu_clear(smp_processor_id(), cpu_mask); | |
11683 | + | |
11684 | + if (current->active_mm == mm) { | |
11685 | + if(current->mm) | |
11686 | + __flush_tlb_one(va); | |
11687 | + else | |
11688 | + leave_mm(smp_processor_id()); | |
11689 | + } | |
11690 | + | |
11691 | + if (!cpus_empty(cpu_mask)) | |
11692 | + flush_tlb_others(cpu_mask, mm, va); | |
11693 | + | |
11694 | + preempt_enable(); | |
11695 | +} | |
11696 | +EXPORT_SYMBOL(flush_tlb_page); | |
11697 | + | |
11698 | +static void do_flush_tlb_all(void* info) | |
11699 | +{ | |
11700 | + unsigned long cpu = smp_processor_id(); | |
11701 | + | |
11702 | + __flush_tlb_all(); | |
11703 | + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) | |
11704 | + leave_mm(cpu); | |
11705 | +} | |
11706 | + | |
11707 | +void flush_tlb_all(void) | |
11708 | +{ | |
11709 | + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | |
11710 | +} | |
11711 | + | |
11712 | +#endif /* XEN */ | |
11713 | + | |
11714 | +/* | |
11715 | + * this function sends a 'reschedule' IPI to another CPU. | |
11716 | + * it goes straight through and wastes no time serializing | |
11717 | + * anything. Worst case is that we lose a reschedule ... | |
11718 | + */ | |
11719 | +void smp_send_reschedule(int cpu) | |
11720 | +{ | |
11721 | + WARN_ON(cpu_is_offline(cpu)); | |
11722 | + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | |
11723 | +} | |
11724 | + | |
11725 | +/* | |
11726 | + * Structure and data for smp_call_function(). This is designed to minimise | |
11727 | + * static memory requirements. It also looks cleaner. | |
11728 | + */ | |
11729 | +static DEFINE_SPINLOCK(call_lock); | |
11730 | + | |
11731 | +struct call_data_struct { | |
11732 | + void (*func) (void *info); | |
11733 | + void *info; | |
11734 | + atomic_t started; | |
11735 | + atomic_t finished; | |
11736 | + int wait; | |
11737 | +}; | |
11738 | + | |
11739 | +void lock_ipi_call_lock(void) | |
11740 | +{ | |
11741 | + spin_lock_irq(&call_lock); | |
11742 | +} | |
11743 | + | |
11744 | +void unlock_ipi_call_lock(void) | |
11745 | +{ | |
11746 | + spin_unlock_irq(&call_lock); | |
11747 | +} | |
11748 | + | |
11749 | +static struct call_data_struct *call_data; | |
11750 | + | |
11751 | +/** | |
11752 | + * smp_call_function(): Run a function on all other CPUs. | |
11753 | + * @func: The function to run. This must be fast and non-blocking. | |
11754 | + * @info: An arbitrary pointer to pass to the function. | |
11755 | + * @nonatomic: currently unused. | |
11756 | + * @wait: If true, wait (atomically) until function has completed on other CPUs. | |
11757 | + * | |
11758 | + * Returns 0 on success, else a negative status code. Does not return until | |
11759 | + * remote CPUs are nearly ready to execute <<func>> or are or have executed. | |
11760 | + * | |
11761 | + * You must not call this function with disabled interrupts or from a | |
11762 | + * hardware interrupt handler or from a bottom half handler. | |
11763 | + */ | |
11764 | +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | |
11765 | + int wait) | |
11766 | +{ | |
11767 | + struct call_data_struct data; | |
11768 | + int cpus; | |
11769 | + | |
11770 | + /* Holding any lock stops cpus from going down. */ | |
11771 | + spin_lock(&call_lock); | |
11772 | + cpus = num_online_cpus() - 1; | |
11773 | + if (!cpus) { | |
11774 | + spin_unlock(&call_lock); | |
11775 | + return 0; | |
11776 | + } | |
11777 | + | |
11778 | + /* Can deadlock when called with interrupts disabled */ | |
11779 | + WARN_ON(irqs_disabled()); | |
11780 | + | |
11781 | + data.func = func; | |
11782 | + data.info = info; | |
11783 | + atomic_set(&data.started, 0); | |
11784 | + data.wait = wait; | |
11785 | + if (wait) | |
11786 | + atomic_set(&data.finished, 0); | |
11787 | + | |
11788 | + call_data = &data; | |
11789 | + mb(); | |
11790 | + | |
11791 | + /* Send a message to all other CPUs and wait for them to respond */ | |
11792 | + send_IPI_allbutself(CALL_FUNCTION_VECTOR); | |
11793 | + | |
11794 | + /* Wait for response */ | |
11795 | + while (atomic_read(&data.started) != cpus) | |
11796 | + cpu_relax(); | |
11797 | + | |
11798 | + if (wait) | |
11799 | + while (atomic_read(&data.finished) != cpus) | |
11800 | + cpu_relax(); | |
11801 | + spin_unlock(&call_lock); | |
11802 | + | |
11803 | + return 0; | |
11804 | +} | |
11805 | +EXPORT_SYMBOL(smp_call_function); | |
11806 | + | |
11807 | +static void stop_this_cpu (void * dummy) | |
11808 | +{ | |
11809 | + /* | |
11810 | + * Remove this CPU: | |
11811 | + */ | |
11812 | + cpu_clear(smp_processor_id(), cpu_online_map); | |
11813 | + local_irq_disable(); | |
11814 | + disable_all_local_evtchn(); | |
11815 | + if (cpu_data[smp_processor_id()].hlt_works_ok) | |
11816 | + for(;;) halt(); | |
11817 | + for (;;); | |
11818 | +} | |
11819 | + | |
11820 | +/* | |
11821 | + * this function calls the 'stop' function on all other CPUs in the system. | |
11822 | + */ | |
11823 | + | |
11824 | +void smp_send_stop(void) | |
11825 | +{ | |
11826 | + smp_call_function(stop_this_cpu, NULL, 1, 0); | |
11827 | + | |
11828 | + local_irq_disable(); | |
11829 | + disable_all_local_evtchn(); | |
11830 | + local_irq_enable(); | |
11831 | +} | |
11832 | + | |
11833 | +/* | |
11834 | + * Reschedule call back. Nothing to do, | |
11835 | + * all the work is done automatically when | |
11836 | + * we return from the interrupt. | |
11837 | + */ | |
11838 | +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id, | |
11839 | + struct pt_regs *regs) | |
11840 | +{ | |
11841 | + | |
11842 | + return IRQ_HANDLED; | |
11843 | +} | |
11844 | + | |
11845 | +#include <linux/kallsyms.h> | |
11846 | +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id, | |
11847 | + struct pt_regs *regs) | |
11848 | +{ | |
11849 | + void (*func) (void *info) = call_data->func; | |
11850 | + void *info = call_data->info; | |
11851 | + int wait = call_data->wait; | |
11852 | + | |
11853 | + /* | |
11854 | + * Notify initiating CPU that I've grabbed the data and am | |
11855 | + * about to execute the function | |
11856 | + */ | |
11857 | + mb(); | |
11858 | + atomic_inc(&call_data->started); | |
11859 | + /* | |
11860 | + * At this point the info structure may be out of scope unless wait==1 | |
11861 | + */ | |
11862 | + irq_enter(); | |
11863 | + (*func)(info); | |
11864 | + irq_exit(); | |
11865 | + | |
11866 | + if (wait) { | |
11867 | + mb(); | |
11868 | + atomic_inc(&call_data->finished); | |
11869 | + } | |
11870 | + | |
11871 | + return IRQ_HANDLED; | |
11872 | +} | |
11873 | + | |
11874 | Index: head-2008-11-25/arch/x86/kernel/time_32-xen.c | |
11875 | =================================================================== | |
11876 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
11877 | +++ head-2008-11-25/arch/x86/kernel/time_32-xen.c 2008-09-01 12:07:31.000000000 +0200 | |
11878 | @@ -0,0 +1,1209 @@ | |
11879 | +/* | |
11880 | + * linux/arch/i386/kernel/time.c | |
11881 | + * | |
11882 | + * Copyright (C) 1991, 1992, 1995 Linus Torvalds | |
11883 | + * | |
11884 | + * This file contains the PC-specific time handling details: | |
11885 | + * reading the RTC at bootup, etc.. | |
11886 | + * 1994-07-02 Alan Modra | |
11887 | + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime | |
11888 | + * 1995-03-26 Markus Kuhn | |
11889 | + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 | |
11890 | + * precision CMOS clock update | |
11891 | + * 1996-05-03 Ingo Molnar | |
11892 | + * fixed time warps in do_[slow|fast]_gettimeoffset() | |
11893 | + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | |
11894 | + * "A Kernel Model for Precision Timekeeping" by Dave Mills | |
11895 | + * 1998-09-05 (Various) | |
11896 | + * More robust do_fast_gettimeoffset() algorithm implemented | |
11897 | + * (works with APM, Cyrix 6x86MX and Centaur C6), | |
11898 | + * monotonic gettimeofday() with fast_get_timeoffset(), | |
11899 | + * drift-proof precision TSC calibration on boot | |
11900 | + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. | |
11901 | + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; | |
11902 | + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). | |
11903 | + * 1998-12-16 Andrea Arcangeli | |
11904 | + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy | |
11905 | + * because was not accounting lost_ticks. | |
11906 | + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli | |
11907 | + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | |
11908 | + * serialize accesses to xtime/lost_ticks). | |
11909 | + */ | |
11910 | + | |
11911 | +#include <linux/errno.h> | |
11912 | +#include <linux/sched.h> | |
11913 | +#include <linux/kernel.h> | |
11914 | +#include <linux/param.h> | |
11915 | +#include <linux/string.h> | |
11916 | +#include <linux/mm.h> | |
11917 | +#include <linux/interrupt.h> | |
11918 | +#include <linux/time.h> | |
11919 | +#include <linux/delay.h> | |
11920 | +#include <linux/init.h> | |
11921 | +#include <linux/smp.h> | |
11922 | +#include <linux/module.h> | |
11923 | +#include <linux/sysdev.h> | |
11924 | +#include <linux/bcd.h> | |
11925 | +#include <linux/efi.h> | |
11926 | +#include <linux/mca.h> | |
11927 | +#include <linux/sysctl.h> | |
11928 | +#include <linux/percpu.h> | |
11929 | +#include <linux/kernel_stat.h> | |
11930 | +#include <linux/posix-timers.h> | |
11931 | +#include <linux/cpufreq.h> | |
11932 | + | |
11933 | +#include <asm/io.h> | |
11934 | +#include <asm/smp.h> | |
11935 | +#include <asm/irq.h> | |
11936 | +#include <asm/msr.h> | |
11937 | +#include <asm/delay.h> | |
11938 | +#include <asm/mpspec.h> | |
11939 | +#include <asm/uaccess.h> | |
11940 | +#include <asm/processor.h> | |
11941 | +#include <asm/timer.h> | |
11942 | +#include <asm/sections.h> | |
11943 | + | |
11944 | +#include "mach_time.h" | |
11945 | + | |
11946 | +#include <linux/timex.h> | |
11947 | + | |
11948 | +#include <asm/hpet.h> | |
11949 | + | |
11950 | +#include <asm/arch_hooks.h> | |
11951 | + | |
11952 | +#include <xen/evtchn.h> | |
11953 | +#include <xen/interface/vcpu.h> | |
11954 | + | |
11955 | +#if defined (__i386__) | |
11956 | +#include <asm/i8259.h> | |
11957 | +#endif | |
11958 | + | |
11959 | +int pit_latch_buggy; /* extern */ | |
11960 | + | |
11961 | +#if defined(__x86_64__) | |
11962 | +unsigned long vxtime_hz = PIT_TICK_RATE; | |
11963 | +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ | |
11964 | +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | |
11965 | +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; | |
11966 | +struct timespec __xtime __section_xtime; | |
11967 | +struct timezone __sys_tz __section_sys_tz; | |
11968 | +#endif | |
11969 | + | |
11970 | +unsigned int cpu_khz; /* Detected as we calibrate the TSC */ | |
11971 | +EXPORT_SYMBOL(cpu_khz); | |
11972 | + | |
11973 | +extern unsigned long wall_jiffies; | |
11974 | + | |
11975 | +DEFINE_SPINLOCK(rtc_lock); | |
11976 | +EXPORT_SYMBOL(rtc_lock); | |
11977 | + | |
11978 | +extern struct init_timer_opts timer_tsc_init; | |
11979 | +extern struct timer_opts timer_tsc; | |
11980 | +#define timer_none timer_tsc | |
11981 | + | |
11982 | +/* These are peridically updated in shared_info, and then copied here. */ | |
11983 | +struct shadow_time_info { | |
11984 | + u64 tsc_timestamp; /* TSC at last update of time vals. */ | |
11985 | + u64 system_timestamp; /* Time, in nanosecs, since boot. */ | |
11986 | + u32 tsc_to_nsec_mul; | |
11987 | + u32 tsc_to_usec_mul; | |
11988 | + int tsc_shift; | |
11989 | + u32 version; | |
11990 | +}; | |
11991 | +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); | |
11992 | +static struct timespec shadow_tv; | |
11993 | +static u32 shadow_tv_version; | |
11994 | + | |
11995 | +static struct timeval monotonic_tv; | |
11996 | +static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED; | |
11997 | + | |
11998 | +/* Keep track of last time we did processing/updating of jiffies and xtime. */ | |
11999 | +static u64 processed_system_time; /* System time (ns) at last processing. */ | |
12000 | +static DEFINE_PER_CPU(u64, processed_system_time); | |
12001 | + | |
12002 | +/* How much CPU time was spent blocked and how much was 'stolen'? */ | |
12003 | +static DEFINE_PER_CPU(u64, processed_stolen_time); | |
12004 | +static DEFINE_PER_CPU(u64, processed_blocked_time); | |
12005 | + | |
12006 | +/* Current runstate of each CPU (updated automatically by the hypervisor). */ | |
12007 | +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | |
12008 | + | |
12009 | +/* Must be signed, as it's compared with s64 quantities which can be -ve. */ | |
12010 | +#define NS_PER_TICK (1000000000LL/HZ) | |
12011 | + | |
/* Deferred clock_was_set(): scheduled from IRQ context via keventd so the
 * notification runs in process context. */
static void __clock_was_set(void *unused)
{
	clock_was_set();
}
static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);

/*
 * GCC 4.3 can turn loops over an induction variable into division. We do
 * not support arbitrary 64-bit division, and so must break the induction.
 */
#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
12023 | + | |
/*
 * Fold *nsec into the range [0, NSEC_PER_SEC) by carrying whole seconds
 * into *sec.  Implemented as subtraction loops on purpose: the clobber
 * keeps GCC from strength-reducing them into 64-bit division (see the
 * clobber_induction_variable() comment).
 */
static inline void __normalize_time(time_t *sec, s64 *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		clobber_induction_variable(*nsec);
		(*nsec) -= NSEC_PER_SEC;
		(*sec)++;
	}
	while (*nsec < 0) {
		clobber_induction_variable(*nsec);
		(*nsec) += NSEC_PER_SEC;
		(*sec)--;
	}
}
12037 | + | |
/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
/* Boot-param handler: mere presence of "independent_wallclock" enables it;
 * no value is parsed. */
static int __init __independent_wallclock(char *str)
{
	independent_wallclock = 1;
	return 1;
}
__setup("independent_wallclock", __independent_wallclock);
12046 | + | |
/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
/* Boot-param handler: permitted_clock_jitter=<nsecs> (base auto-detected). */
static int __init __permitted_clock_jitter(char *str)
{
	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);
12055 | + | |
#if 0
/* Compiled out: reference TSC busy-wait delay loop and its timer_opts. */
static void delay_tsc(unsigned long loops)
{
	unsigned long bclock, now;

	rdtscl(bclock);
	do {
		rep_nop();	/* relax the pipeline while spinning */
		rdtscl(now);
	} while ((now - bclock) < loops);
}

struct timer_opts timer_tsc = {
	.name = "tsc",
	.delay = delay_tsc,
};
#endif
12073 | + | |
/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.  Computes (delta << shift) * mul_frac / 2^32
 * without a 128-bit intermediate in C.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	/* Negative shift scales the delta down before multiplying. */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	/* 64x32 multiply done as two 32x32 partial products; the final
	 * result keeps bits 32..95 of the 96-bit product. */
	__asm__ (
		"mul %5 ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5 ; "
		"xor %5,%5 ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#else
	/* x86-64: full 64x64 product in rdx:rax, take the middle 64 bits. */
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#endif

	return product;
}
12109 | + | |
#if 0 /* defined (__i386__) */
/* Compiled out: would let generic calibration code read the TSC directly. */
int read_current_timer(unsigned long *timer_val)
{
	rdtscl(*timer_val);
	return 0;
}
#endif
12117 | + | |
12118 | +void init_cpu_khz(void) | |
12119 | +{ | |
12120 | + u64 __cpu_khz = 1000000ULL << 32; | |
12121 | + struct vcpu_time_info *info = &vcpu_info(0)->time; | |
12122 | + do_div(__cpu_khz, info->tsc_to_system_mul); | |
12123 | + if (info->tsc_shift < 0) | |
12124 | + cpu_khz = __cpu_khz << -info->tsc_shift; | |
12125 | + else | |
12126 | + cpu_khz = __cpu_khz >> info->tsc_shift; | |
12127 | +} | |
12128 | + | |
12129 | +static u64 get_nsec_offset(struct shadow_time_info *shadow) | |
12130 | +{ | |
12131 | + u64 now, delta; | |
12132 | + rdtscll(now); | |
12133 | + delta = now - shadow->tsc_timestamp; | |
12134 | + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | |
12135 | +} | |
12136 | + | |
12137 | +static unsigned long get_usec_offset(struct shadow_time_info *shadow) | |
12138 | +{ | |
12139 | + u64 now, delta; | |
12140 | + rdtscll(now); | |
12141 | + delta = now - shadow->tsc_timestamp; | |
12142 | + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift); | |
12143 | +} | |
12144 | + | |
/*
 * Install a new wallclock base (@sec/@nsec, relative to Xen system time 0)
 * into xtime and wall_to_monotonic.  Caller must hold xtime_lock.
 */
static void __update_wallclock(time_t sec, long nsec)
{
	long wtm_nsec, xtime_nsec;
	time_t wtm_sec, xtime_sec;
	u64 tmp, wc_nsec;

	/* Adjust wall-clock time base based on wall_jiffies ticks. */
	wc_nsec = processed_system_time;
	wc_nsec += sec * (u64)NSEC_PER_SEC;
	wc_nsec += nsec;
	wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;

	/* Split wallclock base into seconds and nanoseconds. */
	tmp = wc_nsec;
	xtime_nsec = do_div(tmp, 1000000000);
	xtime_sec = (time_t)tmp;

	/* Shift wall_to_monotonic oppositely so monotonic time is unchanged. */
	wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

	set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

	/* The clock stepped; discard NTP adjustment state. */
	ntp_clear();
}
12170 | + | |
/*
 * Re-read Xen's wallclock base into shadow_tv.  wc_version is odd while
 * the hypervisor is mid-update, so spin until a stable, even snapshot is
 * read, then (unless tracking the clock independently) fold it into xtime.
 */
static void update_wallclock(void)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	do {
		shadow_tv_version = s->wc_version;
		rmb();	/* fetch version before the payload */
		shadow_tv.tv_sec = s->wc_sec;
		shadow_tv.tv_nsec = s->wc_nsec;
		rmb();	/* fetch payload before re-checking version */
	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

	if (!independent_wallclock)
		__update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}
12186 | + | |
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(unsigned int cpu)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;
	unsigned long flags;
	u32 pre_version, post_version;

	src = &vcpu_info(cpu)->time;
	dst = &per_cpu(shadow_time, cpu);

	local_irq_save(flags);

	/* version is odd while Xen updates; retry until stable and even. */
	do {
		pre_version = dst->version = src->version;
		rmb();	/* read version before the payload */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		rmb();	/* read payload before re-checking version */
		post_version = src->version;
	} while ((pre_version & 1) | (pre_version ^ post_version));

	/* Derive the microsecond multiplier used by do_gettimeofday(). */
	dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;

	local_irq_restore(flags);
}
12218 | + | |
/* Is @cpu's shadow copy still in sync with Xen's published version? */
static inline int time_values_up_to_date(unsigned int cpu)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;

	src = &vcpu_info(cpu)->time;
	dst = &per_cpu(shadow_time, cpu);

	rmb();	/* ensure we observe any update Xen already published */
	return (dst->version == src->version);
}
12230 | + | |
/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with. It is required for NMI access to the
 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
 */
/* Non-static and exported: also taken by code outside this file. */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);
12238 | + | |
/* Routines for accessing the CMOS RAM/RTC. */
/* Read one CMOS/RTC register; NMI-safe via the cmos_lock protocol. */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char val;
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select register */
	val = inb_p(RTC_PORT(1));	/* read its value */
	lock_cmos_suffix(addr);
	return val;
}
EXPORT_SYMBOL(rtc_cmos_read);
12250 | + | |
/* Write one CMOS/RTC register; NMI-safe via the cmos_lock protocol. */
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select register */
	outb_p(val, RTC_PORT(1));	/* write its value */
	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
12259 | + | |
/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
void do_gettimeofday(struct timeval *tv)
{
	unsigned long seq;
	unsigned long usec, sec;
	unsigned long flags;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	u32 local_time_version;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	/* Retry until both xtime (seqlock) and the shadow copy are stable. */
	do {
		unsigned long lost;

		local_time_version = shadow->version;
		seq = read_seqbegin(&xtime_lock);

		/* TSC-derived microseconds since the last shadow update. */
		usec = get_usec_offset(shadow);
		lost = jiffies - wall_jiffies;

		/* Credit ticks the timer bottom half has not folded in yet. */
		if (unlikely(lost))
			usec += lost * (USEC_PER_SEC / HZ);

		sec = xtime.tv_sec;
		usec += (xtime.tv_nsec / NSEC_PER_USEC);

		/* System time accumulated since xtime was last advanced. */
		nsec = shadow->system_timestamp - processed_system_time;
		__normalize_time(&sec, &nsec);
		usec += (long)nsec / NSEC_PER_USEC;

		if (unlikely(!time_values_up_to_date(cpu))) {
			/*
			 * We may have blocked for a long time,
			 * rendering our calculations invalid
			 * (e.g. the time delta may have
			 * overflowed). Detect that and recalculate
			 * with fresh values.
			 */
			get_time_values_from_xen(cpu);
			continue;
		}
	} while (read_seqretry(&xtime_lock, seq) ||
		 (local_time_version != shadow->version));

	put_cpu();

	/* Carry microsecond overflow into seconds. */
	while (usec >= USEC_PER_SEC) {
		usec -= USEC_PER_SEC;
		sec++;
	}

	/*
	 * Clamp to the last value handed out, so callers never observe
	 * time going backwards (e.g. across CPUs).
	 */
	spin_lock_irqsave(&monotonic_lock, flags);
	if ((sec > monotonic_tv.tv_sec) ||
	    ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
	{
		monotonic_tv.tv_sec = sec;
		monotonic_tv.tv_usec = usec;
	} else {
		sec = monotonic_tv.tv_sec;
		usec = monotonic_tv.tv_usec;
	}
	spin_unlock_irqrestore(&monotonic_lock, flags);

	tv->tv_sec = sec;
	tv->tv_usec = usec;
}

EXPORT_SYMBOL(do_gettimeofday);
12334 | + | |
/*
 * Set the wall clock.  In dom0 (unless independent_wallclock) the new time
 * is pushed into Xen; with independent_wallclock only the local xtime is
 * rebased; a plain domU tracking Xen time changes nothing.
 * Returns 0 or -EINVAL for an out-of-range tv_nsec.
 */
int do_settimeofday(struct timespec *tv)
{
	time_t sec;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	struct xen_platform_op op;

	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
		return -EINVAL;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	write_seqlock_irq(&xtime_lock);

	/*
	 * Ensure we don't get blocked for a long time so that our time delta
	 * overflows. If that were to happen then our shadow time values would
	 * be stale, so we can retry with fresh ones.
	 */
	for (;;) {
		nsec = tv->tv_nsec - get_nsec_offset(shadow);
		if (time_values_up_to_date(cpu))
			break;
		get_time_values_from_xen(cpu);
	}
	sec = tv->tv_sec;
	__normalize_time(&sec, &nsec);

	if (is_initial_xendomain() && !independent_wallclock) {
		/* dom0: hand the new time to Xen, then re-read it. */
		op.cmd = XENPF_settime;
		op.u.settime.secs = sec;
		op.u.settime.nsecs = nsec;
		op.u.settime.system_time = shadow->system_timestamp;
		WARN_ON(HYPERVISOR_platform_op(&op));
		update_wallclock();
	} else if (independent_wallclock) {
		/* Rebase the local wallclock against our system time. */
		nsec -= shadow->system_timestamp;
		__normalize_time(&sec, &nsec);
		__update_wallclock(sec, nsec);
	}

	/* Reset monotonic gettimeofday() timeval. */
	spin_lock(&monotonic_lock);
	monotonic_tv.tv_sec = 0;
	monotonic_tv.tv_usec = 0;
	spin_unlock(&monotonic_lock);

	write_sequnlock_irq(&xtime_lock);

	put_cpu();

	clock_was_set();
	return 0;
}

EXPORT_SYMBOL(do_settimeofday);
12393 | + | |
static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
/* Timer callback: push dom0's NTP-disciplined clock back into Xen,
 * then rearm itself to run once per minute. */
static void sync_xen_wallclock(unsigned long dummy)
{
	time_t sec;
	s64 nsec;
	struct xen_platform_op op;

	/* Only an NTP-synced dom0 that tracks Xen time may update Xen. */
	if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
		return;

	write_seqlock_irq(&xtime_lock);

	/* Current wallclock, including ticks not yet folded into xtime. */
	sec = xtime.tv_sec;
	nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
	__normalize_time(&sec, &nsec);

	op.cmd = XENPF_settime;
	op.u.settime.secs = sec;
	op.u.settime.nsecs = nsec;
	op.u.settime.system_time = processed_system_time;
	WARN_ON(HYPERVISOR_platform_op(&op));

	update_wallclock();

	write_sequnlock_irq(&xtime_lock);

	/* Once per minute. */
	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}
12424 | + | |
12425 | +static int set_rtc_mmss(unsigned long nowtime) | |
12426 | +{ | |
12427 | + int retval; | |
12428 | + unsigned long flags; | |
12429 | + | |
12430 | + if (independent_wallclock || !is_initial_xendomain()) | |
12431 | + return 0; | |
12432 | + | |
12433 | + /* gets recalled with irq locally disabled */ | |
12434 | + /* XXX - does irqsave resolve this? -johnstul */ | |
12435 | + spin_lock_irqsave(&rtc_lock, flags); | |
12436 | + if (efi_enabled) | |
12437 | + retval = efi_set_rtc_mmss(nowtime); | |
12438 | + else | |
12439 | + retval = mach_set_rtc_mmss(nowtime); | |
12440 | + spin_unlock_irqrestore(&rtc_lock, flags); | |
12441 | + | |
12442 | + return retval; | |
12443 | +} | |
12444 | + | |
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 * Note: This function is required to return accurate
 * time even in the absence of multiple timer ticks.
 */
unsigned long long monotonic_clock(void)
{
	unsigned int cpu = get_cpu();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	u64 time;
	u32 local_time_version;

	/* Retry until the shadow snapshot is consistent across the read. */
	do {
		local_time_version = shadow->version;
		barrier();	/* order version read vs. the computation */
		time = shadow->system_timestamp + get_nsec_offset(shadow);
		if (!time_values_up_to_date(cpu))
			get_time_values_from_xen(cpu);
		barrier();	/* order computation vs. version re-check */
	} while (local_time_version != shadow->version);

	put_cpu();

	return time;
}
EXPORT_SYMBOL(monotonic_clock);
12470 | + | |
#ifdef __x86_64__
/* Scheduler clock: ns since boot; Xen system time is exactly that. */
unsigned long long sched_clock(void)
{
	return monotonic_clock();
}
#endif
12477 | + | |
#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
/*
 * Best-effort program counter for profiling: if the tick landed inside a
 * lock routine, try to attribute it to the lock's caller instead.
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

#ifdef __x86_64__
	/* Assume the lock function has either no stack frame or only a single word.
	   This checks if the address on the stack looks like a kernel text address.
	   There is a small window for false hits, but in that case the tick
	   is just accounted to the spinlock function.
	   Better would be to write these functions in assembler again
	   and check exactly. */
	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
		char *v = *(char **)regs->rsp;
		if ((v >= _stext && v <= _etext) ||
		    (v >= _sinittext && v <= _einittext) ||
		    (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
			return (unsigned long)v;
		/* Top of stack wasn't text: assume a one-word frame. */
		return ((unsigned long *)regs->rsp)[1];
	}
#else
	/* i386: frame pointers make the caller's return address reliable. */
	if (!user_mode_vm(regs) && in_lock_functions(pc))
		return *(unsigned long *)(regs->ebp + 4);
#endif

	return pc;
}
EXPORT_SYMBOL(profile_pc);
#endif
12507 | + | |
/*
 * This is the same as the above, except we _also_ save the current
 * Time Stamp Counter value at the time of the timer interrupt, so that
 * we later on can estimate the time of day more exactly.
 */
irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	s64 delta, delta_cpu, stolen, blocked;
	u64 sched_time;
	unsigned int i, cpu = smp_processor_id();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	/*
	 * Here we are in the timer irq handler. We just have irqs locally
	 * disabled but we don't know if the timer_bh is running on the other
	 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
	 * the irq version of write_lock because as just said we have irq
	 * locally disabled. -arca
	 */
	write_seqlock(&xtime_lock);

	do {
		get_time_values_from_xen(cpu);

		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
		delta = delta_cpu =
			shadow->system_timestamp + get_nsec_offset(shadow);
		delta -= processed_system_time;
		delta_cpu -= per_cpu(processed_system_time, cpu);

		/*
		 * Obtain a consistent snapshot of stolen/blocked cycles. We
		 * can use state_entry_time to detect if we get preempted here.
		 */
		do {
			sched_time = runstate->state_entry_time;
			barrier();
			stolen = runstate->time[RUNSTATE_runnable] +
				runstate->time[RUNSTATE_offline] -
				per_cpu(processed_stolen_time, cpu);
			blocked = runstate->time[RUNSTATE_blocked] -
				per_cpu(processed_blocked_time, cpu);
			barrier();
		} while (sched_time != runstate->state_entry_time);
	} while (!time_values_up_to_date(cpu));

	/* Rate-limited warning if time appears to have gone backwards. */
	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
	    && printk_ratelimit()) {
		printk("Timer ISR/%u: Time went backwards: "
		       "delta=%lld delta_cpu=%lld shadow=%lld "
		       "off=%lld processed=%lld cpu_processed=%lld\n",
		       cpu, delta, delta_cpu, shadow->system_timestamp,
		       (s64)get_nsec_offset(shadow),
		       processed_system_time,
		       per_cpu(processed_system_time, cpu));
		for (i = 0; i < num_online_cpus(); i++)
			printk(" %d: %lld\n", i,
			       per_cpu(processed_system_time, i));
	}

	/* System-wide jiffy work. */
	while (delta >= NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed_system_time += NS_PER_TICK;
		do_timer(regs);
	}

	/* Propagate any hypervisor wallclock change into xtime. */
	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
		update_wallclock();
		if (keventd_up())
			schedule_work(&clock_was_set_work);
	}

	write_sequnlock(&xtime_lock);

	/*
	 * Account stolen ticks.
	 * HACK: Passing NULL to account_steal_time()
	 * ensures that the ticks are accounted as stolen.
	 */
	if ((stolen > 0) && (delta_cpu > 0)) {
		delta_cpu -= stolen;
		if (unlikely(delta_cpu < 0))
			stolen += delta_cpu; /* clamp local-time progress */
		do_div(stolen, NS_PER_TICK);
		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
		account_steal_time(NULL, (cputime_t)stolen);
	}

	/*
	 * Account blocked ticks.
	 * HACK: Passing idle_task to account_steal_time()
	 * ensures that the ticks are accounted as idle/wait.
	 */
	if ((blocked > 0) && (delta_cpu > 0)) {
		delta_cpu -= blocked;
		if (unlikely(delta_cpu < 0))
			blocked += delta_cpu; /* clamp local-time progress */
		do_div(blocked, NS_PER_TICK);
		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
		per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
		account_steal_time(idle_task(cpu), (cputime_t)blocked);
	}

	/* Account user/system ticks. */
	if (delta_cpu > 0) {
		do_div(delta_cpu, NS_PER_TICK);
		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
		if (user_mode_vm(regs))
			account_user_time(current, (cputime_t)delta_cpu);
		else
			account_system_time(current, HARDIRQ_OFFSET,
					    (cputime_t)delta_cpu);
	}

	/* Offlined for more than a few seconds? Avoid lockup warnings. */
	if (stolen > 5*HZ)
		touch_softlockup_watchdog();

	/* Local timer processing (see update_process_times()). */
	run_local_timers();
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_mode_vm(regs));
	scheduler_tick();
	run_posix_cpu_timers(current);
	profile_tick(CPU_PROFILING, regs);

	return IRQ_HANDLED;
}
12640 | + | |
/*
 * Register @cpu's runstate area with Xen and snapshot the current
 * stolen/blocked totals so later deltas start from zero.
 */
static void init_missing_ticks_accounting(unsigned int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
	int rc;

	/* Zero first: if registration fails, the baselines below read 0. */
	memset(runstate, 0, sizeof(*runstate));

	area.addr.v = runstate;
	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
	/* -ENOSYS: hypervisor lacks the op; anything else is unexpected. */
	WARN_ON(rc && rc != -ENOSYS);

	per_cpu(processed_blocked_time, cpu) =
		runstate->time[RUNSTATE_blocked];
	per_cpu(processed_stolen_time, cpu) =
		runstate->time[RUNSTATE_runnable] +
		runstate->time[RUNSTATE_offline];
}
12659 | + | |
12660 | +/* not static: needed by APM */ | |
12661 | +unsigned long get_cmos_time(void) | |
12662 | +{ | |
12663 | + unsigned long retval; | |
12664 | + unsigned long flags; | |
12665 | + | |
12666 | + spin_lock_irqsave(&rtc_lock, flags); | |
12667 | + | |
12668 | + if (efi_enabled) | |
12669 | + retval = efi_get_time(); | |
12670 | + else | |
12671 | + retval = mach_get_cmos_time(); | |
12672 | + | |
12673 | + spin_unlock_irqrestore(&rtc_lock, flags); | |
12674 | + | |
12675 | + return retval; | |
12676 | +} | |
12677 | +EXPORT_SYMBOL(get_cmos_time); | |
12678 | + | |
static void sync_cmos_clock(unsigned long dummy);

static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

/* Timer callback: write the NTP-synced system time to the RTC near the
 * 500ms point of the second, then rearm (~11 minutes after a success). */
static void sync_cmos_clock(unsigned long dummy)
{
	struct timeval now, next;
	int fail = 1;

	/*
	 * If we have an externally synchronized Linux clock, then update
	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
	 * called as close as possible to 500 ms before the new second starts.
	 * This code is run on a timer. If the clock is set, that timer
	 * may not expire at the correct time. Thus, we adjust...
	 */
	if (!ntp_synced())
		/*
		 * Not synced, exit, do not restart a timer (if one is
		 * running, let it run out).
		 */
		return;

	do_gettimeofday(&now);
	/* Only write when we are inside the tolerance window around 500ms. */
	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
		fail = set_rtc_mmss(now.tv_sec);

	/* Aim the next firing at the 500ms point of some future second. */
	next.tv_usec = USEC_AFTER - now.tv_usec;
	if (next.tv_usec <= 0)
		next.tv_usec += USEC_PER_SEC;

	if (!fail)
		next.tv_sec = 659;	/* ~11 minutes on success */
	else
		next.tv_sec = 0;	/* retry next second on failure */

	if (next.tv_usec >= USEC_PER_SEC) {
		next.tv_sec++;
		next.tv_usec -= USEC_PER_SEC;
	}
	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}
12722 | + | |
/* Called when NTP attains sync: kick both the RTC and the Xen wallclock
 * sync timers to run on the next jiffy. */
void notify_arch_cmos_timer(void)
{
	mod_timer(&sync_cmos_timer, jiffies + 1);
	mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}
12728 | + | |
/* Sysdev resume hook: re-synchronize time state after suspend/resume. */
static int timer_resume(struct sys_device *dev)
{
	extern void time_resume(void);
	time_resume();
	return 0;
}
12735 | + | |
/* Sysdev plumbing so timer_resume() runs on resume from suspend. */
static struct sysdev_class timer_sysclass = {
	.resume = timer_resume,
	set_kset_name("timer"),
};


/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
	.id = 0,
	.cls = &timer_sysclass,
};
12747 | + | |
12748 | +static int time_init_device(void) | |
12749 | +{ | |
12750 | + int error = sysdev_class_register(&timer_sysclass); | |
12751 | + if (!error) | |
12752 | + error = sysdev_register(&device_timer); | |
12753 | + return error; | |
12754 | +} | |
12755 | + | |
12756 | +device_initcall(time_init_device); | |
12757 | + | |
#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
	/* Seed xtime from the RTC, biased by jiffies already elapsed. */
	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
	set_normalized_timespec(&wall_to_monotonic,
		-xtime.tv_sec, -xtime.tv_nsec);

	if ((hpet_enable() >= 0) && hpet_use_timer) {
		printk("Using HPET for base-timer\n");
	}

	time_init_hook();
}
#endif
12775 | + | |
/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);
/* Bind CPU0's VIRQ_TIMER event channel to timer_interrupt(); invoked via
 * late_time_init once the allocators are up. */
static void setup_cpu0_timer_irq(void)
{
	per_cpu(timer_irq, 0) =
		bind_virq_to_irqhandler(
			VIRQ_TIMER,
			0,
			timer_interrupt,
			SA_INTERRUPT,
			"timer0",
			NULL);
	BUG_ON(per_cpu(timer_irq, 0) < 0);
}

/* Request a periodic tick from Xen at the kernel's HZ rate. */
static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
	.period_ns = NS_PER_TICK
};
12796 | + | |
/* Boot-time clock initialization for the Xen paravirtual timer. */
void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
	if (is_hpet_capable()) {
		/*
		 * HPET initialization needs to do memory-mapped io. So, let
		 * us do a late initialization after mem_init().
		 */
		late_time_init = hpet_time_init;
		return;
	}
#endif

	/* Ask Xen for a HZ-rate periodic tick. */
	switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
				   &xen_set_periodic_tick)) {
	case 0:
#if CONFIG_XEN_COMPAT <= 0x030004
	case -ENOSYS:	/* tolerated: op may be absent on older hypervisors */
#endif
		break;
	default:
		BUG();
	}

	get_time_values_from_xen(0);

	/* Start global and CPU0 accounting from Xen's current system time. */
	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;
	init_missing_ticks_accounting(0);

	update_wallclock();

	init_cpu_khz();
	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
	       cpu_khz / 1000, cpu_khz % 1000);

#if defined(__x86_64__)
	vxtime.mode = VXTIME_TSC;
	vxtime.quot = (1000000L << 32) / vxtime_hz;
	vxtime.tsc_quot = (1000L << 32) / cpu_khz;
	sync_core();
	rdtscll(vxtime.last_tsc);
#endif

	/* Cannot request_irq() until kmem is initialised. */
	late_time_init = setup_cpu0_timer_irq;
}
12844 | + | |
/* Convert jiffies to system time. */
/* Result is in Xen system-time nanoseconds; 0 means "no timeout". */
u64 jiffies_to_st(unsigned long j)
{
	unsigned long seq;
	long delta;
	u64 st;

	/* Seqlock retry: processed_system_time and jiffies must agree. */
	do {
		seq = read_seqbegin(&xtime_lock);
		delta = j - jiffies;
		if (delta < 1) {
			/* Triggers in some wrap-around cases, but that's okay:
			 * we just end up with a shorter timeout. */
			st = processed_system_time + NS_PER_TICK;
		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
			/* Very long timeout means there is no pending timer.
			 * We indicate this to Xen by passing zero timeout. */
			st = 0;
		} else {
			st = processed_system_time + delta * (u64)NS_PER_TICK;
		}
	} while (read_seqretry(&xtime_lock, seq));

	return st;
}
EXPORT_SYMBOL(jiffies_to_st);
12871 | + | |
12872 | +/* | |
12873 | + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu | |
12874 | + * These functions are based on implementations from arch/s390/kernel/time.c | |
12875 | + */ | |
12876 | +static void stop_hz_timer(void) | |
12877 | +{ | |
12878 | + struct vcpu_set_singleshot_timer singleshot; | |
12879 | + unsigned int cpu = smp_processor_id(); | |
12880 | + unsigned long j; | |
12881 | + int rc; | |
12882 | + | |
12883 | + cpu_set(cpu, nohz_cpu_mask); | |
12884 | + | |
12885 | + /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */ | |
12886 | + /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */ | |
12887 | + /* value of rcp->cur that matches rdp->quiescbatch and allows us to */ | |
12888 | + /* stop the hz timer then the cpumasks created for subsequent values */ | |
12889 | + /* of cur in rcu_start_batch are guaranteed to pick up the updated */ | |
12890 | + /* nohz_cpu_mask and so will not depend on this cpu. */ | |
12891 | + | |
12892 | + smp_mb(); | |
12893 | + | |
12894 | + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */ | |
12895 | + if (rcu_needs_cpu(cpu) || local_softirq_pending() || | |
12896 | + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) { | |
12897 | + cpu_clear(cpu, nohz_cpu_mask); | |
12898 | + j = jiffies + 1; | |
12899 | + } | |
12900 | + | |
12901 | + singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2; | |
12902 | + singleshot.flags = 0; | |
12903 | + rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot); | |
12904 | +#if CONFIG_XEN_COMPAT <= 0x030004 | |
12905 | + if (rc) { | |
12906 | + BUG_ON(rc != -ENOSYS); | |
12907 | + rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns); | |
12908 | + } | |
12909 | +#endif | |
12910 | + BUG_ON(rc); | |
12911 | +} | |
12912 | + | |
12913 | +static void start_hz_timer(void) | |
12914 | +{ | |
12915 | + cpu_clear(smp_processor_id(), nohz_cpu_mask); | |
12916 | +} | |
12917 | + | |
12918 | +void raw_safe_halt(void) | |
12919 | +{ | |
12920 | + stop_hz_timer(); | |
12921 | + /* Blocking includes an implicit local_irq_enable(). */ | |
12922 | + HYPERVISOR_block(); | |
12923 | + start_hz_timer(); | |
12924 | +} | |
12925 | +EXPORT_SYMBOL(raw_safe_halt); | |
12926 | + | |
12927 | +void halt(void) | |
12928 | +{ | |
12929 | + if (irqs_disabled()) | |
12930 | + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); | |
12931 | +} | |
12932 | +EXPORT_SYMBOL(halt); | |
12933 | + | |
12934 | +/* No locking required. Interrupts are disabled on all CPUs. */ | |
12935 | +void time_resume(void) | |
12936 | +{ | |
12937 | + unsigned int cpu; | |
12938 | + | |
12939 | + init_cpu_khz(); | |
12940 | + | |
12941 | + for_each_online_cpu(cpu) { | |
12942 | + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, | |
12943 | + &xen_set_periodic_tick)) { | |
12944 | + case 0: | |
12945 | +#if CONFIG_XEN_COMPAT <= 0x030004 | |
12946 | + case -ENOSYS: | |
12947 | +#endif | |
12948 | + break; | |
12949 | + default: | |
12950 | + BUG(); | |
12951 | + } | |
12952 | + get_time_values_from_xen(cpu); | |
12953 | + per_cpu(processed_system_time, cpu) = | |
12954 | + per_cpu(shadow_time, 0).system_timestamp; | |
12955 | + init_missing_ticks_accounting(cpu); | |
12956 | + } | |
12957 | + | |
12958 | + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; | |
12959 | + | |
12960 | + update_wallclock(); | |
12961 | +} | |
12962 | + | |
12963 | +#ifdef CONFIG_SMP | |
12964 | +static char timer_name[NR_CPUS][15]; | |
12965 | + | |
12966 | +int __cpuinit local_setup_timer(unsigned int cpu) | |
12967 | +{ | |
12968 | + int seq, irq; | |
12969 | + | |
12970 | + BUG_ON(cpu == 0); | |
12971 | + | |
12972 | + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, | |
12973 | + &xen_set_periodic_tick)) { | |
12974 | + case 0: | |
12975 | +#if CONFIG_XEN_COMPAT <= 0x030004 | |
12976 | + case -ENOSYS: | |
12977 | +#endif | |
12978 | + break; | |
12979 | + default: | |
12980 | + BUG(); | |
12981 | + } | |
12982 | + | |
12983 | + do { | |
12984 | + seq = read_seqbegin(&xtime_lock); | |
12985 | + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */ | |
12986 | + per_cpu(processed_system_time, cpu) = | |
12987 | + per_cpu(shadow_time, 0).system_timestamp; | |
12988 | + init_missing_ticks_accounting(cpu); | |
12989 | + } while (read_seqretry(&xtime_lock, seq)); | |
12990 | + | |
12991 | + sprintf(timer_name[cpu], "timer%u", cpu); | |
12992 | + irq = bind_virq_to_irqhandler(VIRQ_TIMER, | |
12993 | + cpu, | |
12994 | + timer_interrupt, | |
12995 | + SA_INTERRUPT, | |
12996 | + timer_name[cpu], | |
12997 | + NULL); | |
12998 | + if (irq < 0) | |
12999 | + return irq; | |
13000 | + per_cpu(timer_irq, cpu) = irq; | |
13001 | + | |
13002 | + return 0; | |
13003 | +} | |
13004 | + | |
13005 | +void __cpuexit local_teardown_timer(unsigned int cpu) | |
13006 | +{ | |
13007 | + BUG_ON(cpu == 0); | |
13008 | + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); | |
13009 | +} | |
13010 | +#endif | |
13011 | + | |
13012 | +#ifdef CONFIG_CPU_FREQ | |
13013 | +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | |
13014 | + void *data) | |
13015 | +{ | |
13016 | + struct cpufreq_freqs *freq = data; | |
13017 | + struct xen_platform_op op; | |
13018 | + | |
13019 | + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) | |
13020 | + return 0; | |
13021 | + | |
13022 | + if (val == CPUFREQ_PRECHANGE) | |
13023 | + return 0; | |
13024 | + | |
13025 | + op.cmd = XENPF_change_freq; | |
13026 | + op.u.change_freq.flags = 0; | |
13027 | + op.u.change_freq.cpu = freq->cpu; | |
13028 | + op.u.change_freq.freq = (u64)freq->new * 1000; | |
13029 | + WARN_ON(HYPERVISOR_platform_op(&op)); | |
13030 | + | |
13031 | + return 0; | |
13032 | +} | |
13033 | + | |
13034 | +static struct notifier_block time_cpufreq_notifier_block = { | |
13035 | + .notifier_call = time_cpufreq_notifier | |
13036 | +}; | |
13037 | + | |
13038 | +static int __init cpufreq_time_setup(void) | |
13039 | +{ | |
13040 | + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, | |
13041 | + CPUFREQ_TRANSITION_NOTIFIER)) { | |
13042 | + printk(KERN_ERR "failed to set up cpufreq notifier\n"); | |
13043 | + return -ENODEV; | |
13044 | + } | |
13045 | + return 0; | |
13046 | +} | |
13047 | + | |
13048 | +core_initcall(cpufreq_time_setup); | |
13049 | +#endif | |
13050 | + | |
13051 | +/* | |
13052 | + * /proc/sys/xen: This really belongs in another file. It can stay here for | |
13053 | + * now however. | |
13054 | + */ | |
13055 | +static ctl_table xen_subtable[] = { | |
13056 | + { | |
13057 | + .ctl_name = 1, | |
13058 | + .procname = "independent_wallclock", | |
13059 | + .data = &independent_wallclock, | |
13060 | + .maxlen = sizeof(independent_wallclock), | |
13061 | + .mode = 0644, | |
13062 | + .proc_handler = proc_dointvec | |
13063 | + }, | |
13064 | + { | |
13065 | + .ctl_name = 2, | |
13066 | + .procname = "permitted_clock_jitter", | |
13067 | + .data = &permitted_clock_jitter, | |
13068 | + .maxlen = sizeof(permitted_clock_jitter), | |
13069 | + .mode = 0644, | |
13070 | + .proc_handler = proc_doulongvec_minmax | |
13071 | + }, | |
13072 | + { 0 } | |
13073 | +}; | |
13074 | +static ctl_table xen_table[] = { | |
13075 | + { | |
13076 | + .ctl_name = 123, | |
13077 | + .procname = "xen", | |
13078 | + .mode = 0555, | |
13079 | + .child = xen_subtable}, | |
13080 | + { 0 } | |
13081 | +}; | |
13082 | +static int __init xen_sysctl_init(void) | |
13083 | +{ | |
13084 | + (void)register_sysctl_table(xen_table, 0); | |
13085 | + return 0; | |
13086 | +} | |
13087 | +__initcall(xen_sysctl_init); | |
13088 | Index: head-2008-11-25/arch/x86/kernel/traps_32-xen.c | |
13089 | =================================================================== | |
13090 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
13091 | +++ head-2008-11-25/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200 | |
13092 | @@ -0,0 +1,1190 @@ | |
13093 | +/* | |
13094 | + * linux/arch/i386/traps.c | |
13095 | + * | |
13096 | + * Copyright (C) 1991, 1992 Linus Torvalds | |
13097 | + * | |
13098 | + * Pentium III FXSR, SSE support | |
13099 | + * Gareth Hughes <gareth@valinux.com>, May 2000 | |
13100 | + */ | |
13101 | + | |
13102 | +/* | |
13103 | + * 'Traps.c' handles hardware traps and faults after we have saved some | |
13104 | + * state in 'asm.s'. | |
13105 | + */ | |
13106 | +#include <linux/sched.h> | |
13107 | +#include <linux/kernel.h> | |
13108 | +#include <linux/string.h> | |
13109 | +#include <linux/errno.h> | |
13110 | +#include <linux/timer.h> | |
13111 | +#include <linux/mm.h> | |
13112 | +#include <linux/init.h> | |
13113 | +#include <linux/delay.h> | |
13114 | +#include <linux/spinlock.h> | |
13115 | +#include <linux/interrupt.h> | |
13116 | +#include <linux/highmem.h> | |
13117 | +#include <linux/kallsyms.h> | |
13118 | +#include <linux/ptrace.h> | |
13119 | +#include <linux/utsname.h> | |
13120 | +#include <linux/kprobes.h> | |
13121 | +#include <linux/kexec.h> | |
13122 | +#include <linux/unwind.h> | |
13123 | + | |
13124 | +#ifdef CONFIG_EISA | |
13125 | +#include <linux/ioport.h> | |
13126 | +#include <linux/eisa.h> | |
13127 | +#endif | |
13128 | + | |
13129 | +#ifdef CONFIG_MCA | |
13130 | +#include <linux/mca.h> | |
13131 | +#endif | |
13132 | + | |
13133 | +#include <asm/processor.h> | |
13134 | +#include <asm/system.h> | |
13135 | +#include <asm/uaccess.h> | |
13136 | +#include <asm/io.h> | |
13137 | +#include <asm/atomic.h> | |
13138 | +#include <asm/debugreg.h> | |
13139 | +#include <asm/desc.h> | |
13140 | +#include <asm/i387.h> | |
13141 | +#include <asm/nmi.h> | |
13142 | +#include <asm/unwind.h> | |
13143 | +#include <asm/smp.h> | |
13144 | +#include <asm/arch_hooks.h> | |
13145 | +#include <asm/kdebug.h> | |
13146 | + | |
13147 | +#include <linux/module.h> | |
13148 | + | |
13149 | +#include "mach_traps.h" | |
13150 | + | |
13151 | +asmlinkage int system_call(void); | |
13152 | + | |
13153 | +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, | |
13154 | + { 0, 0 }, { 0, 0 } }; | |
13155 | + | |
13156 | +/* Do we ignore FPU interrupts ? */ | |
13157 | +char ignore_fpu_irq = 0; | |
13158 | + | |
13159 | +#ifndef CONFIG_X86_NO_IDT | |
13160 | +/* | |
13161 | + * The IDT has to be page-aligned to simplify the Pentium | |
13162 | + * F0 0F bug workaround.. We have a special link segment | |
13163 | + * for this. | |
13164 | + */ | |
13165 | +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; | |
13166 | +#endif | |
13167 | + | |
13168 | +asmlinkage void divide_error(void); | |
13169 | +asmlinkage void debug(void); | |
13170 | +asmlinkage void nmi(void); | |
13171 | +asmlinkage void int3(void); | |
13172 | +asmlinkage void overflow(void); | |
13173 | +asmlinkage void bounds(void); | |
13174 | +asmlinkage void invalid_op(void); | |
13175 | +asmlinkage void device_not_available(void); | |
13176 | +asmlinkage void coprocessor_segment_overrun(void); | |
13177 | +asmlinkage void invalid_TSS(void); | |
13178 | +asmlinkage void segment_not_present(void); | |
13179 | +asmlinkage void stack_segment(void); | |
13180 | +asmlinkage void general_protection(void); | |
13181 | +asmlinkage void page_fault(void); | |
13182 | +asmlinkage void coprocessor_error(void); | |
13183 | +asmlinkage void simd_coprocessor_error(void); | |
13184 | +asmlinkage void alignment_check(void); | |
13185 | +#ifndef CONFIG_XEN | |
13186 | +asmlinkage void spurious_interrupt_bug(void); | |
13187 | +#else | |
13188 | +asmlinkage void fixup_4gb_segment(void); | |
13189 | +#endif | |
13190 | +asmlinkage void machine_check(void); | |
13191 | + | |
13192 | +static int kstack_depth_to_print = 24; | |
13193 | +#ifdef CONFIG_STACK_UNWIND | |
13194 | +static int call_trace = 1; | |
13195 | +#else | |
13196 | +#define call_trace (-1) | |
13197 | +#endif | |
13198 | +ATOMIC_NOTIFIER_HEAD(i386die_chain); | |
13199 | + | |
13200 | +int register_die_notifier(struct notifier_block *nb) | |
13201 | +{ | |
13202 | + vmalloc_sync_all(); | |
13203 | + return atomic_notifier_chain_register(&i386die_chain, nb); | |
13204 | +} | |
13205 | +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ | |
13206 | + | |
13207 | +int unregister_die_notifier(struct notifier_block *nb) | |
13208 | +{ | |
13209 | + return atomic_notifier_chain_unregister(&i386die_chain, nb); | |
13210 | +} | |
13211 | +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ | |
13212 | + | |
13213 | +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) | |
13214 | +{ | |
13215 | + return p > (void *)tinfo && | |
13216 | + p < (void *)tinfo + THREAD_SIZE - 3; | |
13217 | +} | |
13218 | + | |
13219 | +/* | |
13220 | + * Print one address/symbol entries per line. | |
13221 | + */ | |
13222 | +static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl) | |
13223 | +{ | |
13224 | + printk(" [<%08lx>] ", addr); | |
13225 | + | |
13226 | + print_symbol("%s\n", addr); | |
13227 | +} | |
13228 | + | |
13229 | +static inline unsigned long print_context_stack(struct thread_info *tinfo, | |
13230 | + unsigned long *stack, unsigned long ebp, | |
13231 | + char *log_lvl) | |
13232 | +{ | |
13233 | + unsigned long addr; | |
13234 | + | |
13235 | +#ifdef CONFIG_FRAME_POINTER | |
13236 | + while (valid_stack_ptr(tinfo, (void *)ebp)) { | |
13237 | + addr = *(unsigned long *)(ebp + 4); | |
13238 | + print_addr_and_symbol(addr, log_lvl); | |
13239 | + /* | |
13240 | + * break out of recursive entries (such as | |
13241 | + * end_of_stack_stop_unwind_function): | |
13242 | + */ | |
13243 | + if (ebp == *(unsigned long *)ebp) | |
13244 | + break; | |
13245 | + ebp = *(unsigned long *)ebp; | |
13246 | + } | |
13247 | +#else | |
13248 | + while (valid_stack_ptr(tinfo, stack)) { | |
13249 | + addr = *stack++; | |
13250 | + if (__kernel_text_address(addr)) | |
13251 | + print_addr_and_symbol(addr, log_lvl); | |
13252 | + } | |
13253 | +#endif | |
13254 | + return ebp; | |
13255 | +} | |
13256 | + | |
13257 | +static asmlinkage int | |
13258 | +show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) | |
13259 | +{ | |
13260 | + int n = 0; | |
13261 | + | |
13262 | + while (unwind(info) == 0 && UNW_PC(info)) { | |
13263 | + n++; | |
13264 | + print_addr_and_symbol(UNW_PC(info), log_lvl); | |
13265 | + if (arch_unw_user_mode(info)) | |
13266 | + break; | |
13267 | + } | |
13268 | + return n; | |
13269 | +} | |
13270 | + | |
13271 | +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | |
13272 | + unsigned long *stack, char *log_lvl) | |
13273 | +{ | |
13274 | + unsigned long ebp; | |
13275 | + | |
13276 | + if (!task) | |
13277 | + task = current; | |
13278 | + | |
13279 | + if (call_trace >= 0) { | |
13280 | + int unw_ret = 0; | |
13281 | + struct unwind_frame_info info; | |
13282 | + | |
13283 | + if (regs) { | |
13284 | + if (unwind_init_frame_info(&info, task, regs) == 0) | |
13285 | + unw_ret = show_trace_unwind(&info, log_lvl); | |
13286 | + } else if (task == current) | |
13287 | + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); | |
13288 | + else { | |
13289 | + if (unwind_init_blocked(&info, task) == 0) | |
13290 | + unw_ret = show_trace_unwind(&info, log_lvl); | |
13291 | + } | |
13292 | + if (unw_ret > 0) { | |
13293 | + if (call_trace == 1 && !arch_unw_user_mode(&info)) { | |
13294 | + print_symbol("DWARF2 unwinder stuck at %s\n", | |
13295 | + UNW_PC(&info)); | |
13296 | + if (UNW_SP(&info) >= PAGE_OFFSET) { | |
13297 | + printk("Leftover inexact backtrace:\n"); | |
13298 | + stack = (void *)UNW_SP(&info); | |
13299 | + } else | |
13300 | + printk("Full inexact backtrace again:\n"); | |
13301 | + } else if (call_trace >= 1) | |
13302 | + return; | |
13303 | + else | |
13304 | + printk("Full inexact backtrace again:\n"); | |
13305 | + } else | |
13306 | + printk("Inexact backtrace:\n"); | |
13307 | + } | |
13308 | + | |
13309 | + if (task == current) { | |
13310 | + /* Grab ebp right from our regs */ | |
13311 | + asm ("movl %%ebp, %0" : "=r" (ebp) : ); | |
13312 | + } else { | |
13313 | + /* ebp is the last reg pushed by switch_to */ | |
13314 | + ebp = *(unsigned long *) task->thread.esp; | |
13315 | + } | |
13316 | + | |
13317 | + while (1) { | |
13318 | + struct thread_info *context; | |
13319 | + context = (struct thread_info *) | |
13320 | + ((unsigned long)stack & (~(THREAD_SIZE - 1))); | |
13321 | + ebp = print_context_stack(context, stack, ebp, log_lvl); | |
13322 | + stack = (unsigned long*)context->previous_esp; | |
13323 | + if (!stack) | |
13324 | + break; | |
13325 | + printk("%s =======================\n", log_lvl); | |
13326 | + } | |
13327 | +} | |
13328 | + | |
13329 | +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) | |
13330 | +{ | |
13331 | + show_trace_log_lvl(task, regs, stack, ""); | |
13332 | +} | |
13333 | + | |
13334 | +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |
13335 | + unsigned long *esp, char *log_lvl) | |
13336 | +{ | |
13337 | + unsigned long *stack; | |
13338 | + int i; | |
13339 | + | |
13340 | + if (esp == NULL) { | |
13341 | + if (task) | |
13342 | + esp = (unsigned long*)task->thread.esp; | |
13343 | + else | |
13344 | + esp = (unsigned long *)&esp; | |
13345 | + } | |
13346 | + | |
13347 | + stack = esp; | |
13348 | + for(i = 0; i < kstack_depth_to_print; i++) { | |
13349 | + if (kstack_end(stack)) | |
13350 | + break; | |
13351 | + if (i && ((i % 8) == 0)) | |
13352 | + printk("\n%s ", log_lvl); | |
13353 | + printk("%08lx ", *stack++); | |
13354 | + } | |
13355 | + printk("\n%sCall Trace:\n", log_lvl); | |
13356 | + show_trace_log_lvl(task, regs, esp, log_lvl); | |
13357 | +} | |
13358 | + | |
13359 | +void show_stack(struct task_struct *task, unsigned long *esp) | |
13360 | +{ | |
13361 | + printk(" "); | |
13362 | + show_stack_log_lvl(task, NULL, esp, ""); | |
13363 | +} | |
13364 | + | |
13365 | +/* | |
13366 | + * The architecture-independent dump_stack generator | |
13367 | + */ | |
13368 | +void dump_stack(void) | |
13369 | +{ | |
13370 | + unsigned long stack; | |
13371 | + | |
13372 | + show_trace(current, NULL, &stack); | |
13373 | +} | |
13374 | + | |
13375 | +EXPORT_SYMBOL(dump_stack); | |
13376 | + | |
13377 | +void show_registers(struct pt_regs *regs) | |
13378 | +{ | |
13379 | + int i; | |
13380 | + int in_kernel = 1; | |
13381 | + unsigned long esp; | |
13382 | + unsigned short ss; | |
13383 | + | |
13384 | + esp = (unsigned long) (®s->esp); | |
13385 | + savesegment(ss, ss); | |
13386 | + if (user_mode_vm(regs)) { | |
13387 | + in_kernel = 0; | |
13388 | + esp = regs->esp; | |
13389 | + ss = regs->xss & 0xffff; | |
13390 | + } | |
13391 | + print_modules(); | |
13392 | + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" | |
13393 | + "EFLAGS: %08lx (%s %.*s) \n", | |
13394 | + smp_processor_id(), 0xffff & regs->xcs, regs->eip, | |
13395 | + print_tainted(), regs->eflags, system_utsname.release, | |
13396 | + (int)strcspn(system_utsname.version, " "), | |
13397 | + system_utsname.version); | |
13398 | + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); | |
13399 | + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", | |
13400 | + regs->eax, regs->ebx, regs->ecx, regs->edx); | |
13401 | + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", | |
13402 | + regs->esi, regs->edi, regs->ebp, esp); | |
13403 | + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", | |
13404 | + regs->xds & 0xffff, regs->xes & 0xffff, ss); | |
13405 | + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", | |
13406 | + TASK_COMM_LEN, current->comm, current->pid, | |
13407 | + current_thread_info(), current, current->thread_info); | |
13408 | + /* | |
13409 | + * When in-kernel, we also print out the stack and code at the | |
13410 | + * time of the fault.. | |
13411 | + */ | |
13412 | + if (in_kernel) { | |
13413 | + u8 __user *eip; | |
13414 | + | |
13415 | + printk("\n" KERN_EMERG "Stack: "); | |
13416 | + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); | |
13417 | + | |
13418 | + printk(KERN_EMERG "Code: "); | |
13419 | + | |
13420 | + eip = (u8 __user *)regs->eip - 43; | |
13421 | + for (i = 0; i < 64; i++, eip++) { | |
13422 | + unsigned char c; | |
13423 | + | |
13424 | + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { | |
13425 | + printk(" Bad EIP value."); | |
13426 | + break; | |
13427 | + } | |
13428 | + if (eip == (u8 __user *)regs->eip) | |
13429 | + printk("<%02x> ", c); | |
13430 | + else | |
13431 | + printk("%02x ", c); | |
13432 | + } | |
13433 | + } | |
13434 | + printk("\n"); | |
13435 | +} | |
13436 | + | |
13437 | +static void handle_BUG(struct pt_regs *regs) | |
13438 | +{ | |
13439 | + unsigned long eip = regs->eip; | |
13440 | + unsigned short ud2; | |
13441 | + | |
13442 | + if (eip < PAGE_OFFSET) | |
13443 | + return; | |
13444 | + if (__get_user(ud2, (unsigned short __user *)eip)) | |
13445 | + return; | |
13446 | + if (ud2 != 0x0b0f) | |
13447 | + return; | |
13448 | + | |
13449 | + printk(KERN_EMERG "------------[ cut here ]------------\n"); | |
13450 | + | |
13451 | +#ifdef CONFIG_DEBUG_BUGVERBOSE | |
13452 | + do { | |
13453 | + unsigned short line; | |
13454 | + char *file; | |
13455 | + char c; | |
13456 | + | |
13457 | + if (__get_user(line, (unsigned short __user *)(eip + 2))) | |
13458 | + break; | |
13459 | + if (__get_user(file, (char * __user *)(eip + 4)) || | |
13460 | + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) | |
13461 | + file = "<bad filename>"; | |
13462 | + | |
13463 | + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); | |
13464 | + return; | |
13465 | + } while (0); | |
13466 | +#endif | |
13467 | + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); | |
13468 | +} | |
13469 | + | |
13470 | +/* This is gone through when something in the kernel | |
13471 | + * has done something bad and is about to be terminated. | |
13472 | +*/ | |
13473 | +void die(const char * str, struct pt_regs * regs, long err) | |
13474 | +{ | |
13475 | + static struct { | |
13476 | + spinlock_t lock; | |
13477 | + u32 lock_owner; | |
13478 | + int lock_owner_depth; | |
13479 | + } die = { | |
13480 | + .lock = SPIN_LOCK_UNLOCKED, | |
13481 | + .lock_owner = -1, | |
13482 | + .lock_owner_depth = 0 | |
13483 | + }; | |
13484 | + static int die_counter; | |
13485 | + unsigned long flags; | |
13486 | + | |
13487 | + oops_enter(); | |
13488 | + | |
13489 | + if (die.lock_owner != raw_smp_processor_id()) { | |
13490 | + console_verbose(); | |
13491 | + spin_lock_irqsave(&die.lock, flags); | |
13492 | + die.lock_owner = smp_processor_id(); | |
13493 | + die.lock_owner_depth = 0; | |
13494 | + bust_spinlocks(1); | |
13495 | + } | |
13496 | + else | |
13497 | + local_save_flags(flags); | |
13498 | + | |
13499 | + if (++die.lock_owner_depth < 3) { | |
13500 | + int nl = 0; | |
13501 | + unsigned long esp; | |
13502 | + unsigned short ss; | |
13503 | + | |
13504 | + handle_BUG(regs); | |
13505 | + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); | |
13506 | +#ifdef CONFIG_PREEMPT | |
13507 | + printk(KERN_EMERG "PREEMPT "); | |
13508 | + nl = 1; | |
13509 | +#endif | |
13510 | +#ifdef CONFIG_SMP | |
13511 | + if (!nl) | |
13512 | + printk(KERN_EMERG); | |
13513 | + printk("SMP "); | |
13514 | + nl = 1; | |
13515 | +#endif | |
13516 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
13517 | + if (!nl) | |
13518 | + printk(KERN_EMERG); | |
13519 | + printk("DEBUG_PAGEALLOC"); | |
13520 | + nl = 1; | |
13521 | +#endif | |
13522 | + if (nl) | |
13523 | + printk("\n"); | |
13524 | + if (notify_die(DIE_OOPS, str, regs, err, | |
13525 | + current->thread.trap_no, SIGSEGV) != | |
13526 | + NOTIFY_STOP) { | |
13527 | + show_registers(regs); | |
13528 | + /* Executive summary in case the oops scrolled away */ | |
13529 | + esp = (unsigned long) (®s->esp); | |
13530 | + savesegment(ss, ss); | |
13531 | + if (user_mode(regs)) { | |
13532 | + esp = regs->esp; | |
13533 | + ss = regs->xss & 0xffff; | |
13534 | + } | |
13535 | + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); | |
13536 | + print_symbol("%s", regs->eip); | |
13537 | + printk(" SS:ESP %04x:%08lx\n", ss, esp); | |
13538 | + } | |
13539 | + else | |
13540 | + regs = NULL; | |
13541 | + } else | |
13542 | + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | |
13543 | + | |
13544 | + bust_spinlocks(0); | |
13545 | + die.lock_owner = -1; | |
13546 | + spin_unlock_irqrestore(&die.lock, flags); | |
13547 | + | |
13548 | + if (!regs) | |
13549 | + return; | |
13550 | + | |
13551 | + if (kexec_should_crash(current)) | |
13552 | + crash_kexec(regs); | |
13553 | + | |
13554 | + if (in_interrupt()) | |
13555 | + panic("Fatal exception in interrupt"); | |
13556 | + | |
13557 | + if (panic_on_oops) | |
13558 | + panic("Fatal exception"); | |
13559 | + | |
13560 | + oops_exit(); | |
13561 | + do_exit(SIGSEGV); | |
13562 | +} | |
13563 | + | |
13564 | +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) | |
13565 | +{ | |
13566 | + if (!user_mode_vm(regs)) | |
13567 | + die(str, regs, err); | |
13568 | +} | |
13569 | + | |
13570 | +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, | |
13571 | + struct pt_regs * regs, long error_code, | |
13572 | + siginfo_t *info) | |
13573 | +{ | |
13574 | + struct task_struct *tsk = current; | |
13575 | + tsk->thread.error_code = error_code; | |
13576 | + tsk->thread.trap_no = trapnr; | |
13577 | + | |
13578 | + if (regs->eflags & VM_MASK) { | |
13579 | + if (vm86) | |
13580 | + goto vm86_trap; | |
13581 | + goto trap_signal; | |
13582 | + } | |
13583 | + | |
13584 | + if (!user_mode(regs)) | |
13585 | + goto kernel_trap; | |
13586 | + | |
13587 | + trap_signal: { | |
13588 | + if (info) | |
13589 | + force_sig_info(signr, info, tsk); | |
13590 | + else | |
13591 | + force_sig(signr, tsk); | |
13592 | + return; | |
13593 | + } | |
13594 | + | |
13595 | + kernel_trap: { | |
13596 | + if (!fixup_exception(regs)) | |
13597 | + die(str, regs, error_code); | |
13598 | + return; | |
13599 | + } | |
13600 | + | |
13601 | + vm86_trap: { | |
13602 | + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); | |
13603 | + if (ret) goto trap_signal; | |
13604 | + return; | |
13605 | + } | |
13606 | +} | |
13607 | + | |
13608 | +#define DO_ERROR(trapnr, signr, str, name) \ | |
13609 | +fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
13610 | +{ \ | |
13611 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
13612 | + == NOTIFY_STOP) \ | |
13613 | + return; \ | |
13614 | + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ | |
13615 | +} | |
13616 | + | |
13617 | +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | |
13618 | +fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
13619 | +{ \ | |
13620 | + siginfo_t info; \ | |
13621 | + info.si_signo = signr; \ | |
13622 | + info.si_errno = 0; \ | |
13623 | + info.si_code = sicode; \ | |
13624 | + info.si_addr = (void __user *)siaddr; \ | |
13625 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
13626 | + == NOTIFY_STOP) \ | |
13627 | + return; \ | |
13628 | + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ | |
13629 | +} | |
13630 | + | |
13631 | +#define DO_VM86_ERROR(trapnr, signr, str, name) \ | |
13632 | +fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
13633 | +{ \ | |
13634 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
13635 | + == NOTIFY_STOP) \ | |
13636 | + return; \ | |
13637 | + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ | |
13638 | +} | |
13639 | + | |
13640 | +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | |
13641 | +fastcall void do_##name(struct pt_regs * regs, long error_code) \ | |
13642 | +{ \ | |
13643 | + siginfo_t info; \ | |
13644 | + info.si_signo = signr; \ | |
13645 | + info.si_errno = 0; \ | |
13646 | + info.si_code = sicode; \ | |
13647 | + info.si_addr = (void __user *)siaddr; \ | |
13648 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
13649 | + == NOTIFY_STOP) \ | |
13650 | + return; \ | |
13651 | + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ | |
13652 | +} | |
13653 | + | |
13654 | +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) | |
13655 | +#ifndef CONFIG_KPROBES | |
13656 | +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) | |
13657 | +#endif | |
13658 | +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) | |
13659 | +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) | |
13660 | +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) | |
13661 | +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | |
13662 | +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | |
13663 | +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | |
13664 | +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) | |
13665 | +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | |
13666 | +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) | |
13667 | + | |
13668 | +fastcall void __kprobes do_general_protection(struct pt_regs * regs, | |
13669 | + long error_code) | |
13670 | +{ | |
13671 | + current->thread.error_code = error_code; | |
13672 | + current->thread.trap_no = 13; | |
13673 | + | |
13674 | + if (regs->eflags & VM_MASK) | |
13675 | + goto gp_in_vm86; | |
13676 | + | |
13677 | + if (!user_mode(regs)) | |
13678 | + goto gp_in_kernel; | |
13679 | + | |
13680 | + current->thread.error_code = error_code; | |
13681 | + current->thread.trap_no = 13; | |
13682 | + force_sig(SIGSEGV, current); | |
13683 | + return; | |
13684 | + | |
13685 | +gp_in_vm86: | |
13686 | + local_irq_enable(); | |
13687 | + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | |
13688 | + return; | |
13689 | + | |
13690 | +gp_in_kernel: | |
13691 | + if (!fixup_exception(regs)) { | |
13692 | + if (notify_die(DIE_GPF, "general protection fault", regs, | |
13693 | + error_code, 13, SIGSEGV) == NOTIFY_STOP) | |
13694 | + return; | |
13695 | + die("general protection fault", regs, error_code); | |
13696 | + } | |
13697 | +} | |
13698 | + | |
13699 | +static void mem_parity_error(unsigned char reason, struct pt_regs * regs) | |
13700 | +{ | |
13701 | + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " | |
13702 | + "to continue\n"); | |
13703 | + printk(KERN_EMERG "You probably have a hardware problem with your RAM " | |
13704 | + "chips\n"); | |
13705 | + | |
13706 | + /* Clear and disable the memory parity error line. */ | |
13707 | + clear_mem_error(reason); | |
13708 | +} | |
13709 | + | |
13710 | +static void io_check_error(unsigned char reason, struct pt_regs * regs) | |
13711 | +{ | |
13712 | + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); | |
13713 | + show_registers(regs); | |
13714 | + | |
13715 | + /* Re-enable the IOCK line, wait for a few seconds */ | |
13716 | + clear_io_check_error(reason); | |
13717 | +} | |
13718 | + | |
13719 | +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | |
13720 | +{ | |
13721 | +#ifdef CONFIG_MCA | |
13722 | + /* Might actually be able to figure out what the guilty party | |
13723 | + * is. */ | |
13724 | + if( MCA_bus ) { | |
13725 | + mca_handle_nmi(); | |
13726 | + return; | |
13727 | + } | |
13728 | +#endif | |
13729 | + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", | |
13730 | + reason, smp_processor_id()); | |
13731 | + printk("Dazed and confused, but trying to continue\n"); | |
13732 | + printk("Do you have a strange power saving mode enabled?\n"); | |
13733 | +} | |
13734 | + | |
13735 | +static DEFINE_SPINLOCK(nmi_print_lock); | |
13736 | + | |
13737 | +void die_nmi (struct pt_regs *regs, const char *msg) | |
13738 | +{ | |
13739 | + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == | |
13740 | + NOTIFY_STOP) | |
13741 | + return; | |
13742 | + | |
13743 | + spin_lock(&nmi_print_lock); | |
13744 | + /* | |
13745 | + * We are in trouble anyway, lets at least try | |
13746 | + * to get a message out. | |
13747 | + */ | |
13748 | + bust_spinlocks(1); | |
13749 | + printk(KERN_EMERG "%s", msg); | |
13750 | + printk(" on CPU%d, eip %08lx, registers:\n", | |
13751 | + smp_processor_id(), regs->eip); | |
13752 | + show_registers(regs); | |
13753 | + printk(KERN_EMERG "console shuts up ...\n"); | |
13754 | + console_silent(); | |
13755 | + spin_unlock(&nmi_print_lock); | |
13756 | + bust_spinlocks(0); | |
13757 | + | |
13758 | + /* If we are in kernel we are probably nested up pretty bad | |
13759 | + * and might aswell get out now while we still can. | |
13760 | + */ | |
13761 | + if (!user_mode_vm(regs)) { | |
13762 | + current->thread.trap_no = 2; | |
13763 | + crash_kexec(regs); | |
13764 | + } | |
13765 | + | |
13766 | + do_exit(SIGSEGV); | |
13767 | +} | |
13768 | + | |
13769 | +static void default_do_nmi(struct pt_regs * regs) | |
13770 | +{ | |
13771 | + unsigned char reason = 0; | |
13772 | + | |
13773 | + /* Only the BSP gets external NMIs from the system. */ | |
13774 | + if (!smp_processor_id()) | |
13775 | + reason = get_nmi_reason(); | |
13776 | + | |
13777 | + if (!(reason & 0xc0)) { | |
13778 | + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | |
13779 | + == NOTIFY_STOP) | |
13780 | + return; | |
13781 | +#ifdef CONFIG_X86_LOCAL_APIC | |
13782 | + /* | |
13783 | + * Ok, so this is none of the documented NMI sources, | |
13784 | + * so it must be the NMI watchdog. | |
13785 | + */ | |
13786 | + if (nmi_watchdog) { | |
13787 | + nmi_watchdog_tick(regs); | |
13788 | + return; | |
13789 | + } | |
13790 | +#endif | |
13791 | + unknown_nmi_error(reason, regs); | |
13792 | + return; | |
13793 | + } | |
13794 | + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | |
13795 | + return; | |
13796 | + if (reason & 0x80) | |
13797 | + mem_parity_error(reason, regs); | |
13798 | + if (reason & 0x40) | |
13799 | + io_check_error(reason, regs); | |
13800 | + /* | |
13801 | + * Reassert NMI in case it became active meanwhile | |
13802 | + * as it's edge-triggered. | |
13803 | + */ | |
13804 | + reassert_nmi(); | |
13805 | +} | |
13806 | + | |
13807 | +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) | |
13808 | +{ | |
13809 | + return 0; | |
13810 | +} | |
13811 | + | |
13812 | +static nmi_callback_t nmi_callback = dummy_nmi_callback; | |
13813 | + | |
13814 | +fastcall void do_nmi(struct pt_regs * regs, long error_code) | |
13815 | +{ | |
13816 | + int cpu; | |
13817 | + | |
13818 | + nmi_enter(); | |
13819 | + | |
13820 | + cpu = smp_processor_id(); | |
13821 | + | |
13822 | + ++nmi_count(cpu); | |
13823 | + | |
13824 | + if (!rcu_dereference(nmi_callback)(regs, cpu)) | |
13825 | + default_do_nmi(regs); | |
13826 | + | |
13827 | + nmi_exit(); | |
13828 | +} | |
13829 | + | |
13830 | +void set_nmi_callback(nmi_callback_t callback) | |
13831 | +{ | |
13832 | + vmalloc_sync_all(); | |
13833 | + rcu_assign_pointer(nmi_callback, callback); | |
13834 | +} | |
13835 | +EXPORT_SYMBOL_GPL(set_nmi_callback); | |
13836 | + | |
13837 | +void unset_nmi_callback(void) | |
13838 | +{ | |
13839 | + nmi_callback = dummy_nmi_callback; | |
13840 | +} | |
13841 | +EXPORT_SYMBOL_GPL(unset_nmi_callback); | |
13842 | + | |
13843 | +#ifdef CONFIG_KPROBES | |
13844 | +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) | |
13845 | +{ | |
13846 | + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | |
13847 | + == NOTIFY_STOP) | |
13848 | + return; | |
13849 | + /* This is an interrupt gate, because kprobes wants interrupts | |
13850 | + disabled. Normal trap handlers don't. */ | |
13851 | + restore_interrupts(regs); | |
13852 | + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); | |
13853 | +} | |
13854 | +#endif | |
13855 | + | |
13856 | +/* | |
13857 | + * Our handling of the processor debug registers is non-trivial. | |
13858 | + * We do not clear them on entry and exit from the kernel. Therefore | |
13859 | + * it is possible to get a watchpoint trap here from inside the kernel. | |
13860 | + * However, the code in ./ptrace.c has ensured that the user can | |
13861 | + * only set watchpoints on userspace addresses. Therefore the in-kernel | |
13862 | + * watchpoint trap can only occur in code which is reading/writing | |
13863 | + * from user space. Such code must not hold kernel locks (since it | |
13864 | + * can equally take a page fault), therefore it is safe to call | |
13865 | + * force_sig_info even though that claims and releases locks. | |
13866 | + * | |
13867 | + * Code in ./signal.c ensures that the debug control register | |
13868 | + * is restored before we deliver any signal, and therefore that | |
13869 | + * user code runs with the correct debug control register even though | |
13870 | + * we clear it here. | |
13871 | + * | |
13872 | + * Being careful here means that we don't have to be as careful in a | |
13873 | + * lot of more complicated places (task switching can be a bit lazy | |
13874 | + * about restoring all the debug state, and ptrace doesn't have to | |
13875 | + * find every occurrence of the TF bit that could be saved away even | |
13876 | + * by user code) | |
13877 | + */ | |
13878 | +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) | |
13879 | +{ | |
13880 | + unsigned int condition; | |
13881 | + struct task_struct *tsk = current; | |
13882 | + | |
13883 | + get_debugreg(condition, 6); | |
13884 | + | |
13885 | + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | |
13886 | + SIGTRAP) == NOTIFY_STOP) | |
13887 | + return; | |
13888 | + /* It's safe to allow irq's after DR6 has been saved */ | |
13889 | + if (regs->eflags & X86_EFLAGS_IF) | |
13890 | + local_irq_enable(); | |
13891 | + | |
13892 | + /* Mask out spurious debug traps due to lazy DR7 setting */ | |
13893 | + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | |
13894 | + if (!tsk->thread.debugreg[7]) | |
13895 | + goto clear_dr7; | |
13896 | + } | |
13897 | + | |
13898 | + if (regs->eflags & VM_MASK) | |
13899 | + goto debug_vm86; | |
13900 | + | |
13901 | + /* Save debug status register where ptrace can see it */ | |
13902 | + tsk->thread.debugreg[6] = condition; | |
13903 | + | |
13904 | + /* | |
13905 | + * Single-stepping through TF: make sure we ignore any events in | |
13906 | + * kernel space (but re-enable TF when returning to user mode). | |
13907 | + */ | |
13908 | + if (condition & DR_STEP) { | |
13909 | + /* | |
13910 | + * We already checked v86 mode above, so we can | |
13911 | + * check for kernel mode by just checking the CPL | |
13912 | + * of CS. | |
13913 | + */ | |
13914 | + if (!user_mode(regs)) | |
13915 | + goto clear_TF_reenable; | |
13916 | + } | |
13917 | + | |
13918 | + /* Ok, finally something we can handle */ | |
13919 | + send_sigtrap(tsk, regs, error_code); | |
13920 | + | |
13921 | + /* Disable additional traps. They'll be re-enabled when | |
13922 | + * the signal is delivered. | |
13923 | + */ | |
13924 | +clear_dr7: | |
13925 | + set_debugreg(0, 7); | |
13926 | + return; | |
13927 | + | |
13928 | +debug_vm86: | |
13929 | + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); | |
13930 | + return; | |
13931 | + | |
13932 | +clear_TF_reenable: | |
13933 | + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | |
13934 | + regs->eflags &= ~TF_MASK; | |
13935 | + return; | |
13936 | +} | |
13937 | + | |
13938 | +/* | |
13939 | + * Note that we play around with the 'TS' bit in an attempt to get | |
13940 | + * the correct behaviour even in the presence of the asynchronous | |
13941 | + * IRQ13 behaviour | |
13942 | + */ | |
13943 | +void math_error(void __user *eip) | |
13944 | +{ | |
13945 | + struct task_struct * task; | |
13946 | + siginfo_t info; | |
13947 | + unsigned short cwd, swd; | |
13948 | + | |
13949 | + /* | |
13950 | + * Save the info for the exception handler and clear the error. | |
13951 | + */ | |
13952 | + task = current; | |
13953 | + save_init_fpu(task); | |
13954 | + task->thread.trap_no = 16; | |
13955 | + task->thread.error_code = 0; | |
13956 | + info.si_signo = SIGFPE; | |
13957 | + info.si_errno = 0; | |
13958 | + info.si_code = __SI_FAULT; | |
13959 | + info.si_addr = eip; | |
13960 | + /* | |
13961 | + * (~cwd & swd) will mask out exceptions that are not set to unmasked | |
13962 | + * status. 0x3f is the exception bits in these regs, 0x200 is the | |
13963 | + * C1 reg you need in case of a stack fault, 0x040 is the stack | |
13964 | + * fault bit. We should only be taking one exception at a time, | |
13965 | + * so if this combination doesn't produce any single exception, | |
13966 | + * then we have a bad program that isn't syncronizing its FPU usage | |
13967 | + * and it will suffer the consequences since we won't be able to | |
13968 | + * fully reproduce the context of the exception | |
13969 | + */ | |
13970 | + cwd = get_fpu_cwd(task); | |
13971 | + swd = get_fpu_swd(task); | |
13972 | + switch (swd & ~cwd & 0x3f) { | |
13973 | + case 0x000: /* No unmasked exception */ | |
13974 | + return; | |
13975 | + default: /* Multiple exceptions */ | |
13976 | + break; | |
13977 | + case 0x001: /* Invalid Op */ | |
13978 | + /* | |
13979 | + * swd & 0x240 == 0x040: Stack Underflow | |
13980 | + * swd & 0x240 == 0x240: Stack Overflow | |
13981 | + * User must clear the SF bit (0x40) if set | |
13982 | + */ | |
13983 | + info.si_code = FPE_FLTINV; | |
13984 | + break; | |
13985 | + case 0x002: /* Denormalize */ | |
13986 | + case 0x010: /* Underflow */ | |
13987 | + info.si_code = FPE_FLTUND; | |
13988 | + break; | |
13989 | + case 0x004: /* Zero Divide */ | |
13990 | + info.si_code = FPE_FLTDIV; | |
13991 | + break; | |
13992 | + case 0x008: /* Overflow */ | |
13993 | + info.si_code = FPE_FLTOVF; | |
13994 | + break; | |
13995 | + case 0x020: /* Precision */ | |
13996 | + info.si_code = FPE_FLTRES; | |
13997 | + break; | |
13998 | + } | |
13999 | + force_sig_info(SIGFPE, &info, task); | |
14000 | +} | |
14001 | + | |
14002 | +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) | |
14003 | +{ | |
14004 | + ignore_fpu_irq = 1; | |
14005 | + math_error((void __user *)regs->eip); | |
14006 | +} | |
14007 | + | |
14008 | +static void simd_math_error(void __user *eip) | |
14009 | +{ | |
14010 | + struct task_struct * task; | |
14011 | + siginfo_t info; | |
14012 | + unsigned short mxcsr; | |
14013 | + | |
14014 | + /* | |
14015 | + * Save the info for the exception handler and clear the error. | |
14016 | + */ | |
14017 | + task = current; | |
14018 | + save_init_fpu(task); | |
14019 | + task->thread.trap_no = 19; | |
14020 | + task->thread.error_code = 0; | |
14021 | + info.si_signo = SIGFPE; | |
14022 | + info.si_errno = 0; | |
14023 | + info.si_code = __SI_FAULT; | |
14024 | + info.si_addr = eip; | |
14025 | + /* | |
14026 | + * The SIMD FPU exceptions are handled a little differently, as there | |
14027 | + * is only a single status/control register. Thus, to determine which | |
14028 | + * unmasked exception was caught we must mask the exception mask bits | |
14029 | + * at 0x1f80, and then use these to mask the exception bits at 0x3f. | |
14030 | + */ | |
14031 | + mxcsr = get_fpu_mxcsr(task); | |
14032 | + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | |
14033 | + case 0x000: | |
14034 | + default: | |
14035 | + break; | |
14036 | + case 0x001: /* Invalid Op */ | |
14037 | + info.si_code = FPE_FLTINV; | |
14038 | + break; | |
14039 | + case 0x002: /* Denormalize */ | |
14040 | + case 0x010: /* Underflow */ | |
14041 | + info.si_code = FPE_FLTUND; | |
14042 | + break; | |
14043 | + case 0x004: /* Zero Divide */ | |
14044 | + info.si_code = FPE_FLTDIV; | |
14045 | + break; | |
14046 | + case 0x008: /* Overflow */ | |
14047 | + info.si_code = FPE_FLTOVF; | |
14048 | + break; | |
14049 | + case 0x020: /* Precision */ | |
14050 | + info.si_code = FPE_FLTRES; | |
14051 | + break; | |
14052 | + } | |
14053 | + force_sig_info(SIGFPE, &info, task); | |
14054 | +} | |
14055 | + | |
14056 | +fastcall void do_simd_coprocessor_error(struct pt_regs * regs, | |
14057 | + long error_code) | |
14058 | +{ | |
14059 | + if (cpu_has_xmm) { | |
14060 | + /* Handle SIMD FPU exceptions on PIII+ processors. */ | |
14061 | + ignore_fpu_irq = 1; | |
14062 | + simd_math_error((void __user *)regs->eip); | |
14063 | + } else { | |
14064 | + /* | |
14065 | + * Handle strange cache flush from user space exception | |
14066 | + * in all other cases. This is undocumented behaviour. | |
14067 | + */ | |
14068 | + if (regs->eflags & VM_MASK) { | |
14069 | + handle_vm86_fault((struct kernel_vm86_regs *)regs, | |
14070 | + error_code); | |
14071 | + return; | |
14072 | + } | |
14073 | + current->thread.trap_no = 19; | |
14074 | + current->thread.error_code = error_code; | |
14075 | + die_if_kernel("cache flush denied", regs, error_code); | |
14076 | + force_sig(SIGSEGV, current); | |
14077 | + } | |
14078 | +} | |
14079 | + | |
14080 | +#ifndef CONFIG_XEN | |
14081 | +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, | |
14082 | + long error_code) | |
14083 | +{ | |
14084 | +#if 0 | |
14085 | + /* No need to warn about this any longer. */ | |
14086 | + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); | |
14087 | +#endif | |
14088 | +} | |
14089 | + | |
14090 | +fastcall void setup_x86_bogus_stack(unsigned char * stk) | |
14091 | +{ | |
14092 | + unsigned long *switch16_ptr, *switch32_ptr; | |
14093 | + struct pt_regs *regs; | |
14094 | + unsigned long stack_top, stack_bot; | |
14095 | + unsigned short iret_frame16_off; | |
14096 | + int cpu = smp_processor_id(); | |
14097 | + /* reserve the space on 32bit stack for the magic switch16 pointer */ | |
14098 | + memmove(stk, stk + 8, sizeof(struct pt_regs)); | |
14099 | + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); | |
14100 | + regs = (struct pt_regs *)stk; | |
14101 | + /* now the switch32 on 16bit stack */ | |
14102 | + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); | |
14103 | + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; | |
14104 | + switch32_ptr = (unsigned long *)(stack_top - 8); | |
14105 | + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; | |
14106 | + /* copy iret frame on 16bit stack */ | |
14107 | + memcpy((void *)(stack_bot + iret_frame16_off), ®s->eip, 20); | |
14108 | + /* fill in the switch pointers */ | |
14109 | + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; | |
14110 | + switch16_ptr[1] = __ESPFIX_SS; | |
14111 | + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + | |
14112 | + 8 - CPU_16BIT_STACK_SIZE; | |
14113 | + switch32_ptr[1] = __KERNEL_DS; | |
14114 | +} | |
14115 | + | |
14116 | +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) | |
14117 | +{ | |
14118 | + unsigned long *switch32_ptr; | |
14119 | + unsigned char *stack16, *stack32; | |
14120 | + unsigned long stack_top, stack_bot; | |
14121 | + int len; | |
14122 | + int cpu = smp_processor_id(); | |
14123 | + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); | |
14124 | + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; | |
14125 | + switch32_ptr = (unsigned long *)(stack_top - 8); | |
14126 | + /* copy the data from 16bit stack to 32bit stack */ | |
14127 | + len = CPU_16BIT_STACK_SIZE - 8 - sp; | |
14128 | + stack16 = (unsigned char *)(stack_bot + sp); | |
14129 | + stack32 = (unsigned char *) | |
14130 | + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); | |
14131 | + memcpy(stack32, stack16, len); | |
14132 | + return stack32; | |
14133 | +} | |
14134 | +#endif | |
14135 | + | |
14136 | +/* | |
14137 | + * 'math_state_restore()' saves the current math information in the | |
14138 | + * old math state array, and gets the new ones from the current task | |
14139 | + * | |
14140 | + * Careful.. There are problems with IBM-designed IRQ13 behaviour. | |
14141 | + * Don't touch unless you *really* know how it works. | |
14142 | + * | |
14143 | + * Must be called with kernel preemption disabled (in this case, | |
14144 | + * local interrupts are disabled at the call-site in entry.S). | |
14145 | + */ | |
14146 | +asmlinkage void math_state_restore(struct pt_regs regs) | |
14147 | +{ | |
14148 | + struct thread_info *thread = current_thread_info(); | |
14149 | + struct task_struct *tsk = thread->task; | |
14150 | + | |
14151 | + /* NB. 'clts' is done for us by Xen during virtual trap. */ | |
14152 | + if (!tsk_used_math(tsk)) | |
14153 | + init_fpu(tsk); | |
14154 | + restore_fpu(tsk); | |
14155 | + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | |
14156 | +} | |
14157 | + | |
14158 | +#ifndef CONFIG_MATH_EMULATION | |
14159 | + | |
14160 | +asmlinkage void math_emulate(long arg) | |
14161 | +{ | |
14162 | + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); | |
14163 | + printk(KERN_EMERG "killing %s.\n",current->comm); | |
14164 | + force_sig(SIGFPE,current); | |
14165 | + schedule(); | |
14166 | +} | |
14167 | + | |
14168 | +#endif /* CONFIG_MATH_EMULATION */ | |
14169 | + | |
14170 | +#ifdef CONFIG_X86_F00F_BUG | |
14171 | +void __init trap_init_f00f_bug(void) | |
14172 | +{ | |
14173 | + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); | |
14174 | + | |
14175 | + /* | |
14176 | + * Update the IDT descriptor and reload the IDT so that | |
14177 | + * it uses the read-only mapped virtual address. | |
14178 | + */ | |
14179 | + idt_descr.address = fix_to_virt(FIX_F00F_IDT); | |
14180 | + load_idt(&idt_descr); | |
14181 | +} | |
14182 | +#endif | |
14183 | + | |
14184 | + | |
14185 | +/* | |
14186 | + * NB. All these are "trap gates" (i.e. events_mask isn't set) except | |
14187 | + * for those that specify <dpl>|4 in the second field. | |
14188 | + */ | |
14189 | +static trap_info_t __cpuinitdata trap_table[] = { | |
14190 | + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, | |
14191 | + { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, | |
14192 | + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, | |
14193 | + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, | |
14194 | + { 5, 0, __KERNEL_CS, (unsigned long)bounds }, | |
14195 | + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, | |
14196 | + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, | |
14197 | + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, | |
14198 | + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, | |
14199 | + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, | |
14200 | + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, | |
14201 | + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, | |
14202 | + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, | |
14203 | + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, | |
14204 | + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, | |
14205 | + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, | |
14206 | +#ifdef CONFIG_X86_MCE | |
14207 | + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, | |
14208 | +#endif | |
14209 | + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, | |
14210 | + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, | |
14211 | + { 0, 0, 0, 0 } | |
14212 | +}; | |
14213 | + | |
14214 | +void __init trap_init(void) | |
14215 | +{ | |
14216 | + int ret; | |
14217 | + | |
14218 | + ret = HYPERVISOR_set_trap_table(trap_table); | |
14219 | + if (ret) | |
14220 | + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); | |
14221 | + | |
14222 | + if (cpu_has_fxsr) { | |
14223 | + /* | |
14224 | + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. | |
14225 | + * Generates a compile-time "error: zero width for bit-field" if | |
14226 | + * the alignment is wrong. | |
14227 | + */ | |
14228 | + struct fxsrAlignAssert { | |
14229 | + int _:!(offsetof(struct task_struct, | |
14230 | + thread.i387.fxsave) & 15); | |
14231 | + }; | |
14232 | + | |
14233 | + printk(KERN_INFO "Enabling fast FPU save and restore... "); | |
14234 | + set_in_cr4(X86_CR4_OSFXSR); | |
14235 | + printk("done.\n"); | |
14236 | + } | |
14237 | + if (cpu_has_xmm) { | |
14238 | + printk(KERN_INFO "Enabling unmasked SIMD FPU exception " | |
14239 | + "support... "); | |
14240 | + set_in_cr4(X86_CR4_OSXMMEXCPT); | |
14241 | + printk("done.\n"); | |
14242 | + } | |
14243 | + | |
14244 | + /* | |
14245 | + * Should be a barrier for any external CPU state. | |
14246 | + */ | |
14247 | + cpu_init(); | |
14248 | +} | |
14249 | + | |
14250 | +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) | |
14251 | +{ | |
14252 | + const trap_info_t *t = trap_table; | |
14253 | + | |
14254 | + for (t = trap_table; t->address; t++) { | |
14255 | + trap_ctxt[t->vector].flags = t->flags; | |
14256 | + trap_ctxt[t->vector].cs = t->cs; | |
14257 | + trap_ctxt[t->vector].address = t->address; | |
14258 | + } | |
14259 | +} | |
14260 | + | |
14261 | +static int __init kstack_setup(char *s) | |
14262 | +{ | |
14263 | + kstack_depth_to_print = simple_strtoul(s, NULL, 0); | |
14264 | + return 1; | |
14265 | +} | |
14266 | +__setup("kstack=", kstack_setup); | |
14267 | + | |
14268 | +#ifdef CONFIG_STACK_UNWIND | |
14269 | +static int __init call_trace_setup(char *s) | |
14270 | +{ | |
14271 | + if (strcmp(s, "old") == 0) | |
14272 | + call_trace = -1; | |
14273 | + else if (strcmp(s, "both") == 0) | |
14274 | + call_trace = 0; | |
14275 | + else if (strcmp(s, "newfallback") == 0) | |
14276 | + call_trace = 1; | |
14277 | + else if (strcmp(s, "new") == 2) | |
14278 | + call_trace = 2; | |
14279 | + return 1; | |
14280 | +} | |
14281 | +__setup("call_trace=", call_trace_setup); | |
14282 | +#endif | |
14283 | Index: head-2008-11-25/arch/x86/mach-xen/Makefile | |
14284 | =================================================================== | |
14285 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
14286 | +++ head-2008-11-25/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200 | |
14287 | @@ -0,0 +1,5 @@ | |
14288 | +# | |
14289 | +# Makefile for the linux kernel. | |
14290 | +# | |
14291 | + | |
14292 | +obj-y := setup.o | |
14293 | Index: head-2008-11-25/arch/x86/mach-xen/setup.c | |
14294 | =================================================================== | |
14295 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
14296 | +++ head-2008-11-25/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200 | |
14297 | @@ -0,0 +1,158 @@ | |
14298 | +/* | |
14299 | + * Machine specific setup for generic | |
14300 | + */ | |
14301 | + | |
14302 | +#include <linux/mm.h> | |
14303 | +#include <linux/smp.h> | |
14304 | +#include <linux/init.h> | |
14305 | +#include <linux/interrupt.h> | |
14306 | +#include <linux/module.h> | |
14307 | +#include <asm/acpi.h> | |
14308 | +#include <asm/arch_hooks.h> | |
14309 | +#include <asm/e820.h> | |
14310 | +#include <asm/setup.h> | |
14311 | +#include <asm/fixmap.h> | |
14312 | + | |
14313 | +#include <xen/interface/callback.h> | |
14314 | +#include <xen/interface/memory.h> | |
14315 | + | |
14316 | +#ifdef CONFIG_HOTPLUG_CPU | |
14317 | +#define DEFAULT_SEND_IPI (1) | |
14318 | +#else | |
14319 | +#define DEFAULT_SEND_IPI (0) | |
14320 | +#endif | |
14321 | + | |
14322 | +int no_broadcast=DEFAULT_SEND_IPI; | |
14323 | + | |
14324 | +static __init int no_ipi_broadcast(char *str) | |
14325 | +{ | |
14326 | + get_option(&str, &no_broadcast); | |
14327 | + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : | |
14328 | + "IPI Broadcast"); | |
14329 | + return 1; | |
14330 | +} | |
14331 | + | |
14332 | +__setup("no_ipi_broadcast", no_ipi_broadcast); | |
14333 | + | |
14334 | +static int __init print_ipi_mode(void) | |
14335 | +{ | |
14336 | + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : | |
14337 | + "Shortcut"); | |
14338 | + return 0; | |
14339 | +} | |
14340 | + | |
14341 | +late_initcall(print_ipi_mode); | |
14342 | + | |
14343 | +/** | |
14344 | + * machine_specific_memory_setup - Hook for machine specific memory setup. | |
14345 | + * | |
14346 | + * Description: | |
14347 | + * This is included late in kernel/setup.c so that it can make | |
14348 | + * use of all of the static functions. | |
14349 | + **/ | |
14350 | + | |
14351 | +char * __init machine_specific_memory_setup(void) | |
14352 | +{ | |
14353 | + int rc; | |
14354 | + struct xen_memory_map memmap; | |
14355 | + /* | |
14356 | + * This is rather large for a stack variable but this early in | |
14357 | + * the boot process we know we have plenty slack space. | |
14358 | + */ | |
14359 | + struct e820entry map[E820MAX]; | |
14360 | + | |
14361 | + memmap.nr_entries = E820MAX; | |
14362 | + set_xen_guest_handle(memmap.buffer, map); | |
14363 | + | |
14364 | + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); | |
14365 | + if ( rc == -ENOSYS ) { | |
14366 | + memmap.nr_entries = 1; | |
14367 | + map[0].addr = 0ULL; | |
14368 | + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages); | |
14369 | + /* 8MB slack (to balance backend allocations). */ | |
14370 | + map[0].size += 8ULL << 20; | |
14371 | + map[0].type = E820_RAM; | |
14372 | + rc = 0; | |
14373 | + } | |
14374 | + BUG_ON(rc); | |
14375 | + | |
14376 | + sanitize_e820_map(map, (char *)&memmap.nr_entries); | |
14377 | + | |
14378 | + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0); | |
14379 | + | |
14380 | + return "Xen"; | |
14381 | +} | |
14382 | + | |
14383 | + | |
14384 | +extern void hypervisor_callback(void); | |
14385 | +extern void failsafe_callback(void); | |
14386 | +extern void nmi(void); | |
14387 | + | |
14388 | +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; | |
14389 | +EXPORT_SYMBOL(machine_to_phys_mapping); | |
14390 | +unsigned int machine_to_phys_order; | |
14391 | +EXPORT_SYMBOL(machine_to_phys_order); | |
14392 | + | |
14393 | +void __init pre_setup_arch_hook(void) | |
14394 | +{ | |
14395 | + struct xen_machphys_mapping mapping; | |
14396 | + unsigned long machine_to_phys_nr_ents; | |
14397 | + struct xen_platform_parameters pp; | |
14398 | + | |
14399 | + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; | |
14400 | + | |
14401 | + setup_xen_features(); | |
14402 | + | |
14403 | + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) | |
14404 | + set_fixaddr_top(pp.virt_start); | |
14405 | + | |
14406 | + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { | |
14407 | + machine_to_phys_mapping = (unsigned long *)mapping.v_start; | |
14408 | + machine_to_phys_nr_ents = mapping.max_mfn + 1; | |
14409 | + } else | |
14410 | + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; | |
14411 | + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); | |
14412 | + | |
14413 | + if (!xen_feature(XENFEAT_auto_translated_physmap)) | |
14414 | + phys_to_machine_mapping = | |
14415 | + (unsigned long *)xen_start_info->mfn_list; | |
14416 | +} | |
14417 | + | |
14418 | +void __init machine_specific_arch_setup(void) | |
14419 | +{ | |
14420 | + int ret; | |
14421 | + static struct callback_register __initdata event = { | |
14422 | + .type = CALLBACKTYPE_event, | |
14423 | + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback }, | |
14424 | + }; | |
14425 | + static struct callback_register __initdata failsafe = { | |
14426 | + .type = CALLBACKTYPE_failsafe, | |
14427 | + .address = { __KERNEL_CS, (unsigned long)failsafe_callback }, | |
14428 | + }; | |
14429 | + static struct callback_register __initdata nmi_cb = { | |
14430 | + .type = CALLBACKTYPE_nmi, | |
14431 | + .address = { __KERNEL_CS, (unsigned long)nmi }, | |
14432 | + }; | |
14433 | + | |
14434 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); | |
14435 | + if (ret == 0) | |
14436 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); | |
14437 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
14438 | + if (ret == -ENOSYS) | |
14439 | + ret = HYPERVISOR_set_callbacks( | |
14440 | + event.address.cs, event.address.eip, | |
14441 | + failsafe.address.cs, failsafe.address.eip); | |
14442 | +#endif | |
14443 | + BUG_ON(ret); | |
14444 | + | |
14445 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); | |
14446 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
14447 | + if (ret == -ENOSYS) { | |
14448 | + static struct xennmi_callback __initdata cb = { | |
14449 | + .handler_address = (unsigned long)nmi | |
14450 | + }; | |
14451 | + | |
14452 | + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); | |
14453 | + } | |
14454 | +#endif | |
14455 | +} | |
14456 | Index: head-2008-11-25/arch/x86/lib/scrub.c | |
14457 | =================================================================== | |
14458 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
14459 | +++ head-2008-11-25/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100 | |
14460 | @@ -0,0 +1,21 @@ | |
14461 | +#include <asm/cpufeature.h> | |
14462 | +#include <asm/page.h> | |
14463 | +#include <asm/processor.h> | |
14464 | + | |
14465 | +void scrub_pages(void *v, unsigned int count) | |
14466 | +{ | |
14467 | + if (likely(cpu_has_xmm2)) { | |
14468 | + unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4); | |
14469 | + | |
14470 | + for (; n--; v += sizeof(long) * 4) | |
14471 | + asm("movnti %1,(%0)\n\t" | |
14472 | + "movnti %1,%c2(%0)\n\t" | |
14473 | + "movnti %1,2*%c2(%0)\n\t" | |
14474 | + "movnti %1,3*%c2(%0)\n\t" | |
14475 | + : : "r" (v), "r" (0L), "i" (sizeof(long)) | |
14476 | + : "memory"); | |
14477 | + asm volatile("sfence" : : : "memory"); | |
14478 | + } else | |
14479 | + for (; count--; v += PAGE_SIZE) | |
14480 | + clear_page(v); | |
14481 | +} | |
14482 | Index: head-2008-11-25/arch/x86/mm/fault_32-xen.c | |
14483 | =================================================================== | |
14484 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
14485 | +++ head-2008-11-25/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100 | |
14486 | @@ -0,0 +1,779 @@ | |
14487 | +/* | |
14488 | + * linux/arch/i386/mm/fault.c | |
14489 | + * | |
14490 | + * Copyright (C) 1995 Linus Torvalds | |
14491 | + */ | |
14492 | + | |
14493 | +#include <linux/signal.h> | |
14494 | +#include <linux/sched.h> | |
14495 | +#include <linux/kernel.h> | |
14496 | +#include <linux/errno.h> | |
14497 | +#include <linux/string.h> | |
14498 | +#include <linux/types.h> | |
14499 | +#include <linux/ptrace.h> | |
14500 | +#include <linux/mman.h> | |
14501 | +#include <linux/mm.h> | |
14502 | +#include <linux/smp.h> | |
14503 | +#include <linux/smp_lock.h> | |
14504 | +#include <linux/interrupt.h> | |
14505 | +#include <linux/init.h> | |
14506 | +#include <linux/tty.h> | |
14507 | +#include <linux/vt_kern.h> /* For unblank_screen() */ | |
14508 | +#include <linux/highmem.h> | |
14509 | +#include <linux/module.h> | |
14510 | +#include <linux/kprobes.h> | |
14511 | + | |
14512 | +#include <asm/system.h> | |
14513 | +#include <asm/uaccess.h> | |
14514 | +#include <asm/desc.h> | |
14515 | +#include <asm/kdebug.h> | |
14516 | + | |
14517 | +extern void die(const char *,struct pt_regs *,long); | |
14518 | + | |
14519 | +#ifdef CONFIG_KPROBES | |
14520 | +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); | |
14521 | +int register_page_fault_notifier(struct notifier_block *nb) | |
14522 | +{ | |
14523 | + vmalloc_sync_all(); | |
14524 | + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); | |
14525 | +} | |
14526 | + | |
14527 | +int unregister_page_fault_notifier(struct notifier_block *nb) | |
14528 | +{ | |
14529 | + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); | |
14530 | +} | |
14531 | + | |
14532 | +static inline int notify_page_fault(enum die_val val, const char *str, | |
14533 | + struct pt_regs *regs, long err, int trap, int sig) | |
14534 | +{ | |
14535 | + struct die_args args = { | |
14536 | + .regs = regs, | |
14537 | + .str = str, | |
14538 | + .err = err, | |
14539 | + .trapnr = trap, | |
14540 | + .signr = sig | |
14541 | + }; | |
14542 | + return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); | |
14543 | +} | |
14544 | +#else | |
14545 | +static inline int notify_page_fault(enum die_val val, const char *str, | |
14546 | + struct pt_regs *regs, long err, int trap, int sig) | |
14547 | +{ | |
14548 | + return NOTIFY_DONE; | |
14549 | +} | |
14550 | +#endif | |
14551 | + | |
14552 | + | |
14553 | +/* | |
14554 | + * Unlock any spinlocks which will prevent us from getting the | |
14555 | + * message out | |
14556 | + */ | |
14557 | +void bust_spinlocks(int yes) | |
14558 | +{ | |
14559 | + int loglevel_save = console_loglevel; | |
14560 | + | |
14561 | + if (yes) { | |
14562 | + oops_in_progress = 1; | |
14563 | + return; | |
14564 | + } | |
14565 | +#ifdef CONFIG_VT | |
14566 | + unblank_screen(); | |
14567 | +#endif | |
14568 | + oops_in_progress = 0; | |
14569 | + /* | |
14570 | + * OK, the message is on the console. Now we call printk() | |
14571 | + * without oops_in_progress set so that printk will give klogd | |
14572 | + * a poke. Hold onto your hats... | |
14573 | + */ | |
14574 | + console_loglevel = 15; /* NMI oopser may have shut the console up */ | |
14575 | + printk(" "); | |
14576 | + console_loglevel = loglevel_save; | |
14577 | +} | |
14578 | + | |
14579 | +/* | |
14580 | + * Return EIP plus the CS segment base. The segment limit is also | |
14581 | + * adjusted, clamped to the kernel/user address space (whichever is | |
14582 | + * appropriate), and returned in *eip_limit. | |
14583 | + * | |
14584 | + * The segment is checked, because it might have been changed by another | |
14585 | + * task between the original faulting instruction and here. | |
14586 | + * | |
14587 | + * If CS is no longer a valid code segment, or if EIP is beyond the | |
14588 | + * limit, or if it is a kernel address when CS is not a kernel segment, | |
14589 | + * then the returned value will be greater than *eip_limit. | |
14590 | + * | |
14591 | + * This is slow, but is very rarely executed. | |
14592 | + */ | |
14593 | +static inline unsigned long get_segment_eip(struct pt_regs *regs, | |
14594 | + unsigned long *eip_limit) | |
14595 | +{ | |
14596 | + unsigned long eip = regs->eip; | |
14597 | + unsigned seg = regs->xcs & 0xffff; | |
14598 | + u32 seg_ar, seg_limit, base, *desc; | |
14599 | + | |
14600 | + /* Unlikely, but must come before segment checks. */ | |
14601 | + if (unlikely(regs->eflags & VM_MASK)) { | |
14602 | + base = seg << 4; | |
14603 | + *eip_limit = base + 0xffff; | |
14604 | + return base + (eip & 0xffff); | |
14605 | + } | |
14606 | + | |
14607 | + /* The standard kernel/user address space limit. */ | |
14608 | + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; | |
14609 | + | |
14610 | + /* By far the most common cases. */ | |
14611 | + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS())) | |
14612 | + return eip; | |
14613 | + | |
14614 | + /* Check the segment exists, is within the current LDT/GDT size, | |
14615 | + that kernel/user (ring 0..3) has the appropriate privilege, | |
14616 | + that it's a code segment, and get the limit. */ | |
14617 | + __asm__ ("larl %3,%0; lsll %3,%1" | |
14618 | + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); | |
14619 | + if ((~seg_ar & 0x9800) || eip > seg_limit) { | |
14620 | + *eip_limit = 0; | |
14621 | + return 1; /* So that returned eip > *eip_limit. */ | |
14622 | + } | |
14623 | + | |
14624 | + /* Get the GDT/LDT descriptor base. | |
14625 | + When you look for races in this code remember that | |
14626 | + LDT and other horrors are only used in user space. */ | |
14627 | + if (seg & (1<<2)) { | |
14628 | + /* Must lock the LDT while reading it. */ | |
14629 | + down(¤t->mm->context.sem); | |
14630 | + desc = current->mm->context.ldt; | |
14631 | + desc = (void *)desc + (seg & ~7); | |
14632 | + } else { | |
14633 | + /* Must disable preemption while reading the GDT. */ | |
14634 | + desc = (u32 *)get_cpu_gdt_table(get_cpu()); | |
14635 | + desc = (void *)desc + (seg & ~7); | |
14636 | + } | |
14637 | + | |
14638 | + /* Decode the code segment base from the descriptor */ | |
14639 | + base = get_desc_base((unsigned long *)desc); | |
14640 | + | |
14641 | + if (seg & (1<<2)) { | |
14642 | + up(¤t->mm->context.sem); | |
14643 | + } else | |
14644 | + put_cpu(); | |
14645 | + | |
14646 | + /* Adjust EIP and segment limit, and clamp at the kernel limit. | |
14647 | + It's legitimate for segments to wrap at 0xffffffff. */ | |
14648 | + seg_limit += base; | |
14649 | + if (seg_limit < *eip_limit && seg_limit >= base) | |
14650 | + *eip_limit = seg_limit; | |
14651 | + return eip + base; | |
14652 | +} | |
14653 | + | |
14654 | +/* | |
14655 | + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | |
14656 | + * Check that here and ignore it. | |
14657 | + */ | |
14658 | +static int __is_prefetch(struct pt_regs *regs, unsigned long addr) | |
14659 | +{ | |
14660 | + unsigned long limit; | |
14661 | + unsigned long instr = get_segment_eip (regs, &limit); | |
14662 | + int scan_more = 1; | |
14663 | + int prefetch = 0; | |
14664 | + int i; | |
14665 | + | |
14666 | + for (i = 0; scan_more && i < 15; i++) { | |
14667 | + unsigned char opcode; | |
14668 | + unsigned char instr_hi; | |
14669 | + unsigned char instr_lo; | |
14670 | + | |
14671 | + if (instr > limit) | |
14672 | + break; | |
14673 | + if (__get_user(opcode, (unsigned char __user *) instr)) | |
14674 | + break; | |
14675 | + | |
14676 | + instr_hi = opcode & 0xf0; | |
14677 | + instr_lo = opcode & 0x0f; | |
14678 | + instr++; | |
14679 | + | |
14680 | + switch (instr_hi) { | |
14681 | + case 0x20: | |
14682 | + case 0x30: | |
14683 | + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ | |
14684 | + scan_more = ((instr_lo & 7) == 0x6); | |
14685 | + break; | |
14686 | + | |
14687 | + case 0x60: | |
14688 | + /* 0x64 thru 0x67 are valid prefixes in all modes. */ | |
14689 | + scan_more = (instr_lo & 0xC) == 0x4; | |
14690 | + break; | |
14691 | + case 0xF0: | |
14692 | + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ | |
14693 | + scan_more = !instr_lo || (instr_lo>>1) == 1; | |
14694 | + break; | |
14695 | + case 0x00: | |
14696 | + /* Prefetch instruction is 0x0F0D or 0x0F18 */ | |
14697 | + scan_more = 0; | |
14698 | + if (instr > limit) | |
14699 | + break; | |
14700 | + if (__get_user(opcode, (unsigned char __user *) instr)) | |
14701 | + break; | |
14702 | + prefetch = (instr_lo == 0xF) && | |
14703 | + (opcode == 0x0D || opcode == 0x18); | |
14704 | + break; | |
14705 | + default: | |
14706 | + scan_more = 0; | |
14707 | + break; | |
14708 | + } | |
14709 | + } | |
14710 | + return prefetch; | |
14711 | +} | |
14712 | + | |
14713 | +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, | |
14714 | + unsigned long error_code) | |
14715 | +{ | |
14716 | + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | |
14717 | + boot_cpu_data.x86 >= 6)) { | |
14718 | + /* Catch an obscure case of prefetch inside an NX page. */ | |
14719 | + if (nx_enabled && (error_code & 16)) | |
14720 | + return 0; | |
14721 | + return __is_prefetch(regs, addr); | |
14722 | + } | |
14723 | + return 0; | |
14724 | +} | |
14725 | + | |
14726 | +static noinline void force_sig_info_fault(int si_signo, int si_code, | |
14727 | + unsigned long address, struct task_struct *tsk) | |
14728 | +{ | |
14729 | + siginfo_t info; | |
14730 | + | |
14731 | + info.si_signo = si_signo; | |
14732 | + info.si_errno = 0; | |
14733 | + info.si_code = si_code; | |
14734 | + info.si_addr = (void __user *)address; | |
14735 | + force_sig_info(si_signo, &info, tsk); | |
14736 | +} | |
14737 | + | |
14738 | +fastcall void do_invalid_op(struct pt_regs *, unsigned long); | |
14739 | + | |
14740 | +#ifdef CONFIG_X86_PAE | |
14741 | +static void dump_fault_path(unsigned long address) | |
14742 | +{ | |
14743 | + unsigned long *p, page; | |
14744 | + unsigned long mfn; | |
14745 | + | |
14746 | + page = read_cr3(); | |
14747 | + p = (unsigned long *)__va(page); | |
14748 | + p += (address >> 30) * 2; | |
14749 | + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); | |
14750 | + if (p[0] & _PAGE_PRESENT) { | |
14751 | + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); | |
14752 | + page = mfn_to_pfn(mfn) << PAGE_SHIFT; | |
14753 | + p = (unsigned long *)__va(page); | |
14754 | + address &= 0x3fffffff; | |
14755 | + p += (address >> 21) * 2; | |
14756 | + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", | |
14757 | + page, p[1], p[0]); | |
14758 | + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); | |
14759 | +#ifdef CONFIG_HIGHPTE | |
14760 | + if (mfn_to_pfn(mfn) >= highstart_pfn) | |
14761 | + return; | |
14762 | +#endif | |
14763 | + if (p[0] & _PAGE_PRESENT) { | |
14764 | + page = mfn_to_pfn(mfn) << PAGE_SHIFT; | |
14765 | + p = (unsigned long *) __va(page); | |
14766 | + address &= 0x001fffff; | |
14767 | + p += (address >> 12) * 2; | |
14768 | + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n", | |
14769 | + page, p[1], p[0]); | |
14770 | + } | |
14771 | + } | |
14772 | +} | |
14773 | +#else | |
14774 | +static void dump_fault_path(unsigned long address) | |
14775 | +{ | |
14776 | + unsigned long page; | |
14777 | + | |
14778 | + page = read_cr3(); | |
14779 | + page = ((unsigned long *) __va(page))[address >> 22]; | |
14780 | + if (oops_may_print()) | |
14781 | + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, | |
14782 | + machine_to_phys(page)); | |
14783 | + /* | |
14784 | + * We must not directly access the pte in the highpte | |
14785 | + * case if the page table is located in highmem. | |
14786 | + * And lets rather not kmap-atomic the pte, just in case | |
14787 | + * it's allocated already. | |
14788 | + */ | |
14789 | +#ifdef CONFIG_HIGHPTE | |
14790 | + if ((page >> PAGE_SHIFT) >= highstart_pfn) | |
14791 | + return; | |
14792 | +#endif | |
14793 | + if ((page & 1) && oops_may_print()) { | |
14794 | + page &= PAGE_MASK; | |
14795 | + address &= 0x003ff000; | |
14796 | + page = machine_to_phys(page); | |
14797 | + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; | |
14798 | + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, | |
14799 | + machine_to_phys(page)); | |
14800 | + } | |
14801 | +} | |
14802 | +#endif | |
14803 | + | |
14804 | +static int spurious_fault(struct pt_regs *regs, | |
14805 | + unsigned long address, | |
14806 | + unsigned long error_code) | |
14807 | +{ | |
14808 | + pgd_t *pgd; | |
14809 | + pud_t *pud; | |
14810 | + pmd_t *pmd; | |
14811 | + pte_t *pte; | |
14812 | + | |
14813 | + /* Reserved-bit violation or user access to kernel space? */ | |
14814 | + if (error_code & 0x0c) | |
14815 | + return 0; | |
14816 | + | |
14817 | + pgd = init_mm.pgd + pgd_index(address); | |
14818 | + if (!pgd_present(*pgd)) | |
14819 | + return 0; | |
14820 | + | |
14821 | + pud = pud_offset(pgd, address); | |
14822 | + if (!pud_present(*pud)) | |
14823 | + return 0; | |
14824 | + | |
14825 | + pmd = pmd_offset(pud, address); | |
14826 | + if (!pmd_present(*pmd)) | |
14827 | + return 0; | |
14828 | + | |
14829 | + pte = pte_offset_kernel(pmd, address); | |
14830 | + if (!pte_present(*pte)) | |
14831 | + return 0; | |
14832 | + if ((error_code & 0x02) && !pte_write(*pte)) | |
14833 | + return 0; | |
14834 | +#ifdef CONFIG_X86_PAE | |
14835 | + if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX)) | |
14836 | + return 0; | |
14837 | +#endif | |
14838 | + | |
14839 | + return 1; | |
14840 | +} | |
14841 | + | |
14842 | +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | |
14843 | +{ | |
14844 | + unsigned index = pgd_index(address); | |
14845 | + pgd_t *pgd_k; | |
14846 | + pud_t *pud, *pud_k; | |
14847 | + pmd_t *pmd, *pmd_k; | |
14848 | + | |
14849 | + pgd += index; | |
14850 | + pgd_k = init_mm.pgd + index; | |
14851 | + | |
14852 | + if (!pgd_present(*pgd_k)) | |
14853 | + return NULL; | |
14854 | + | |
14855 | + /* | |
14856 | + * set_pgd(pgd, *pgd_k); here would be useless on PAE | |
14857 | + * and redundant with the set_pmd() on non-PAE. As would | |
14858 | + * set_pud. | |
14859 | + */ | |
14860 | + | |
14861 | + pud = pud_offset(pgd, address); | |
14862 | + pud_k = pud_offset(pgd_k, address); | |
14863 | + if (!pud_present(*pud_k)) | |
14864 | + return NULL; | |
14865 | + | |
14866 | + pmd = pmd_offset(pud, address); | |
14867 | + pmd_k = pmd_offset(pud_k, address); | |
14868 | + if (!pmd_present(*pmd_k)) | |
14869 | + return NULL; | |
14870 | + if (!pmd_present(*pmd)) | |
14871 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
14872 | + set_pmd(pmd, *pmd_k); | |
14873 | +#else | |
14874 | + /* | |
14875 | + * When running on older Xen we must launder *pmd_k through | |
14876 | + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. | |
14877 | + */ | |
14878 | + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); | |
14879 | +#endif | |
14880 | + else | |
14881 | + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | |
14882 | + return pmd_k; | |
14883 | +} | |
14884 | + | |
14885 | +/* | |
14886 | + * Handle a fault on the vmalloc or module mapping area | |
14887 | + * | |
14888 | + * This assumes no large pages in there. | |
14889 | + */ | |
14890 | +static inline int vmalloc_fault(unsigned long address) | |
14891 | +{ | |
14892 | + unsigned long pgd_paddr; | |
14893 | + pmd_t *pmd_k; | |
14894 | + pte_t *pte_k; | |
14895 | + /* | |
14896 | + * Synchronize this task's top level page-table | |
14897 | + * with the 'reference' page table. | |
14898 | + * | |
14899 | + * Do _not_ use "current" here. We might be inside | |
14900 | + * an interrupt in the middle of a task switch.. | |
14901 | + */ | |
14902 | + pgd_paddr = read_cr3(); | |
14903 | + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | |
14904 | + if (!pmd_k) | |
14905 | + return -1; | |
14906 | + pte_k = pte_offset_kernel(pmd_k, address); | |
14907 | + if (!pte_present(*pte_k)) | |
14908 | + return -1; | |
14909 | + return 0; | |
14910 | +} | |
14911 | + | |
14912 | +/* | |
14913 | + * This routine handles page faults. It determines the address, | |
14914 | + * and the problem, and then passes it off to one of the appropriate | |
14915 | + * routines. | |
14916 | + * | |
14917 | + * error_code: | |
14918 | + * bit 0 == 0 means no page found, 1 means protection fault | |
14919 | + * bit 1 == 0 means read, 1 means write | |
14920 | + * bit 2 == 0 means kernel, 1 means user-mode | |
14921 | + * bit 3 == 1 means use of reserved bit detected | |
14922 | + * bit 4 == 1 means fault was an instruction fetch | |
14923 | + */ | |
14924 | +fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |
14925 | + unsigned long error_code) | |
14926 | +{ | |
14927 | + struct task_struct *tsk; | |
14928 | + struct mm_struct *mm; | |
14929 | + struct vm_area_struct * vma; | |
14930 | + unsigned long address; | |
14931 | + int write, si_code; | |
14932 | + | |
14933 | + /* get the address */ | |
14934 | + address = read_cr2(); | |
14935 | + | |
14936 | + /* Set the "privileged fault" bit to something sane. */ | |
14937 | + error_code &= ~4; | |
14938 | + error_code |= (regs->xcs & 2) << 1; | |
14939 | + if (regs->eflags & X86_EFLAGS_VM) | |
14940 | + error_code |= 4; | |
14941 | + | |
14942 | + tsk = current; | |
14943 | + | |
14944 | + si_code = SEGV_MAPERR; | |
14945 | + | |
14946 | + /* | |
14947 | + * We fault-in kernel-space virtual memory on-demand. The | |
14948 | + * 'reference' page table is init_mm.pgd. | |
14949 | + * | |
14950 | + * NOTE! We MUST NOT take any locks for this case. We may | |
14951 | + * be in an interrupt or a critical region, and should | |
14952 | + * only copy the information from the master page table, | |
14953 | + * nothing more. | |
14954 | + * | |
14955 | + * This verifies that the fault happens in kernel space | |
14956 | + * (error_code & 4) == 0, and that the fault was not a | |
14957 | + * protection error (error_code & 9) == 0. | |
14958 | + */ | |
14959 | + if (unlikely(address >= TASK_SIZE)) { | |
14960 | +#ifdef CONFIG_XEN | |
14961 | + /* Faults in hypervisor area can never be patched up. */ | |
14962 | + if (address >= hypervisor_virt_start) | |
14963 | + goto bad_area_nosemaphore; | |
14964 | +#endif | |
14965 | + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) | |
14966 | + return; | |
14967 | + /* Can take a spurious fault if mapping changes R/O -> R/W. */ | |
14968 | + if (spurious_fault(regs, address, error_code)) | |
14969 | + return; | |
14970 | + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | |
14971 | + SIGSEGV) == NOTIFY_STOP) | |
14972 | + return; | |
14973 | + /* | |
14974 | + * Don't take the mm semaphore here. If we fixup a prefetch | |
14975 | + * fault we could otherwise deadlock. | |
14976 | + */ | |
14977 | + goto bad_area_nosemaphore; | |
14978 | + } | |
14979 | + | |
14980 | + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | |
14981 | + SIGSEGV) == NOTIFY_STOP) | |
14982 | + return; | |
14983 | + | |
14984 | + /* It's safe to allow irq's after cr2 has been saved and the vmalloc | |
14985 | + fault has been handled. */ | |
14986 | + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | |
14987 | + local_irq_enable(); | |
14988 | + | |
14989 | + mm = tsk->mm; | |
14990 | + | |
14991 | + /* | |
14992 | + * If we're in an interrupt, have no user context or are running in an | |
14993 | + * atomic region then we must not take the fault.. | |
14994 | + */ | |
14995 | + if (in_atomic() || !mm) | |
14996 | + goto bad_area_nosemaphore; | |
14997 | + | |
14998 | + /* When running in the kernel we expect faults to occur only to | |
14999 | + * addresses in user space. All other faults represent errors in the | |
15000 | + * kernel and should generate an OOPS. Unfortunatly, in the case of an | |
15001 | + * erroneous fault occurring in a code path which already holds mmap_sem | |
15002 | + * we will deadlock attempting to validate the fault against the | |
15003 | + * address space. Luckily the kernel only validly references user | |
15004 | + * space from well defined areas of code, which are listed in the | |
15005 | + * exceptions table. | |
15006 | + * | |
15007 | + * As the vast majority of faults will be valid we will only perform | |
15008 | + * the source reference check when there is a possibilty of a deadlock. | |
15009 | + * Attempt to lock the address space, if we cannot we then validate the | |
15010 | + * source. If this is invalid we can skip the address space check, | |
15011 | + * thus avoiding the deadlock. | |
15012 | + */ | |
15013 | + if (!down_read_trylock(&mm->mmap_sem)) { | |
15014 | + if ((error_code & 4) == 0 && | |
15015 | + !search_exception_tables(regs->eip)) | |
15016 | + goto bad_area_nosemaphore; | |
15017 | + down_read(&mm->mmap_sem); | |
15018 | + } | |
15019 | + | |
15020 | + vma = find_vma(mm, address); | |
15021 | + if (!vma) | |
15022 | + goto bad_area; | |
15023 | + if (vma->vm_start <= address) | |
15024 | + goto good_area; | |
15025 | + if (!(vma->vm_flags & VM_GROWSDOWN)) | |
15026 | + goto bad_area; | |
15027 | + if (error_code & 4) { | |
15028 | + /* | |
15029 | + * Accessing the stack below %esp is always a bug. | |
15030 | + * The large cushion allows instructions like enter | |
15031 | + * and pusha to work. ("enter $65535,$31" pushes | |
15032 | + * 32 pointers and then decrements %esp by 65535.) | |
15033 | + */ | |
15034 | + if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) | |
15035 | + goto bad_area; | |
15036 | + } | |
15037 | + if (expand_stack(vma, address)) | |
15038 | + goto bad_area; | |
15039 | +/* | |
15040 | + * Ok, we have a good vm_area for this memory access, so | |
15041 | + * we can handle it.. | |
15042 | + */ | |
15043 | +good_area: | |
15044 | + si_code = SEGV_ACCERR; | |
15045 | + write = 0; | |
15046 | + switch (error_code & 3) { | |
15047 | + default: /* 3: write, present */ | |
15048 | +#ifdef TEST_VERIFY_AREA | |
15049 | + if (regs->cs == GET_KERNEL_CS()) | |
15050 | + printk("WP fault at %08lx\n", regs->eip); | |
15051 | +#endif | |
15052 | + /* fall through */ | |
15053 | + case 2: /* write, not present */ | |
15054 | + if (!(vma->vm_flags & VM_WRITE)) | |
15055 | + goto bad_area; | |
15056 | + write++; | |
15057 | + break; | |
15058 | + case 1: /* read, present */ | |
15059 | + goto bad_area; | |
15060 | + case 0: /* read, not present */ | |
15061 | + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | |
15062 | + goto bad_area; | |
15063 | + } | |
15064 | + | |
15065 | + survive: | |
15066 | + /* | |
15067 | + * If for any reason at all we couldn't handle the fault, | |
15068 | + * make sure we exit gracefully rather than endlessly redo | |
15069 | + * the fault. | |
15070 | + */ | |
15071 | + switch (handle_mm_fault(mm, vma, address, write)) { | |
15072 | + case VM_FAULT_MINOR: | |
15073 | + tsk->min_flt++; | |
15074 | + break; | |
15075 | + case VM_FAULT_MAJOR: | |
15076 | + tsk->maj_flt++; | |
15077 | + break; | |
15078 | + case VM_FAULT_SIGBUS: | |
15079 | + goto do_sigbus; | |
15080 | + case VM_FAULT_OOM: | |
15081 | + goto out_of_memory; | |
15082 | + default: | |
15083 | + BUG(); | |
15084 | + } | |
15085 | + | |
15086 | + /* | |
15087 | + * Did it hit the DOS screen memory VA from vm86 mode? | |
15088 | + */ | |
15089 | + if (regs->eflags & VM_MASK) { | |
15090 | + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; | |
15091 | + if (bit < 32) | |
15092 | + tsk->thread.screen_bitmap |= 1 << bit; | |
15093 | + } | |
15094 | + up_read(&mm->mmap_sem); | |
15095 | + return; | |
15096 | + | |
15097 | +/* | |
15098 | + * Something tried to access memory that isn't in our memory map.. | |
15099 | + * Fix it, but check if it's kernel or user first.. | |
15100 | + */ | |
15101 | +bad_area: | |
15102 | + up_read(&mm->mmap_sem); | |
15103 | + | |
15104 | +bad_area_nosemaphore: | |
15105 | + /* User mode accesses just cause a SIGSEGV */ | |
15106 | + if (error_code & 4) { | |
15107 | + /* | |
15108 | + * Valid to do another page fault here because this one came | |
15109 | + * from user space. | |
15110 | + */ | |
15111 | + if (is_prefetch(regs, address, error_code)) | |
15112 | + return; | |
15113 | + | |
15114 | + tsk->thread.cr2 = address; | |
15115 | + /* Kernel addresses are always protection faults */ | |
15116 | + tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
15117 | + tsk->thread.trap_no = 14; | |
15118 | + force_sig_info_fault(SIGSEGV, si_code, address, tsk); | |
15119 | + return; | |
15120 | + } | |
15121 | + | |
15122 | +#ifdef CONFIG_X86_F00F_BUG | |
15123 | + /* | |
15124 | + * Pentium F0 0F C7 C8 bug workaround. | |
15125 | + */ | |
15126 | + if (boot_cpu_data.f00f_bug) { | |
15127 | + unsigned long nr; | |
15128 | + | |
15129 | + nr = (address - idt_descr.address) >> 3; | |
15130 | + | |
15131 | + if (nr == 6) { | |
15132 | + do_invalid_op(regs, 0); | |
15133 | + return; | |
15134 | + } | |
15135 | + } | |
15136 | +#endif | |
15137 | + | |
15138 | +no_context: | |
15139 | + /* Are we prepared to handle this kernel fault? */ | |
15140 | + if (fixup_exception(regs)) | |
15141 | + return; | |
15142 | + | |
15143 | + /* | |
15144 | + * Valid to do another page fault here, because if this fault | |
15145 | + * had been triggered by is_prefetch fixup_exception would have | |
15146 | + * handled it. | |
15147 | + */ | |
15148 | + if (is_prefetch(regs, address, error_code)) | |
15149 | + return; | |
15150 | + | |
15151 | +/* | |
15152 | + * Oops. The kernel tried to access some bad page. We'll have to | |
15153 | + * terminate things with extreme prejudice. | |
15154 | + */ | |
15155 | + | |
15156 | + bust_spinlocks(1); | |
15157 | + | |
15158 | + if (oops_may_print()) { | |
15159 | + #ifdef CONFIG_X86_PAE | |
15160 | + if (error_code & 16) { | |
15161 | + pte_t *pte = lookup_address(address); | |
15162 | + | |
15163 | + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | |
15164 | + printk(KERN_CRIT "kernel tried to execute " | |
15165 | + "NX-protected page - exploit attempt? " | |
15166 | + "(uid: %d)\n", current->uid); | |
15167 | + } | |
15168 | + #endif | |
15169 | + if (address < PAGE_SIZE) | |
15170 | + printk(KERN_ALERT "BUG: unable to handle kernel NULL " | |
15171 | + "pointer dereference"); | |
15172 | + else | |
15173 | + printk(KERN_ALERT "BUG: unable to handle kernel paging" | |
15174 | + " request"); | |
15175 | + printk(" at virtual address %08lx\n",address); | |
15176 | + printk(KERN_ALERT " printing eip:\n"); | |
15177 | + printk("%08lx\n", regs->eip); | |
15178 | + } | |
15179 | + dump_fault_path(address); | |
15180 | + tsk->thread.cr2 = address; | |
15181 | + tsk->thread.trap_no = 14; | |
15182 | + tsk->thread.error_code = error_code; | |
15183 | + die("Oops", regs, error_code); | |
15184 | + bust_spinlocks(0); | |
15185 | + do_exit(SIGKILL); | |
15186 | + | |
15187 | +/* | |
15188 | + * We ran out of memory, or some other thing happened to us that made | |
15189 | + * us unable to handle the page fault gracefully. | |
15190 | + */ | |
15191 | +out_of_memory: | |
15192 | + up_read(&mm->mmap_sem); | |
15193 | + if (tsk->pid == 1) { | |
15194 | + yield(); | |
15195 | + down_read(&mm->mmap_sem); | |
15196 | + goto survive; | |
15197 | + } | |
15198 | + printk("VM: killing process %s\n", tsk->comm); | |
15199 | + if (error_code & 4) | |
15200 | + do_exit(SIGKILL); | |
15201 | + goto no_context; | |
15202 | + | |
15203 | +do_sigbus: | |
15204 | + up_read(&mm->mmap_sem); | |
15205 | + | |
15206 | + /* Kernel mode? Handle exceptions or die */ | |
15207 | + if (!(error_code & 4)) | |
15208 | + goto no_context; | |
15209 | + | |
15210 | + /* User space => ok to do another page fault */ | |
15211 | + if (is_prefetch(regs, address, error_code)) | |
15212 | + return; | |
15213 | + | |
15214 | + tsk->thread.cr2 = address; | |
15215 | + tsk->thread.error_code = error_code; | |
15216 | + tsk->thread.trap_no = 14; | |
15217 | + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | |
15218 | +} | |
15219 | + | |
15220 | +#if !HAVE_SHARED_KERNEL_PMD | |
15221 | +void vmalloc_sync_all(void) | |
15222 | +{ | |
15223 | + /* | |
15224 | + * Note that races in the updates of insync and start aren't | |
15225 | + * problematic: insync can only get set bits added, and updates to | |
15226 | + * start are only improving performance (without affecting correctness | |
15227 | + * if undone). | |
15228 | + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. | |
15229 | + * This change works just fine with 2-level paging too. | |
15230 | + */ | |
15231 | +#define sync_index(a) ((a) >> PMD_SHIFT) | |
15232 | + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); | |
15233 | + static unsigned long start = TASK_SIZE; | |
15234 | + unsigned long address; | |
15235 | + | |
15236 | + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); | |
15237 | + for (address = start; | |
15238 | + address >= TASK_SIZE && address < hypervisor_virt_start; | |
15239 | + address += 1UL << PMD_SHIFT) { | |
15240 | + if (!test_bit(sync_index(address), insync)) { | |
15241 | + unsigned long flags; | |
15242 | + struct page *page; | |
15243 | + | |
15244 | + spin_lock_irqsave(&pgd_lock, flags); | |
15245 | + /* XEN: failure path assumes non-empty pgd_list. */ | |
15246 | + if (unlikely(!pgd_list)) { | |
15247 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
15248 | + return; | |
15249 | + } | |
15250 | + for (page = pgd_list; page; page = | |
15251 | + (struct page *)page->index) | |
15252 | + if (!vmalloc_sync_one(page_address(page), | |
15253 | + address)) { | |
15254 | + BUG_ON(page != pgd_list); | |
15255 | + break; | |
15256 | + } | |
15257 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
15258 | + if (!page) | |
15259 | + set_bit(sync_index(address), insync); | |
15260 | + } | |
15261 | + if (address == start && test_bit(sync_index(address), insync)) | |
15262 | + start = address + (1UL << PMD_SHIFT); | |
15263 | + } | |
15264 | +} | |
15265 | +#endif | |
15266 | Index: head-2008-11-25/arch/x86/mm/highmem_32-xen.c | |
15267 | =================================================================== | |
15268 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
15269 | +++ head-2008-11-25/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
15270 | @@ -0,0 +1,183 @@ | |
15271 | +#include <linux/highmem.h> | |
15272 | +#include <linux/module.h> | |
15273 | + | |
15274 | +void *kmap(struct page *page) | |
15275 | +{ | |
15276 | + might_sleep(); | |
15277 | + if (!PageHighMem(page)) | |
15278 | + return page_address(page); | |
15279 | + return kmap_high(page); | |
15280 | +} | |
15281 | + | |
15282 | +void kunmap(struct page *page) | |
15283 | +{ | |
15284 | + if (in_interrupt()) | |
15285 | + BUG(); | |
15286 | + if (!PageHighMem(page)) | |
15287 | + return; | |
15288 | + kunmap_high(page); | |
15289 | +} | |
15290 | + | |
15291 | +/* | |
15292 | + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | |
15293 | + * no global lock is needed and because the kmap code must perform a global TLB | |
15294 | + * invalidation when the kmap pool wraps. | |
15295 | + * | |
15296 | + * However when holding an atomic kmap is is not legal to sleep, so atomic | |
15297 | + * kmaps are appropriate for short, tight code paths only. | |
15298 | + */ | |
15299 | +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) | |
15300 | +{ | |
15301 | + enum fixed_addresses idx; | |
15302 | + unsigned long vaddr; | |
15303 | + | |
15304 | + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ | |
15305 | + inc_preempt_count(); | |
15306 | + if (!PageHighMem(page)) | |
15307 | + return page_address(page); | |
15308 | + | |
15309 | + idx = type + KM_TYPE_NR*smp_processor_id(); | |
15310 | + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | |
15311 | +#ifdef CONFIG_DEBUG_HIGHMEM | |
15312 | + if (!pte_none(*(kmap_pte-idx))) | |
15313 | + BUG(); | |
15314 | +#endif | |
15315 | + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); | |
15316 | + | |
15317 | + return (void*) vaddr; | |
15318 | +} | |
15319 | + | |
15320 | +void *kmap_atomic(struct page *page, enum km_type type) | |
15321 | +{ | |
15322 | + return __kmap_atomic(page, type, kmap_prot); | |
15323 | +} | |
15324 | + | |
15325 | +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ | |
15326 | +void *kmap_atomic_pte(struct page *page, enum km_type type) | |
15327 | +{ | |
15328 | + return __kmap_atomic(page, type, | |
15329 | + test_bit(PG_pinned, &page->flags) | |
15330 | + ? PAGE_KERNEL_RO : kmap_prot); | |
15331 | +} | |
15332 | + | |
15333 | +void kunmap_atomic(void *kvaddr, enum km_type type) | |
15334 | +{ | |
15335 | +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN) | |
15336 | + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | |
15337 | + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | |
15338 | + | |
15339 | + if (vaddr < FIXADDR_START) { // FIXME | |
15340 | + dec_preempt_count(); | |
15341 | + preempt_check_resched(); | |
15342 | + return; | |
15343 | + } | |
15344 | +#endif | |
15345 | + | |
15346 | +#if defined(CONFIG_DEBUG_HIGHMEM) | |
15347 | + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) | |
15348 | + BUG(); | |
15349 | + | |
15350 | + /* | |
15351 | + * force other mappings to Oops if they'll try to access | |
15352 | + * this pte without first remapping it | |
15353 | + */ | |
15354 | + pte_clear(&init_mm, vaddr, kmap_pte-idx); | |
15355 | + __flush_tlb_one(vaddr); | |
15356 | +#elif defined(CONFIG_XEN) | |
15357 | + /* | |
15358 | + * We must ensure there are no dangling pagetable references when | |
15359 | + * returning memory to Xen (decrease_reservation). | |
15360 | + * XXX TODO: We could make this faster by only zapping when | |
15361 | + * kmap_flush_unused is called but that is trickier and more invasive. | |
15362 | + */ | |
15363 | + pte_clear(&init_mm, vaddr, kmap_pte-idx); | |
15364 | +#endif | |
15365 | + | |
15366 | + dec_preempt_count(); | |
15367 | + preempt_check_resched(); | |
15368 | +} | |
15369 | + | |
15370 | +/* This is the same as kmap_atomic() but can map memory that doesn't | |
15371 | + * have a struct page associated with it. | |
15372 | + */ | |
15373 | +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | |
15374 | +{ | |
15375 | + enum fixed_addresses idx; | |
15376 | + unsigned long vaddr; | |
15377 | + | |
15378 | + inc_preempt_count(); | |
15379 | + | |
15380 | + idx = type + KM_TYPE_NR*smp_processor_id(); | |
15381 | + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | |
15382 | + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); | |
15383 | + __flush_tlb_one(vaddr); | |
15384 | + | |
15385 | + return (void*) vaddr; | |
15386 | +} | |
15387 | + | |
15388 | +struct page *kmap_atomic_to_page(void *ptr) | |
15389 | +{ | |
15390 | + unsigned long idx, vaddr = (unsigned long)ptr; | |
15391 | + pte_t *pte; | |
15392 | + | |
15393 | + if (vaddr < FIXADDR_START) | |
15394 | + return virt_to_page(ptr); | |
15395 | + | |
15396 | + idx = virt_to_fix(vaddr); | |
15397 | + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); | |
15398 | + return pte_page(*pte); | |
15399 | +} | |
15400 | + | |
15401 | +void clear_highpage(struct page *page) | |
15402 | +{ | |
15403 | + void *kaddr; | |
15404 | + | |
15405 | + if (likely(xen_feature(XENFEAT_highmem_assist)) | |
15406 | + && PageHighMem(page)) { | |
15407 | + struct mmuext_op meo; | |
15408 | + | |
15409 | + meo.cmd = MMUEXT_CLEAR_PAGE; | |
15410 | + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page)); | |
15411 | + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) | |
15412 | + return; | |
15413 | + } | |
15414 | + | |
15415 | + kaddr = kmap_atomic(page, KM_USER0); | |
15416 | + clear_page(kaddr); | |
15417 | + kunmap_atomic(kaddr, KM_USER0); | |
15418 | +} | |
15419 | + | |
15420 | +void copy_highpage(struct page *to, struct page *from) | |
15421 | +{ | |
15422 | + void *vfrom, *vto; | |
15423 | + | |
15424 | + if (likely(xen_feature(XENFEAT_highmem_assist)) | |
15425 | + && (PageHighMem(from) || PageHighMem(to))) { | |
15426 | + unsigned long from_pfn = page_to_pfn(from); | |
15427 | + unsigned long to_pfn = page_to_pfn(to); | |
15428 | + struct mmuext_op meo; | |
15429 | + | |
15430 | + meo.cmd = MMUEXT_COPY_PAGE; | |
15431 | + meo.arg1.mfn = pfn_to_mfn(to_pfn); | |
15432 | + meo.arg2.src_mfn = pfn_to_mfn(from_pfn); | |
15433 | + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn | |
15434 | + && mfn_to_pfn(meo.arg1.mfn) == to_pfn | |
15435 | + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) | |
15436 | + return; | |
15437 | + } | |
15438 | + | |
15439 | + vfrom = kmap_atomic(from, KM_USER0); | |
15440 | + vto = kmap_atomic(to, KM_USER1); | |
15441 | + copy_page(vto, vfrom); | |
15442 | + kunmap_atomic(vfrom, KM_USER0); | |
15443 | + kunmap_atomic(vto, KM_USER1); | |
15444 | +} | |
15445 | + | |
15446 | +EXPORT_SYMBOL(kmap); | |
15447 | +EXPORT_SYMBOL(kunmap); | |
15448 | +EXPORT_SYMBOL(kmap_atomic); | |
15449 | +EXPORT_SYMBOL(kmap_atomic_pte); | |
15450 | +EXPORT_SYMBOL(kunmap_atomic); | |
15451 | +EXPORT_SYMBOL(kmap_atomic_to_page); | |
15452 | +EXPORT_SYMBOL(clear_highpage); | |
15453 | +EXPORT_SYMBOL(copy_highpage); | |
15454 | Index: head-2008-11-25/arch/x86/mm/hypervisor.c | |
15455 | =================================================================== | |
15456 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
15457 | +++ head-2008-11-25/arch/x86/mm/hypervisor.c 2008-10-29 09:55:56.000000000 +0100 | |
15458 | @@ -0,0 +1,547 @@ | |
15459 | +/****************************************************************************** | |
15460 | + * mm/hypervisor.c | |
15461 | + * | |
15462 | + * Update page tables via the hypervisor. | |
15463 | + * | |
15464 | + * Copyright (c) 2002-2004, K A Fraser | |
15465 | + * | |
15466 | + * This program is free software; you can redistribute it and/or | |
15467 | + * modify it under the terms of the GNU General Public License version 2 | |
15468 | + * as published by the Free Software Foundation; or, when distributed | |
15469 | + * separately from the Linux kernel or incorporated into other | |
15470 | + * software packages, subject to the following license: | |
15471 | + * | |
15472 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
15473 | + * of this source file (the "Software"), to deal in the Software without | |
15474 | + * restriction, including without limitation the rights to use, copy, modify, | |
15475 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
15476 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
15477 | + * the following conditions: | |
15478 | + * | |
15479 | + * The above copyright notice and this permission notice shall be included in | |
15480 | + * all copies or substantial portions of the Software. | |
15481 | + * | |
15482 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15483 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15484 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
15485 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
15486 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
15487 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
15488 | + * IN THE SOFTWARE. | |
15489 | + */ | |
15490 | + | |
15491 | +#include <linux/sched.h> | |
15492 | +#include <linux/mm.h> | |
15493 | +#include <linux/vmalloc.h> | |
15494 | +#include <asm/page.h> | |
15495 | +#include <asm/pgtable.h> | |
15496 | +#include <asm/hypervisor.h> | |
15497 | +#include <xen/balloon.h> | |
15498 | +#include <xen/features.h> | |
15499 | +#include <xen/interface/memory.h> | |
15500 | +#include <linux/module.h> | |
15501 | +#include <linux/percpu.h> | |
15502 | +#include <asm/tlbflush.h> | |
15503 | +#include <linux/highmem.h> | |
15504 | + | |
15505 | +void xen_l1_entry_update(pte_t *ptr, pte_t val) | |
15506 | +{ | |
15507 | + mmu_update_t u; | |
15508 | +#ifdef CONFIG_HIGHPTE | |
15509 | + u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ? | |
15510 | + arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr); | |
15511 | +#else | |
15512 | + u.ptr = virt_to_machine(ptr); | |
15513 | +#endif | |
15514 | + u.val = __pte_val(val); | |
15515 | + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); | |
15516 | +} | |
15517 | +EXPORT_SYMBOL_GPL(xen_l1_entry_update); | |
15518 | + | |
15519 | +void xen_l2_entry_update(pmd_t *ptr, pmd_t val) | |
15520 | +{ | |
15521 | + mmu_update_t u; | |
15522 | + u.ptr = virt_to_machine(ptr); | |
15523 | + u.val = __pmd_val(val); | |
15524 | + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); | |
15525 | +} | |
15526 | + | |
15527 | +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) | |
15528 | +void xen_l3_entry_update(pud_t *ptr, pud_t val) | |
15529 | +{ | |
15530 | + mmu_update_t u; | |
15531 | + u.ptr = virt_to_machine(ptr); | |
15532 | + u.val = __pud_val(val); | |
15533 | + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); | |
15534 | +} | |
15535 | +#endif | |
15536 | + | |
15537 | +#ifdef CONFIG_X86_64 | |
15538 | +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) | |
15539 | +{ | |
15540 | + mmu_update_t u; | |
15541 | + u.ptr = virt_to_machine(ptr); | |
15542 | + u.val = __pgd_val(val); | |
15543 | + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); | |
15544 | +} | |
15545 | +#endif /* CONFIG_X86_64 */ | |
15546 | + | |
15547 | +void xen_pt_switch(unsigned long ptr) | |
15548 | +{ | |
15549 | + struct mmuext_op op; | |
15550 | + op.cmd = MMUEXT_NEW_BASEPTR; | |
15551 | + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); | |
15552 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15553 | +} | |
15554 | + | |
15555 | +void xen_new_user_pt(unsigned long ptr) | |
15556 | +{ | |
15557 | + struct mmuext_op op; | |
15558 | + op.cmd = MMUEXT_NEW_USER_BASEPTR; | |
15559 | + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); | |
15560 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15561 | +} | |
15562 | + | |
15563 | +void xen_tlb_flush(void) | |
15564 | +{ | |
15565 | + struct mmuext_op op; | |
15566 | + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; | |
15567 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15568 | +} | |
15569 | +EXPORT_SYMBOL(xen_tlb_flush); | |
15570 | + | |
15571 | +void xen_invlpg(unsigned long ptr) | |
15572 | +{ | |
15573 | + struct mmuext_op op; | |
15574 | + op.cmd = MMUEXT_INVLPG_LOCAL; | |
15575 | + op.arg1.linear_addr = ptr & PAGE_MASK; | |
15576 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15577 | +} | |
15578 | +EXPORT_SYMBOL(xen_invlpg); | |
15579 | + | |
15580 | +#ifdef CONFIG_SMP | |
15581 | + | |
15582 | +void xen_tlb_flush_all(void) | |
15583 | +{ | |
15584 | + struct mmuext_op op; | |
15585 | + op.cmd = MMUEXT_TLB_FLUSH_ALL; | |
15586 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15587 | +} | |
15588 | + | |
15589 | +void xen_tlb_flush_mask(cpumask_t *mask) | |
15590 | +{ | |
15591 | + struct mmuext_op op; | |
15592 | + if ( cpus_empty(*mask) ) | |
15593 | + return; | |
15594 | + op.cmd = MMUEXT_TLB_FLUSH_MULTI; | |
15595 | + set_xen_guest_handle(op.arg2.vcpumask, mask->bits); | |
15596 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15597 | +} | |
15598 | + | |
15599 | +void xen_invlpg_all(unsigned long ptr) | |
15600 | +{ | |
15601 | + struct mmuext_op op; | |
15602 | + op.cmd = MMUEXT_INVLPG_ALL; | |
15603 | + op.arg1.linear_addr = ptr & PAGE_MASK; | |
15604 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15605 | +} | |
15606 | + | |
15607 | +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) | |
15608 | +{ | |
15609 | + struct mmuext_op op; | |
15610 | + if ( cpus_empty(*mask) ) | |
15611 | + return; | |
15612 | + op.cmd = MMUEXT_INVLPG_MULTI; | |
15613 | + op.arg1.linear_addr = ptr & PAGE_MASK; | |
15614 | + set_xen_guest_handle(op.arg2.vcpumask, mask->bits); | |
15615 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15616 | +} | |
15617 | + | |
15618 | +#endif /* CONFIG_SMP */ | |
15619 | + | |
15620 | +void xen_pgd_pin(unsigned long ptr) | |
15621 | +{ | |
15622 | + struct mmuext_op op; | |
15623 | +#ifdef CONFIG_X86_64 | |
15624 | + op.cmd = MMUEXT_PIN_L4_TABLE; | |
15625 | +#elif defined(CONFIG_X86_PAE) | |
15626 | + op.cmd = MMUEXT_PIN_L3_TABLE; | |
15627 | +#else | |
15628 | + op.cmd = MMUEXT_PIN_L2_TABLE; | |
15629 | +#endif | |
15630 | + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); | |
15631 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15632 | +} | |
15633 | + | |
15634 | +void xen_pgd_unpin(unsigned long ptr) | |
15635 | +{ | |
15636 | + struct mmuext_op op; | |
15637 | + op.cmd = MMUEXT_UNPIN_TABLE; | |
15638 | + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); | |
15639 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15640 | +} | |
15641 | + | |
15642 | +void xen_set_ldt(const void *ptr, unsigned int ents) | |
15643 | +{ | |
15644 | + struct mmuext_op op; | |
15645 | + op.cmd = MMUEXT_SET_LDT; | |
15646 | + op.arg1.linear_addr = (unsigned long)ptr; | |
15647 | + op.arg2.nr_ents = ents; | |
15648 | + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); | |
15649 | +} | |
15650 | + | |
15651 | +/* Protected by balloon_lock. */ | |
15652 | +#define MAX_CONTIG_ORDER 9 /* 2MB */ | |
15653 | +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; | |
15654 | +static unsigned long limited_frames[1<<MAX_CONTIG_ORDER]; | |
15655 | +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER]; | |
15656 | + | |
15657 | +/* Ensure multi-page extents are contiguous in machine memory. */ | |
15658 | +int xen_create_contiguous_region( | |
15659 | + unsigned long vstart, unsigned int order, unsigned int address_bits) | |
15660 | +{ | |
15661 | + unsigned long *in_frames = discontig_frames, out_frame; | |
15662 | + unsigned long frame, flags; | |
15663 | + unsigned int i; | |
15664 | + int rc, success; | |
15665 | + struct xen_memory_exchange exchange = { | |
15666 | + .in = { | |
15667 | + .nr_extents = 1UL << order, | |
15668 | + .extent_order = 0, | |
15669 | + .domid = DOMID_SELF | |
15670 | + }, | |
15671 | + .out = { | |
15672 | + .nr_extents = 1, | |
15673 | + .extent_order = order, | |
15674 | + .address_bits = address_bits, | |
15675 | + .domid = DOMID_SELF | |
15676 | + } | |
15677 | + }; | |
15678 | + | |
15679 | + /* | |
15680 | + * Currently an auto-translated guest will not perform I/O, nor will | |
15681 | + * it require PAE page directories below 4GB. Therefore any calls to | |
15682 | + * this function are redundant and can be ignored. | |
15683 | + */ | |
15684 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
15685 | + return 0; | |
15686 | + | |
15687 | + if (unlikely(order > MAX_CONTIG_ORDER)) | |
15688 | + return -ENOMEM; | |
15689 | + | |
15690 | + set_xen_guest_handle(exchange.in.extent_start, in_frames); | |
15691 | + set_xen_guest_handle(exchange.out.extent_start, &out_frame); | |
15692 | + | |
15693 | + scrub_pages((void *)vstart, 1 << order); | |
15694 | + | |
15695 | + balloon_lock(flags); | |
15696 | + | |
15697 | + /* 1. Zap current PTEs, remembering MFNs. */ | |
15698 | + for (i = 0; i < (1U<<order); i++) { | |
15699 | + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i); | |
15700 | + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), | |
15701 | + __pte_ma(0), 0); | |
15702 | + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, | |
15703 | + INVALID_P2M_ENTRY); | |
15704 | + } | |
15705 | + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) | |
15706 | + BUG(); | |
15707 | + | |
15708 | + /* 2. Get a new contiguous memory extent. */ | |
15709 | + out_frame = __pa(vstart) >> PAGE_SHIFT; | |
15710 | + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); | |
15711 | + success = (exchange.nr_exchanged == (1UL << order)); | |
15712 | + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); | |
15713 | + BUG_ON(success && (rc != 0)); | |
15714 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
15715 | + if (unlikely(rc == -ENOSYS)) { | |
15716 | + /* Compatibility when XENMEM_exchange is unsupported. */ | |
15717 | + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
15718 | + &exchange.in) != (1UL << order)) | |
15719 | + BUG(); | |
15720 | + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap, | |
15721 | + &exchange.out) == 1); | |
15722 | + if (!success) { | |
15723 | + /* Couldn't get special memory: fall back to normal. */ | |
15724 | + for (i = 0; i < (1U<<order); i++) | |
15725 | + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i; | |
15726 | + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, | |
15727 | + &exchange.in) != (1UL<<order)) | |
15728 | + BUG(); | |
15729 | + } | |
15730 | + } | |
15731 | +#endif | |
15732 | + | |
15733 | + /* 3. Map the new extent in place of old pages. */ | |
15734 | + for (i = 0; i < (1U<<order); i++) { | |
15735 | + frame = success ? (out_frame + i) : in_frames[i]; | |
15736 | + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), | |
15737 | + pfn_pte_ma(frame, PAGE_KERNEL), 0); | |
15738 | + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); | |
15739 | + } | |
15740 | + | |
15741 | + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order | |
15742 | + ? UVMF_TLB_FLUSH|UVMF_ALL | |
15743 | + : UVMF_INVLPG|UVMF_ALL; | |
15744 | + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) | |
15745 | + BUG(); | |
15746 | + | |
15747 | + balloon_unlock(flags); | |
15748 | + | |
15749 | + return success ? 0 : -ENOMEM; | |
15750 | +} | |
15751 | +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); | |
15752 | + | |
15753 | +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) | |
15754 | +{ | |
15755 | + unsigned long *out_frames = discontig_frames, in_frame; | |
15756 | + unsigned long frame, flags; | |
15757 | + unsigned int i; | |
15758 | + int rc, success; | |
15759 | + struct xen_memory_exchange exchange = { | |
15760 | + .in = { | |
15761 | + .nr_extents = 1, | |
15762 | + .extent_order = order, | |
15763 | + .domid = DOMID_SELF | |
15764 | + }, | |
15765 | + .out = { | |
15766 | + .nr_extents = 1UL << order, | |
15767 | + .extent_order = 0, | |
15768 | + .domid = DOMID_SELF | |
15769 | + } | |
15770 | + }; | |
15771 | + | |
15772 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
15773 | + return; | |
15774 | + | |
15775 | + if (unlikely(order > MAX_CONTIG_ORDER)) | |
15776 | + return; | |
15777 | + | |
15778 | + set_xen_guest_handle(exchange.in.extent_start, &in_frame); | |
15779 | + set_xen_guest_handle(exchange.out.extent_start, out_frames); | |
15780 | + | |
15781 | + scrub_pages((void *)vstart, 1 << order); | |
15782 | + | |
15783 | + balloon_lock(flags); | |
15784 | + | |
15785 | + /* 1. Find start MFN of contiguous extent. */ | |
15786 | + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT); | |
15787 | + | |
15788 | + /* 2. Zap current PTEs. */ | |
15789 | + for (i = 0; i < (1U<<order); i++) { | |
15790 | + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), | |
15791 | + __pte_ma(0), 0); | |
15792 | + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, | |
15793 | + INVALID_P2M_ENTRY); | |
15794 | + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i; | |
15795 | + } | |
15796 | + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) | |
15797 | + BUG(); | |
15798 | + | |
15799 | + /* 3. Do the exchange for non-contiguous MFNs. */ | |
15800 | + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); | |
15801 | + success = (exchange.nr_exchanged == 1); | |
15802 | + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); | |
15803 | + BUG_ON(success && (rc != 0)); | |
15804 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
15805 | + if (unlikely(rc == -ENOSYS)) { | |
15806 | + /* Compatibility when XENMEM_exchange is unsupported. */ | |
15807 | + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
15808 | + &exchange.in) != 1) | |
15809 | + BUG(); | |
15810 | + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, | |
15811 | + &exchange.out) != (1UL << order)) | |
15812 | + BUG(); | |
15813 | + success = 1; | |
15814 | + } | |
15815 | +#endif | |
15816 | + | |
15817 | + /* 4. Map new pages in place of old pages. */ | |
15818 | + for (i = 0; i < (1U<<order); i++) { | |
15819 | + frame = success ? out_frames[i] : (in_frame + i); | |
15820 | + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), | |
15821 | + pfn_pte_ma(frame, PAGE_KERNEL), 0); | |
15822 | + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); | |
15823 | + } | |
15824 | + | |
15825 | + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order | |
15826 | + ? UVMF_TLB_FLUSH|UVMF_ALL | |
15827 | + : UVMF_INVLPG|UVMF_ALL; | |
15828 | + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) | |
15829 | + BUG(); | |
15830 | + | |
15831 | + balloon_unlock(flags); | |
15832 | +} | |
15833 | +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); | |
15834 | + | |
15835 | +int xen_limit_pages_to_max_mfn( | |
15836 | + struct page *pages, unsigned int order, unsigned int address_bits) | |
15837 | +{ | |
15838 | + unsigned long flags, frame; | |
15839 | + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames; | |
15840 | + struct page *page; | |
15841 | + unsigned int i, n, nr_mcl; | |
15842 | + int rc, success; | |
15843 | + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER); | |
15844 | + | |
15845 | + struct xen_memory_exchange exchange = { | |
15846 | + .in = { | |
15847 | + .extent_order = 0, | |
15848 | + .domid = DOMID_SELF | |
15849 | + }, | |
15850 | + .out = { | |
15851 | + .extent_order = 0, | |
15852 | + .address_bits = address_bits, | |
15853 | + .domid = DOMID_SELF | |
15854 | + } | |
15855 | + }; | |
15856 | + | |
15857 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
15858 | + return 0; | |
15859 | + | |
15860 | + if (unlikely(order > MAX_CONTIG_ORDER)) | |
15861 | + return -ENOMEM; | |
15862 | + | |
15863 | + bitmap_zero(limit_map, 1U << order); | |
15864 | + set_xen_guest_handle(exchange.in.extent_start, in_frames); | |
15865 | + set_xen_guest_handle(exchange.out.extent_start, out_frames); | |
15866 | + | |
15867 | + /* 0. Scrub the pages. */ | |
15868 | + for (i = 0, n = 0; i < 1U<<order ; i++) { | |
15869 | + page = &pages[i]; | |
15870 | + if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT))) | |
15871 | + continue; | |
15872 | + __set_bit(i, limit_map); | |
15873 | + | |
15874 | + if (!PageHighMem(page)) | |
15875 | + scrub_pages(page_address(page), 1); | |
15876 | +#ifdef CONFIG_XEN_SCRUB_PAGES | |
15877 | + else { | |
15878 | + scrub_pages(kmap(page), 1); | |
15879 | + kunmap(page); | |
15880 | + ++n; | |
15881 | + } | |
15882 | +#endif | |
15883 | + } | |
15884 | + if (bitmap_empty(limit_map, 1U << order)) | |
15885 | + return 0; | |
15886 | + | |
15887 | + if (n) | |
15888 | + kmap_flush_unused(); | |
15889 | + | |
15890 | + balloon_lock(flags); | |
15891 | + | |
15892 | + /* 1. Zap current PTEs (if any), remembering MFNs. */ | |
15893 | + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) { | |
15894 | + if(!test_bit(i, limit_map)) | |
15895 | + continue; | |
15896 | + page = &pages[i]; | |
15897 | + | |
15898 | + out_frames[n] = page_to_pfn(page); | |
15899 | + in_frames[n] = pfn_to_mfn(out_frames[n]); | |
15900 | + | |
15901 | + if (!PageHighMem(page)) | |
15902 | + MULTI_update_va_mapping(cr_mcl + nr_mcl++, | |
15903 | + (unsigned long)page_address(page), | |
15904 | + __pte_ma(0), 0); | |
15905 | + | |
15906 | + set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY); | |
15907 | + ++n; | |
15908 | + } | |
15909 | + if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL)) | |
15910 | + BUG(); | |
15911 | + | |
15912 | + /* 2. Get new memory below the required limit. */ | |
15913 | + exchange.in.nr_extents = n; | |
15914 | + exchange.out.nr_extents = n; | |
15915 | + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); | |
15916 | + success = (exchange.nr_exchanged == n); | |
15917 | + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); | |
15918 | + BUG_ON(success && (rc != 0)); | |
15919 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
15920 | + if (unlikely(rc == -ENOSYS)) { | |
15921 | + /* Compatibility when XENMEM_exchange is unsupported. */ | |
15922 | + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
15923 | + &exchange.in) != n) | |
15924 | + BUG(); | |
15925 | + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, | |
15926 | + &exchange.out) != n) | |
15927 | + BUG(); | |
15928 | + success = 1; | |
15929 | + } | |
15930 | +#endif | |
15931 | + | |
15932 | + /* 3. Map the new pages in place of old pages. */ | |
15933 | + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) { | |
15934 | + if(!test_bit(i, limit_map)) | |
15935 | + continue; | |
15936 | + page = &pages[i]; | |
15937 | + | |
15938 | + frame = success ? out_frames[n] : in_frames[n]; | |
15939 | + | |
15940 | + if (!PageHighMem(page)) | |
15941 | + MULTI_update_va_mapping(cr_mcl + nr_mcl++, | |
15942 | + (unsigned long)page_address(page), | |
15943 | + pfn_pte_ma(frame, PAGE_KERNEL), 0); | |
15944 | + | |
15945 | + set_phys_to_machine(page_to_pfn(page), frame); | |
15946 | + ++n; | |
15947 | + } | |
15948 | + if (nr_mcl) { | |
15949 | + cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order | |
15950 | + ? UVMF_TLB_FLUSH|UVMF_ALL | |
15951 | + : UVMF_INVLPG|UVMF_ALL; | |
15952 | + if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL)) | |
15953 | + BUG(); | |
15954 | + } | |
15955 | + | |
15956 | + balloon_unlock(flags); | |
15957 | + | |
15958 | + return success ? 0 : -ENOMEM; | |
15959 | +} | |
15960 | +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn); | |
15961 | + | |
15962 | +#ifdef __i386__ | |
15963 | +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) | |
15964 | +{ | |
15965 | + __u32 *lp = (__u32 *)((char *)ldt + entry * 8); | |
15966 | + maddr_t mach_lp = arbitrary_virt_to_machine(lp); | |
15967 | + return HYPERVISOR_update_descriptor( | |
15968 | + mach_lp, (u64)entry_a | ((u64)entry_b<<32)); | |
15969 | +} | |
15970 | +#endif | |
15971 | + | |
15972 | +#define MAX_BATCHED_FULL_PTES 32 | |
15973 | + | |
15974 | +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
15975 | + unsigned long addr, unsigned long end, pgprot_t newprot) | |
15976 | +{ | |
15977 | + int rc = 0, i = 0; | |
15978 | + mmu_update_t u[MAX_BATCHED_FULL_PTES]; | |
15979 | + pte_t *pte; | |
15980 | + spinlock_t *ptl; | |
15981 | + | |
15982 | + if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) | |
15983 | + return 0; | |
15984 | + | |
15985 | + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
15986 | + do { | |
15987 | + if (pte_present(*pte)) { | |
15988 | + u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK) | |
15989 | + | ((unsigned long)pte & ~PAGE_MASK) | |
15990 | + | MMU_PT_UPDATE_PRESERVE_AD; | |
15991 | + u[i].val = __pte_val(pte_modify(*pte, newprot)); | |
15992 | + if (++i == MAX_BATCHED_FULL_PTES) { | |
15993 | + if ((rc = HYPERVISOR_mmu_update( | |
15994 | + &u[0], i, NULL, DOMID_SELF)) != 0) | |
15995 | + break; | |
15996 | + i = 0; | |
15997 | + } | |
15998 | + } | |
15999 | + } while (pte++, addr += PAGE_SIZE, addr != end); | |
16000 | + if (i) | |
16001 | + rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF); | |
16002 | + pte_unmap_unlock(pte - 1, ptl); | |
16003 | + BUG_ON(rc && rc != -ENOSYS); | |
16004 | + return !rc; | |
16005 | +} | |
16006 | Index: head-2008-11-25/arch/x86/mm/init_32-xen.c | |
16007 | =================================================================== | |
16008 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
16009 | +++ head-2008-11-25/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
16010 | @@ -0,0 +1,840 @@ | |
16011 | +/* | |
16012 | + * linux/arch/i386/mm/init.c | |
16013 | + * | |
16014 | + * Copyright (C) 1995 Linus Torvalds | |
16015 | + * | |
16016 | + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | |
16017 | + */ | |
16018 | + | |
16019 | +#include <linux/module.h> | |
16020 | +#include <linux/signal.h> | |
16021 | +#include <linux/sched.h> | |
16022 | +#include <linux/kernel.h> | |
16023 | +#include <linux/errno.h> | |
16024 | +#include <linux/string.h> | |
16025 | +#include <linux/types.h> | |
16026 | +#include <linux/ptrace.h> | |
16027 | +#include <linux/mman.h> | |
16028 | +#include <linux/mm.h> | |
16029 | +#include <linux/hugetlb.h> | |
16030 | +#include <linux/swap.h> | |
16031 | +#include <linux/smp.h> | |
16032 | +#include <linux/init.h> | |
16033 | +#include <linux/highmem.h> | |
16034 | +#include <linux/pagemap.h> | |
16035 | +#include <linux/poison.h> | |
16036 | +#include <linux/bootmem.h> | |
16037 | +#include <linux/slab.h> | |
16038 | +#include <linux/proc_fs.h> | |
16039 | +#include <linux/efi.h> | |
16040 | +#include <linux/memory_hotplug.h> | |
16041 | +#include <linux/initrd.h> | |
16042 | +#include <linux/cpumask.h> | |
16043 | +#include <linux/dma-mapping.h> | |
16044 | +#include <linux/scatterlist.h> | |
16045 | + | |
16046 | +#include <asm/processor.h> | |
16047 | +#include <asm/system.h> | |
16048 | +#include <asm/uaccess.h> | |
16049 | +#include <asm/pgtable.h> | |
16050 | +#include <asm/dma.h> | |
16051 | +#include <asm/fixmap.h> | |
16052 | +#include <asm/e820.h> | |
16053 | +#include <asm/apic.h> | |
16054 | +#include <asm/tlb.h> | |
16055 | +#include <asm/tlbflush.h> | |
16056 | +#include <asm/sections.h> | |
16057 | +#include <asm/hypervisor.h> | |
16058 | +#include <asm/swiotlb.h> | |
16059 | + | |
16060 | +unsigned int __VMALLOC_RESERVE = 128 << 20; | |
16061 | + | |
16062 | +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | |
16063 | +unsigned long highstart_pfn, highend_pfn; | |
16064 | + | |
16065 | +static int noinline do_test_wp_bit(void); | |
16066 | + | |
16067 | +/* | |
16068 | + * Creates a middle page table and puts a pointer to it in the | |
16069 | + * given global directory entry. This only returns the gd entry | |
16070 | + * in non-PAE compilation mode, since the middle layer is folded. | |
16071 | + */ | |
16072 | +static pmd_t * __init one_md_table_init(pgd_t *pgd) | |
16073 | +{ | |
16074 | + pud_t *pud; | |
16075 | + pmd_t *pmd_table; | |
16076 | + | |
16077 | +#ifdef CONFIG_X86_PAE | |
16078 | + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | |
16079 | + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); | |
16080 | + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | |
16081 | + pud = pud_offset(pgd, 0); | |
16082 | + if (pmd_table != pmd_offset(pud, 0)) | |
16083 | + BUG(); | |
16084 | +#else | |
16085 | + pud = pud_offset(pgd, 0); | |
16086 | + pmd_table = pmd_offset(pud, 0); | |
16087 | +#endif | |
16088 | + | |
16089 | + return pmd_table; | |
16090 | +} | |
16091 | + | |
16092 | +/* | |
16093 | + * Create a page table and place a pointer to it in a middle page | |
16094 | + * directory entry. | |
16095 | + */ | |
16096 | +static pte_t * __init one_page_table_init(pmd_t *pmd) | |
16097 | +{ | |
16098 | + if (pmd_none(*pmd)) { | |
16099 | + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); | |
16100 | + make_lowmem_page_readonly(page_table, | |
16101 | + XENFEAT_writable_page_tables); | |
16102 | + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); | |
16103 | + if (page_table != pte_offset_kernel(pmd, 0)) | |
16104 | + BUG(); | |
16105 | + | |
16106 | + return page_table; | |
16107 | + } | |
16108 | + | |
16109 | + return pte_offset_kernel(pmd, 0); | |
16110 | +} | |
16111 | + | |
16112 | +/* | |
16113 | + * This function initializes a certain range of kernel virtual memory | |
16114 | + * with new bootmem page tables, everywhere page tables are missing in | |
16115 | + * the given range. | |
16116 | + */ | |
16117 | + | |
16118 | +/* | |
16119 | + * NOTE: The pagetables are allocated contiguous on the physical space | |
16120 | + * so we can cache the place of the first one and move around without | |
16121 | + * checking the pgd every time. | |
16122 | + */ | |
16123 | +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) | |
16124 | +{ | |
16125 | + pgd_t *pgd; | |
16126 | + pud_t *pud; | |
16127 | + pmd_t *pmd; | |
16128 | + int pgd_idx, pmd_idx; | |
16129 | + unsigned long vaddr; | |
16130 | + | |
16131 | + vaddr = start; | |
16132 | + pgd_idx = pgd_index(vaddr); | |
16133 | + pmd_idx = pmd_index(vaddr); | |
16134 | + pgd = pgd_base + pgd_idx; | |
16135 | + | |
16136 | + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { | |
16137 | + if (pgd_none(*pgd)) | |
16138 | + one_md_table_init(pgd); | |
16139 | + pud = pud_offset(pgd, vaddr); | |
16140 | + pmd = pmd_offset(pud, vaddr); | |
16141 | + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { | |
16142 | + if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) | |
16143 | + one_page_table_init(pmd); | |
16144 | + | |
16145 | + vaddr += PMD_SIZE; | |
16146 | + } | |
16147 | + pmd_idx = 0; | |
16148 | + } | |
16149 | +} | |
16150 | + | |
16151 | +static inline int is_kernel_text(unsigned long addr) | |
16152 | +{ | |
16153 | + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) | |
16154 | + return 1; | |
16155 | + return 0; | |
16156 | +} | |
16157 | + | |
16158 | +/* | |
16159 | + * This maps the physical memory to kernel virtual address space, a total | |
16160 | + * of max_low_pfn pages, by creating page tables starting from address | |
16161 | + * PAGE_OFFSET. | |
16162 | + */ | |
16163 | +static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | |
16164 | +{ | |
16165 | + unsigned long pfn; | |
16166 | + pgd_t *pgd; | |
16167 | + pmd_t *pmd; | |
16168 | + pte_t *pte; | |
16169 | + int pgd_idx, pmd_idx, pte_ofs; | |
16170 | + | |
16171 | + unsigned long max_ram_pfn = xen_start_info->nr_pages; | |
16172 | + if (max_ram_pfn > max_low_pfn) | |
16173 | + max_ram_pfn = max_low_pfn; | |
16174 | + | |
16175 | + pgd_idx = pgd_index(PAGE_OFFSET); | |
16176 | + pgd = pgd_base + pgd_idx; | |
16177 | + pfn = 0; | |
16178 | + pmd_idx = pmd_index(PAGE_OFFSET); | |
16179 | + pte_ofs = pte_index(PAGE_OFFSET); | |
16180 | + | |
16181 | + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { | |
16182 | +#ifdef CONFIG_XEN | |
16183 | + /* | |
16184 | + * Native linux hasn't PAE-paging enabled yet at this | |
16185 | + * point. When running as xen domain we are in PAE | |
16186 | + * mode already, thus we can't simply hook a empty | |
16187 | + * pmd. That would kill the mappings we are currently | |
16188 | + * using ... | |
16189 | + */ | |
16190 | + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); | |
16191 | +#else | |
16192 | + pmd = one_md_table_init(pgd); | |
16193 | +#endif | |
16194 | + if (pfn >= max_low_pfn) | |
16195 | + continue; | |
16196 | + pmd += pmd_idx; | |
16197 | + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { | |
16198 | + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; | |
16199 | + if (address >= hypervisor_virt_start) | |
16200 | + continue; | |
16201 | + | |
16202 | + /* Map with big pages if possible, otherwise create normal page tables. */ | |
16203 | + if (cpu_has_pse) { | |
16204 | + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; | |
16205 | + | |
16206 | + if (is_kernel_text(address) || is_kernel_text(address2)) | |
16207 | + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); | |
16208 | + else | |
16209 | + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); | |
16210 | + pfn += PTRS_PER_PTE; | |
16211 | + } else { | |
16212 | + pte = one_page_table_init(pmd); | |
16213 | + | |
16214 | + pte += pte_ofs; | |
16215 | + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { | |
16216 | + /* XEN: Only map initial RAM allocation. */ | |
16217 | + if ((pfn >= max_ram_pfn) || pte_present(*pte)) | |
16218 | + continue; | |
16219 | + if (is_kernel_text(address)) | |
16220 | + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); | |
16221 | + else | |
16222 | + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); | |
16223 | + } | |
16224 | + pte_ofs = 0; | |
16225 | + } | |
16226 | + } | |
16227 | + pmd_idx = 0; | |
16228 | + } | |
16229 | +} | |
16230 | + | |
16231 | +#ifndef CONFIG_XEN | |
16232 | + | |
16233 | +static inline int page_kills_ppro(unsigned long pagenr) | |
16234 | +{ | |
16235 | + if (pagenr >= 0x70000 && pagenr <= 0x7003F) | |
16236 | + return 1; | |
16237 | + return 0; | |
16238 | +} | |
16239 | + | |
16240 | +#else | |
16241 | + | |
16242 | +#define page_kills_ppro(p) 0 | |
16243 | + | |
16244 | +#endif | |
16245 | + | |
16246 | +extern int is_available_memory(efi_memory_desc_t *); | |
16247 | + | |
16248 | +int page_is_ram(unsigned long pagenr) | |
16249 | +{ | |
16250 | + int i; | |
16251 | + unsigned long addr, end; | |
16252 | + | |
16253 | + if (efi_enabled) { | |
16254 | + efi_memory_desc_t *md; | |
16255 | + void *p; | |
16256 | + | |
16257 | + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | |
16258 | + md = p; | |
16259 | + if (!is_available_memory(md)) | |
16260 | + continue; | |
16261 | + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; | |
16262 | + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; | |
16263 | + | |
16264 | + if ((pagenr >= addr) && (pagenr < end)) | |
16265 | + return 1; | |
16266 | + } | |
16267 | + return 0; | |
16268 | + } | |
16269 | + | |
16270 | + for (i = 0; i < e820.nr_map; i++) { | |
16271 | + | |
16272 | + if (e820.map[i].type != E820_RAM) /* not usable memory */ | |
16273 | + continue; | |
16274 | + /* | |
16275 | + * !!!FIXME!!! Some BIOSen report areas as RAM that | |
16276 | + * are not. Notably the 640->1Mb area. We need a sanity | |
16277 | + * check here. | |
16278 | + */ | |
16279 | + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | |
16280 | + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | |
16281 | + if ((pagenr >= addr) && (pagenr < end)) | |
16282 | + return 1; | |
16283 | + } | |
16284 | + return 0; | |
16285 | +} | |
16286 | + | |
16287 | +#ifdef CONFIG_HIGHMEM | |
16288 | +pte_t *kmap_pte; | |
16289 | +pgprot_t kmap_prot; | |
16290 | + | |
16291 | +#define kmap_get_fixmap_pte(vaddr) \ | |
16292 | + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) | |
16293 | + | |
16294 | +static void __init kmap_init(void) | |
16295 | +{ | |
16296 | + unsigned long kmap_vstart; | |
16297 | + | |
16298 | + /* cache the first kmap pte */ | |
16299 | + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); | |
16300 | + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); | |
16301 | + | |
16302 | + kmap_prot = PAGE_KERNEL; | |
16303 | +} | |
16304 | + | |
16305 | +static void __init permanent_kmaps_init(pgd_t *pgd_base) | |
16306 | +{ | |
16307 | + pgd_t *pgd; | |
16308 | + pud_t *pud; | |
16309 | + pmd_t *pmd; | |
16310 | + pte_t *pte; | |
16311 | + unsigned long vaddr; | |
16312 | + | |
16313 | + vaddr = PKMAP_BASE; | |
16314 | + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | |
16315 | + | |
16316 | + pgd = swapper_pg_dir + pgd_index(vaddr); | |
16317 | + pud = pud_offset(pgd, vaddr); | |
16318 | + pmd = pmd_offset(pud, vaddr); | |
16319 | + pte = pte_offset_kernel(pmd, vaddr); | |
16320 | + pkmap_page_table = pte; | |
16321 | +} | |
16322 | + | |
16323 | +static void __meminit free_new_highpage(struct page *page, int pfn) | |
16324 | +{ | |
16325 | + init_page_count(page); | |
16326 | + if (pfn < xen_start_info->nr_pages) | |
16327 | + __free_page(page); | |
16328 | + totalhigh_pages++; | |
16329 | +} | |
16330 | + | |
16331 | +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) | |
16332 | +{ | |
16333 | + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { | |
16334 | + ClearPageReserved(page); | |
16335 | + free_new_highpage(page, pfn); | |
16336 | + } else | |
16337 | + SetPageReserved(page); | |
16338 | +} | |
16339 | + | |
16340 | +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) | |
16341 | +{ | |
16342 | + free_new_highpage(page, pfn); | |
16343 | + totalram_pages++; | |
16344 | +#ifdef CONFIG_FLATMEM | |
16345 | + max_mapnr = max(pfn, max_mapnr); | |
16346 | +#endif | |
16347 | + num_physpages++; | |
16348 | + return 0; | |
16349 | +} | |
16350 | + | |
16351 | +/* | |
16352 | + * Not currently handling the NUMA case. | |
16353 | + * Assuming single node and all memory that | |
16354 | + * has been added dynamically that would be | |
16355 | + * onlined here is in HIGHMEM | |
16356 | + */ | |
16357 | +void online_page(struct page *page) | |
16358 | +{ | |
16359 | + ClearPageReserved(page); | |
16360 | + add_one_highpage_hotplug(page, page_to_pfn(page)); | |
16361 | +} | |
16362 | + | |
16363 | + | |
16364 | +#ifdef CONFIG_NUMA | |
16365 | +extern void set_highmem_pages_init(int); | |
16366 | +#else | |
16367 | +static void __init set_highmem_pages_init(int bad_ppro) | |
16368 | +{ | |
16369 | + int pfn; | |
16370 | + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) | |
16371 | + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); | |
16372 | + totalram_pages += totalhigh_pages; | |
16373 | +} | |
16374 | +#endif /* CONFIG_FLATMEM */ | |
16375 | + | |
16376 | +#else | |
16377 | +#define kmap_init() do { } while (0) | |
16378 | +#define permanent_kmaps_init(pgd_base) do { } while (0) | |
16379 | +#define set_highmem_pages_init(bad_ppro) do { } while (0) | |
16380 | +#endif /* CONFIG_HIGHMEM */ | |
16381 | + | |
16382 | +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; | |
16383 | +EXPORT_SYMBOL(__PAGE_KERNEL); | |
16384 | +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | |
16385 | + | |
16386 | +#ifdef CONFIG_NUMA | |
16387 | +extern void __init remap_numa_kva(void); | |
16388 | +#else | |
16389 | +#define remap_numa_kva() do {} while (0) | |
16390 | +#endif | |
16391 | + | |
16392 | +pgd_t *swapper_pg_dir; | |
16393 | + | |
16394 | +static void __init pagetable_init (void) | |
16395 | +{ | |
16396 | + unsigned long vaddr; | |
16397 | + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; | |
16398 | + | |
16399 | + /* Enable PSE if available */ | |
16400 | + if (cpu_has_pse) { | |
16401 | + set_in_cr4(X86_CR4_PSE); | |
16402 | + } | |
16403 | + | |
16404 | + /* Enable PGE if available */ | |
16405 | + if (cpu_has_pge) { | |
16406 | + set_in_cr4(X86_CR4_PGE); | |
16407 | + __PAGE_KERNEL |= _PAGE_GLOBAL; | |
16408 | + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; | |
16409 | + } | |
16410 | + | |
16411 | + kernel_physical_mapping_init(pgd_base); | |
16412 | + remap_numa_kva(); | |
16413 | + | |
16414 | + /* | |
16415 | + * Fixed mappings, only the page table structure has to be | |
16416 | + * created - mappings will be set by set_fixmap(): | |
16417 | + */ | |
16418 | + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | |
16419 | + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); | |
16420 | + | |
16421 | + permanent_kmaps_init(pgd_base); | |
16422 | +} | |
16423 | + | |
16424 | +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) | |
16425 | +/* | |
16426 | + * Swap suspend & friends need this for resume because things like the intel-agp | |
16427 | + * driver might have split up a kernel 4MB mapping. | |
16428 | + */ | |
16429 | +char __nosavedata swsusp_pg_dir[PAGE_SIZE] | |
16430 | + __attribute__ ((aligned (PAGE_SIZE))); | |
16431 | + | |
16432 | +static inline void save_pg_dir(void) | |
16433 | +{ | |
16434 | + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); | |
16435 | +} | |
16436 | +#else | |
16437 | +static inline void save_pg_dir(void) | |
16438 | +{ | |
16439 | +} | |
16440 | +#endif | |
16441 | + | |
16442 | +void zap_low_mappings (void) | |
16443 | +{ | |
16444 | + int i; | |
16445 | + | |
16446 | + save_pg_dir(); | |
16447 | + | |
16448 | + /* | |
16449 | + * Zap initial low-memory mappings. | |
16450 | + * | |
16451 | + * Note that "pgd_clear()" doesn't do it for | |
16452 | + * us, because pgd_clear() is a no-op on i386. | |
16453 | + */ | |
16454 | + for (i = 0; i < USER_PTRS_PER_PGD; i++) | |
16455 | +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) | |
16456 | + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | |
16457 | +#else | |
16458 | + set_pgd(swapper_pg_dir+i, __pgd(0)); | |
16459 | +#endif | |
16460 | + flush_tlb_all(); | |
16461 | +} | |
16462 | + | |
16463 | +static int disable_nx __initdata = 0; | |
16464 | +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; | |
16465 | +EXPORT_SYMBOL(__supported_pte_mask); | |
16466 | + | |
16467 | +/* | |
16468 | + * noexec = on|off | |
16469 | + * | |
16470 | + * Control non executable mappings. | |
16471 | + * | |
16472 | + * on Enable | |
16473 | + * off Disable | |
16474 | + */ | |
16475 | +void __init noexec_setup(const char *str) | |
16476 | +{ | |
16477 | + if (!strncmp(str, "on",2) && cpu_has_nx) { | |
16478 | + __supported_pte_mask |= _PAGE_NX; | |
16479 | + disable_nx = 0; | |
16480 | + } else if (!strncmp(str,"off",3)) { | |
16481 | + disable_nx = 1; | |
16482 | + __supported_pte_mask &= ~_PAGE_NX; | |
16483 | + } | |
16484 | +} | |
16485 | + | |
16486 | +int nx_enabled = 0; | |
16487 | +#ifdef CONFIG_X86_PAE | |
16488 | + | |
16489 | +static void __init set_nx(void) | |
16490 | +{ | |
16491 | + unsigned int v[4], l, h; | |
16492 | + | |
16493 | + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | |
16494 | + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | |
16495 | + if ((v[3] & (1 << 20)) && !disable_nx) { | |
16496 | + rdmsr(MSR_EFER, l, h); | |
16497 | + l |= EFER_NX; | |
16498 | + wrmsr(MSR_EFER, l, h); | |
16499 | + nx_enabled = 1; | |
16500 | + __supported_pte_mask |= _PAGE_NX; | |
16501 | + } | |
16502 | + } | |
16503 | +} | |
16504 | + | |
16505 | +/* | |
16506 | + * Enables/disables executability of a given kernel page and | |
16507 | + * returns the previous setting. | |
16508 | + */ | |
16509 | +int __init set_kernel_exec(unsigned long vaddr, int enable) | |
16510 | +{ | |
16511 | + pte_t *pte; | |
16512 | + int ret = 1; | |
16513 | + | |
16514 | + if (!nx_enabled) | |
16515 | + goto out; | |
16516 | + | |
16517 | + pte = lookup_address(vaddr); | |
16518 | + BUG_ON(!pte); | |
16519 | + | |
16520 | + if (!pte_exec_kernel(*pte)) | |
16521 | + ret = 0; | |
16522 | + | |
16523 | + if (enable) | |
16524 | + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); | |
16525 | + else | |
16526 | + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); | |
16527 | + __flush_tlb_all(); | |
16528 | +out: | |
16529 | + return ret; | |
16530 | +} | |
16531 | + | |
16532 | +#endif | |
16533 | + | |
16534 | +/* | |
16535 | + * paging_init() sets up the page tables - note that the first 8MB are | |
16536 | + * already mapped by head.S. | |
16537 | + * | |
16538 | + * This routines also unmaps the page at virtual kernel address 0, so | |
16539 | + * that we can trap those pesky NULL-reference errors in the kernel. | |
16540 | + */ | |
16541 | +void __init paging_init(void) | |
16542 | +{ | |
16543 | + int i; | |
16544 | + | |
16545 | +#ifdef CONFIG_X86_PAE | |
16546 | + set_nx(); | |
16547 | + if (nx_enabled) | |
16548 | + printk("NX (Execute Disable) protection: active\n"); | |
16549 | +#endif | |
16550 | + | |
16551 | + pagetable_init(); | |
16552 | + | |
16553 | +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) | |
16554 | + /* | |
16555 | + * We will bail out later - printk doesn't work right now so | |
16556 | + * the user would just see a hanging kernel. | |
16557 | + * when running as xen domain we are already in PAE mode at | |
16558 | + * this point. | |
16559 | + */ | |
16560 | + if (cpu_has_pae) | |
16561 | + set_in_cr4(X86_CR4_PAE); | |
16562 | +#endif | |
16563 | + __flush_tlb_all(); | |
16564 | + | |
16565 | + kmap_init(); | |
16566 | + | |
16567 | + /* Switch to the real shared_info page, and clear the | |
16568 | + * dummy page. */ | |
16569 | + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); | |
16570 | + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); | |
16571 | + memset(empty_zero_page, 0, sizeof(empty_zero_page)); | |
16572 | + | |
16573 | + /* Setup mapping of lower 1st MB */ | |
16574 | + for (i = 0; i < NR_FIX_ISAMAPS; i++) | |
16575 | + if (is_initial_xendomain()) | |
16576 | + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); | |
16577 | + else | |
16578 | + __set_fixmap(FIX_ISAMAP_BEGIN - i, | |
16579 | + virt_to_machine(empty_zero_page), | |
16580 | + PAGE_KERNEL_RO); | |
16581 | +} | |
16582 | + | |
16583 | +/* | |
16584 | + * Test if the WP bit works in supervisor mode. It isn't supported on 386's | |
16585 | + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This | |
16586 | + * used to involve black magic jumps to work around some nasty CPU bugs, | |
16587 | + * but fortunately the switch to using exceptions got rid of all that. | |
16588 | + */ | |
16589 | + | |
16590 | +static void __init test_wp_bit(void) | |
16591 | +{ | |
16592 | + printk("Checking if this processor honours the WP bit even in supervisor mode... "); | |
16593 | + | |
16594 | + /* Any page-aligned address will do, the test is non-destructive */ | |
16595 | + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); | |
16596 | + boot_cpu_data.wp_works_ok = do_test_wp_bit(); | |
16597 | + clear_fixmap(FIX_WP_TEST); | |
16598 | + | |
16599 | + if (!boot_cpu_data.wp_works_ok) { | |
16600 | + printk("No.\n"); | |
16601 | +#ifdef CONFIG_X86_WP_WORKS_OK | |
16602 | + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); | |
16603 | +#endif | |
16604 | + } else { | |
16605 | + printk("Ok.\n"); | |
16606 | + } | |
16607 | +} | |
16608 | + | |
16609 | +static void __init set_max_mapnr_init(void) | |
16610 | +{ | |
16611 | +#ifdef CONFIG_HIGHMEM | |
16612 | + num_physpages = highend_pfn; | |
16613 | +#else | |
16614 | + num_physpages = max_low_pfn; | |
16615 | +#endif | |
16616 | +#ifdef CONFIG_FLATMEM | |
16617 | + max_mapnr = num_physpages; | |
16618 | +#endif | |
16619 | +} | |
16620 | + | |
16621 | +static struct kcore_list kcore_mem, kcore_vmalloc; | |
16622 | + | |
16623 | +void __init mem_init(void) | |
16624 | +{ | |
16625 | + extern int ppro_with_ram_bug(void); | |
16626 | + int codesize, reservedpages, datasize, initsize; | |
16627 | + int tmp; | |
16628 | + int bad_ppro; | |
16629 | + unsigned long pfn; | |
16630 | + | |
16631 | +#if defined(CONFIG_SWIOTLB) | |
16632 | + swiotlb_init(); | |
16633 | +#endif | |
16634 | + | |
16635 | +#ifdef CONFIG_FLATMEM | |
16636 | + if (!mem_map) | |
16637 | + BUG(); | |
16638 | +#endif | |
16639 | + | |
16640 | + bad_ppro = ppro_with_ram_bug(); | |
16641 | + | |
16642 | +#ifdef CONFIG_HIGHMEM | |
16643 | + /* check that fixmap and pkmap do not overlap */ | |
16644 | + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | |
16645 | + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); | |
16646 | + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | |
16647 | + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); | |
16648 | + BUG(); | |
16649 | + } | |
16650 | +#endif | |
16651 | + | |
16652 | + set_max_mapnr_init(); | |
16653 | + | |
16654 | +#ifdef CONFIG_HIGHMEM | |
16655 | + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | |
16656 | +#else | |
16657 | + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | |
16658 | +#endif | |
16659 | + printk("vmalloc area: %lx-%lx, maxmem %lx\n", | |
16660 | + VMALLOC_START,VMALLOC_END,MAXMEM); | |
16661 | + BUG_ON(VMALLOC_START > VMALLOC_END); | |
16662 | + | |
16663 | + /* this will put all low memory onto the freelists */ | |
16664 | + totalram_pages += free_all_bootmem(); | |
16665 | + /* XEN: init and count low-mem pages outside initial allocation. */ | |
16666 | + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { | |
16667 | + ClearPageReserved(pfn_to_page(pfn)); | |
16668 | + init_page_count(pfn_to_page(pfn)); | |
16669 | + totalram_pages++; | |
16670 | + } | |
16671 | + | |
16672 | + reservedpages = 0; | |
16673 | + for (tmp = 0; tmp < max_low_pfn; tmp++) | |
16674 | + /* | |
16675 | + * Only count reserved RAM pages | |
16676 | + */ | |
16677 | + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | |
16678 | + reservedpages++; | |
16679 | + | |
16680 | + set_highmem_pages_init(bad_ppro); | |
16681 | + | |
16682 | + codesize = (unsigned long) &_etext - (unsigned long) &_text; | |
16683 | + datasize = (unsigned long) &_edata - (unsigned long) &_etext; | |
16684 | + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | |
16685 | + | |
16686 | + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
16687 | + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
16688 | + VMALLOC_END-VMALLOC_START); | |
16689 | + | |
16690 | + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", | |
16691 | + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | |
16692 | + num_physpages << (PAGE_SHIFT-10), | |
16693 | + codesize >> 10, | |
16694 | + reservedpages << (PAGE_SHIFT-10), | |
16695 | + datasize >> 10, | |
16696 | + initsize >> 10, | |
16697 | + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) | |
16698 | + ); | |
16699 | + | |
16700 | +#ifdef CONFIG_X86_PAE | |
16701 | + if (!cpu_has_pae) | |
16702 | + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); | |
16703 | +#endif | |
16704 | + if (boot_cpu_data.wp_works_ok < 0) | |
16705 | + test_wp_bit(); | |
16706 | + | |
16707 | + /* | |
16708 | + * Subtle. SMP is doing it's boot stuff late (because it has to | |
16709 | + * fork idle threads) - but it also needs low mappings for the | |
16710 | + * protected-mode entry to work. We zap these entries only after | |
16711 | + * the WP-bit has been tested. | |
16712 | + */ | |
16713 | +#ifndef CONFIG_SMP | |
16714 | + zap_low_mappings(); | |
16715 | +#endif | |
16716 | + | |
16717 | + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); | |
16718 | +} | |
16719 | + | |
16720 | +/* | |
16721 | + * this is for the non-NUMA, single node SMP system case. | |
16722 | + * Specifically, in the case of x86, we will always add | |
16723 | + * memory to the highmem for now. | |
16724 | + */ | |
16725 | +#ifdef CONFIG_MEMORY_HOTPLUG | |
16726 | +#ifndef CONFIG_NEED_MULTIPLE_NODES | |
16727 | +int arch_add_memory(int nid, u64 start, u64 size) | |
16728 | +{ | |
16729 | + struct pglist_data *pgdata = &contig_page_data; | |
16730 | + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; | |
16731 | + unsigned long start_pfn = start >> PAGE_SHIFT; | |
16732 | + unsigned long nr_pages = size >> PAGE_SHIFT; | |
16733 | + | |
16734 | + return __add_pages(zone, start_pfn, nr_pages); | |
16735 | +} | |
16736 | + | |
16737 | +int remove_memory(u64 start, u64 size) | |
16738 | +{ | |
16739 | + return -EINVAL; | |
16740 | +} | |
16741 | +#endif | |
16742 | +#endif | |
16743 | + | |
16744 | +kmem_cache_t *pgd_cache; | |
16745 | +kmem_cache_t *pmd_cache; | |
16746 | + | |
16747 | +void __init pgtable_cache_init(void) | |
16748 | +{ | |
16749 | + if (PTRS_PER_PMD > 1) { | |
16750 | + pmd_cache = kmem_cache_create("pmd", | |
16751 | + PTRS_PER_PMD*sizeof(pmd_t), | |
16752 | + PTRS_PER_PMD*sizeof(pmd_t), | |
16753 | + 0, | |
16754 | + pmd_ctor, | |
16755 | + NULL); | |
16756 | + if (!pmd_cache) | |
16757 | + panic("pgtable_cache_init(): cannot create pmd cache"); | |
16758 | + } | |
16759 | + pgd_cache = kmem_cache_create("pgd", | |
16760 | +#ifndef CONFIG_XEN | |
16761 | + PTRS_PER_PGD*sizeof(pgd_t), | |
16762 | + PTRS_PER_PGD*sizeof(pgd_t), | |
16763 | +#else | |
16764 | + PAGE_SIZE, | |
16765 | + PAGE_SIZE, | |
16766 | +#endif | |
16767 | + 0, | |
16768 | + pgd_ctor, | |
16769 | + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); | |
16770 | + if (!pgd_cache) | |
16771 | + panic("pgtable_cache_init(): Cannot create pgd cache"); | |
16772 | +} | |
16773 | + | |
16774 | +/* | |
16775 | + * This function cannot be __init, since exceptions don't work in that | |
16776 | + * section. Put this after the callers, so that it cannot be inlined. | |
16777 | + */ | |
16778 | +static int noinline do_test_wp_bit(void) | |
16779 | +{ | |
16780 | + char tmp_reg; | |
16781 | + int flag; | |
16782 | + | |
16783 | + __asm__ __volatile__( | |
16784 | + " movb %0,%1 \n" | |
16785 | + "1: movb %1,%0 \n" | |
16786 | + " xorl %2,%2 \n" | |
16787 | + "2: \n" | |
16788 | + ".section __ex_table,\"a\"\n" | |
16789 | + " .align 4 \n" | |
16790 | + " .long 1b,2b \n" | |
16791 | + ".previous \n" | |
16792 | + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), | |
16793 | + "=q" (tmp_reg), | |
16794 | + "=r" (flag) | |
16795 | + :"2" (1) | |
16796 | + :"memory"); | |
16797 | + | |
16798 | + return flag; | |
16799 | +} | |
16800 | + | |
16801 | +#ifdef CONFIG_DEBUG_RODATA | |
16802 | + | |
16803 | +void mark_rodata_ro(void) | |
16804 | +{ | |
16805 | + unsigned long addr = (unsigned long)__start_rodata; | |
16806 | + | |
16807 | + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) | |
16808 | + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); | |
16809 | + | |
16810 | + printk("Write protecting the kernel read-only data: %uk\n", | |
16811 | + (__end_rodata - __start_rodata) >> 10); | |
16812 | + | |
16813 | + /* | |
16814 | + * change_page_attr() requires a global_flush_tlb() call after it. | |
16815 | + * We do this after the printk so that if something went wrong in the | |
16816 | + * change, the printk gets out at least to give a better debug hint | |
16817 | + * of who is the culprit. | |
16818 | + */ | |
16819 | + global_flush_tlb(); | |
16820 | +} | |
16821 | +#endif | |
16822 | + | |
16823 | +void free_init_pages(char *what, unsigned long begin, unsigned long end) | |
16824 | +{ | |
16825 | + unsigned long addr; | |
16826 | + | |
16827 | + for (addr = begin; addr < end; addr += PAGE_SIZE) { | |
16828 | + ClearPageReserved(virt_to_page(addr)); | |
16829 | + init_page_count(virt_to_page(addr)); | |
16830 | + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); | |
16831 | + free_page(addr); | |
16832 | + totalram_pages++; | |
16833 | + } | |
16834 | + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); | |
16835 | +} | |
16836 | + | |
16837 | +void free_initmem(void) | |
16838 | +{ | |
16839 | + free_init_pages("unused kernel memory", | |
16840 | + (unsigned long)(&__init_begin), | |
16841 | + (unsigned long)(&__init_end)); | |
16842 | +} | |
16843 | + | |
16844 | +#ifdef CONFIG_BLK_DEV_INITRD | |
16845 | +void free_initrd_mem(unsigned long start, unsigned long end) | |
16846 | +{ | |
16847 | + free_init_pages("initrd memory", start, end); | |
16848 | +} | |
16849 | +#endif | |
16850 | + | |
16851 | Index: head-2008-11-25/arch/x86/mm/ioremap_32-xen.c | |
16852 | =================================================================== | |
16853 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
16854 | +++ head-2008-11-25/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200 | |
16855 | @@ -0,0 +1,443 @@ | |
16856 | +/* | |
16857 | + * arch/i386/mm/ioremap.c | |
16858 | + * | |
16859 | + * Re-map IO memory to kernel address space so that we can access it. | |
16860 | + * This is needed for high PCI addresses that aren't mapped in the | |
16861 | + * 640k-1MB IO memory area on PC's | |
16862 | + * | |
16863 | + * (C) Copyright 1995 1996 Linus Torvalds | |
16864 | + */ | |
16865 | + | |
16866 | +#include <linux/vmalloc.h> | |
16867 | +#include <linux/init.h> | |
16868 | +#include <linux/slab.h> | |
16869 | +#include <linux/module.h> | |
16870 | +#include <asm/io.h> | |
16871 | +#include <asm/fixmap.h> | |
16872 | +#include <asm/cacheflush.h> | |
16873 | +#include <asm/tlbflush.h> | |
16874 | +#include <asm/pgtable.h> | |
16875 | +#include <asm/pgalloc.h> | |
16876 | + | |
16877 | +#define ISA_START_ADDRESS 0x0 | |
16878 | +#define ISA_END_ADDRESS 0x100000 | |
16879 | + | |
16880 | +static int direct_remap_area_pte_fn(pte_t *pte, | |
16881 | + struct page *pmd_page, | |
16882 | + unsigned long address, | |
16883 | + void *data) | |
16884 | +{ | |
16885 | + mmu_update_t **v = (mmu_update_t **)data; | |
16886 | + | |
16887 | + BUG_ON(!pte_none(*pte)); | |
16888 | + | |
16889 | + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
16890 | + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
16891 | + (*v)++; | |
16892 | + | |
16893 | + return 0; | |
16894 | +} | |
16895 | + | |
16896 | +static int __direct_remap_pfn_range(struct mm_struct *mm, | |
16897 | + unsigned long address, | |
16898 | + unsigned long mfn, | |
16899 | + unsigned long size, | |
16900 | + pgprot_t prot, | |
16901 | + domid_t domid) | |
16902 | +{ | |
16903 | + int rc; | |
16904 | + unsigned long i, start_address; | |
16905 | + mmu_update_t *u, *v, *w; | |
16906 | + | |
16907 | + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); | |
16908 | + if (u == NULL) | |
16909 | + return -ENOMEM; | |
16910 | + | |
16911 | + start_address = address; | |
16912 | + | |
16913 | + flush_cache_all(); | |
16914 | + | |
16915 | + for (i = 0; i < size; i += PAGE_SIZE) { | |
16916 | + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { | |
16917 | + /* Flush a full batch after filling in the PTE ptrs. */ | |
16918 | + rc = apply_to_page_range(mm, start_address, | |
16919 | + address - start_address, | |
16920 | + direct_remap_area_pte_fn, &w); | |
16921 | + if (rc) | |
16922 | + goto out; | |
16923 | + rc = -EFAULT; | |
16924 | + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) | |
16925 | + goto out; | |
16926 | + v = w = u; | |
16927 | + start_address = address; | |
16928 | + } | |
16929 | + | |
16930 | + /* | |
16931 | + * Fill in the machine address: PTE ptr is done later by | |
16932 | + * apply_to_page_range(). | |
16933 | + */ | |
16934 | + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; | |
16935 | + | |
16936 | + mfn++; | |
16937 | + address += PAGE_SIZE; | |
16938 | + v++; | |
16939 | + } | |
16940 | + | |
16941 | + if (v != u) { | |
16942 | + /* Final batch. */ | |
16943 | + rc = apply_to_page_range(mm, start_address, | |
16944 | + address - start_address, | |
16945 | + direct_remap_area_pte_fn, &w); | |
16946 | + if (rc) | |
16947 | + goto out; | |
16948 | + rc = -EFAULT; | |
16949 | + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) | |
16950 | + goto out; | |
16951 | + } | |
16952 | + | |
16953 | + rc = 0; | |
16954 | + | |
16955 | + out: | |
16956 | + flush_tlb_all(); | |
16957 | + | |
16958 | + free_page((unsigned long)u); | |
16959 | + | |
16960 | + return rc; | |
16961 | +} | |
16962 | + | |
16963 | +int direct_remap_pfn_range(struct vm_area_struct *vma, | |
16964 | + unsigned long address, | |
16965 | + unsigned long mfn, | |
16966 | + unsigned long size, | |
16967 | + pgprot_t prot, | |
16968 | + domid_t domid) | |
16969 | +{ | |
16970 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
16971 | + return remap_pfn_range(vma, address, mfn, size, prot); | |
16972 | + | |
16973 | + if (domid == DOMID_SELF) | |
16974 | + return -EINVAL; | |
16975 | + | |
16976 | + vma->vm_flags |= VM_IO | VM_RESERVED; | |
16977 | + | |
16978 | + vma->vm_mm->context.has_foreign_mappings = 1; | |
16979 | + | |
16980 | + return __direct_remap_pfn_range( | |
16981 | + vma->vm_mm, address, mfn, size, prot, domid); | |
16982 | +} | |
16983 | +EXPORT_SYMBOL(direct_remap_pfn_range); | |
16984 | + | |
16985 | +int direct_kernel_remap_pfn_range(unsigned long address, | |
16986 | + unsigned long mfn, | |
16987 | + unsigned long size, | |
16988 | + pgprot_t prot, | |
16989 | + domid_t domid) | |
16990 | +{ | |
16991 | + return __direct_remap_pfn_range( | |
16992 | + &init_mm, address, mfn, size, prot, domid); | |
16993 | +} | |
16994 | +EXPORT_SYMBOL(direct_kernel_remap_pfn_range); | |
16995 | + | |
16996 | +static int lookup_pte_fn( | |
16997 | + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
16998 | +{ | |
16999 | + uint64_t *ptep = (uint64_t *)data; | |
17000 | + if (ptep) | |
17001 | + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << | |
17002 | + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); | |
17003 | + return 0; | |
17004 | +} | |
17005 | + | |
17006 | +int create_lookup_pte_addr(struct mm_struct *mm, | |
17007 | + unsigned long address, | |
17008 | + uint64_t *ptep) | |
17009 | +{ | |
17010 | + return apply_to_page_range(mm, address, PAGE_SIZE, | |
17011 | + lookup_pte_fn, ptep); | |
17012 | +} | |
17013 | + | |
17014 | +EXPORT_SYMBOL(create_lookup_pte_addr); | |
17015 | + | |
17016 | +static int noop_fn( | |
17017 | + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) | |
17018 | +{ | |
17019 | + return 0; | |
17020 | +} | |
17021 | + | |
17022 | +int touch_pte_range(struct mm_struct *mm, | |
17023 | + unsigned long address, | |
17024 | + unsigned long size) | |
17025 | +{ | |
17026 | + return apply_to_page_range(mm, address, size, noop_fn, NULL); | |
17027 | +} | |
17028 | + | |
17029 | +EXPORT_SYMBOL(touch_pte_range); | |
17030 | + | |
17031 | +/* | |
17032 | + * Does @address reside within a non-highmem page that is local to this virtual | |
17033 | + * machine (i.e., not an I/O page, nor a memory page belonging to another VM). | |
17034 | + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand | |
17035 | + * why this works. | |
17036 | + */ | |
17037 | +static inline int is_local_lowmem(unsigned long address) | |
17038 | +{ | |
17039 | + extern unsigned long max_low_pfn; | |
17040 | + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn); | |
17041 | +} | |
17042 | + | |
17043 | +/* | |
17044 | + * Generic mapping function (not visible outside): | |
17045 | + */ | |
17046 | + | |
17047 | +/* | |
17048 | + * Remap an arbitrary physical address space into the kernel virtual | |
17049 | + * address space. Needed when the kernel wants to access high addresses | |
17050 | + * directly. | |
17051 | + * | |
17052 | + * NOTE! We need to allow non-page-aligned mappings too: we will obviously | |
17053 | + * have to convert them into an offset in a page-aligned mapping, but the | |
17054 | + * caller shouldn't need to know that small detail. | |
17055 | + */ | |
17056 | +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | |
17057 | +{ | |
17058 | + void __iomem * addr; | |
17059 | + struct vm_struct * area; | |
17060 | + unsigned long offset, last_addr; | |
17061 | + domid_t domid = DOMID_IO; | |
17062 | + | |
17063 | + /* Don't allow wraparound or zero size */ | |
17064 | + last_addr = phys_addr + size - 1; | |
17065 | + if (!size || last_addr < phys_addr) | |
17066 | + return NULL; | |
17067 | + | |
17068 | + /* | |
17069 | + * Don't remap the low PCI/ISA area, it's always mapped.. | |
17070 | + */ | |
17071 | + if (is_initial_xendomain() && | |
17072 | + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | |
17073 | + return (void __iomem *) isa_bus_to_virt(phys_addr); | |
17074 | + | |
17075 | + /* | |
17076 | + * Don't allow anybody to remap normal RAM that we're using.. | |
17077 | + */ | |
17078 | + if (is_local_lowmem(phys_addr)) { | |
17079 | + char *t_addr, *t_end; | |
17080 | + struct page *page; | |
17081 | + | |
17082 | + t_addr = bus_to_virt(phys_addr); | |
17083 | + t_end = t_addr + (size - 1); | |
17084 | + | |
17085 | + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | |
17086 | + if(!PageReserved(page)) | |
17087 | + return NULL; | |
17088 | + | |
17089 | + domid = DOMID_SELF; | |
17090 | + } | |
17091 | + | |
17092 | + /* | |
17093 | + * Mappings have to be page-aligned | |
17094 | + */ | |
17095 | + offset = phys_addr & ~PAGE_MASK; | |
17096 | + phys_addr &= PAGE_MASK; | |
17097 | + size = PAGE_ALIGN(last_addr+1) - phys_addr; | |
17098 | + | |
17099 | + /* | |
17100 | + * Ok, go for it.. | |
17101 | + */ | |
17102 | + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | |
17103 | + if (!area) | |
17104 | + return NULL; | |
17105 | + area->phys_addr = phys_addr; | |
17106 | + addr = (void __iomem *) area->addr; | |
17107 | + flags |= _KERNPG_TABLE; | |
17108 | + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, | |
17109 | + phys_addr>>PAGE_SHIFT, | |
17110 | + size, __pgprot(flags), domid)) { | |
17111 | + vunmap((void __force *) addr); | |
17112 | + return NULL; | |
17113 | + } | |
17114 | + return (void __iomem *) (offset + (char __iomem *)addr); | |
17115 | +} | |
17116 | +EXPORT_SYMBOL(__ioremap); | |
17117 | + | |
17118 | +/** | |
17119 | + * ioremap_nocache - map bus memory into CPU space | |
17120 | + * @offset: bus address of the memory | |
17121 | + * @size: size of the resource to map | |
17122 | + * | |
17123 | + * ioremap_nocache performs a platform specific sequence of operations to | |
17124 | + * make bus memory CPU accessible via the readb/readw/readl/writeb/ | |
17125 | + * writew/writel functions and the other mmio helpers. The returned | |
17126 | + * address is not guaranteed to be usable directly as a virtual | |
17127 | + * address. | |
17128 | + * | |
17129 | + * This version of ioremap ensures that the memory is marked uncachable | |
17130 | + * on the CPU as well as honouring existing caching rules from things like | |
17131 | + * the PCI bus. Note that there are other caches and buffers on many | |
17132 | + * busses. In particular driver authors should read up on PCI writes | |
17133 | + * | |
17134 | + * It's useful if some control registers are in such an area and | |
17135 | + * write combining or read caching is not desirable: | |
17136 | + * | |
17137 | + * Must be freed with iounmap. | |
17138 | + */ | |
17139 | + | |
17140 | +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | |
17141 | +{ | |
17142 | + unsigned long last_addr; | |
17143 | + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); | |
17144 | + if (!p) | |
17145 | + return p; | |
17146 | + | |
17147 | + /* Guaranteed to be > phys_addr, as per __ioremap() */ | |
17148 | + last_addr = phys_addr + size - 1; | |
17149 | + | |
17150 | + if (is_local_lowmem(last_addr)) { | |
17151 | + struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); | |
17152 | + unsigned long npages; | |
17153 | + | |
17154 | + phys_addr &= PAGE_MASK; | |
17155 | + | |
17156 | + /* This might overflow and become zero.. */ | |
17157 | + last_addr = PAGE_ALIGN(last_addr); | |
17158 | + | |
17159 | + /* .. but that's ok, because modulo-2**n arithmetic will make | |
17160 | + * the page-aligned "last - first" come out right. | |
17161 | + */ | |
17162 | + npages = (last_addr - phys_addr) >> PAGE_SHIFT; | |
17163 | + | |
17164 | + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { | |
17165 | + iounmap(p); | |
17166 | + p = NULL; | |
17167 | + } | |
17168 | + global_flush_tlb(); | |
17169 | + } | |
17170 | + | |
17171 | + return p; | |
17172 | +} | |
17173 | +EXPORT_SYMBOL(ioremap_nocache); | |
17174 | + | |
17175 | +/** | |
17176 | + * iounmap - Free a IO remapping | |
17177 | + * @addr: virtual address from ioremap_* | |
17178 | + * | |
17179 | + * Caller must ensure there is only one unmapping for the same pointer. | |
17180 | + */ | |
17181 | +void iounmap(volatile void __iomem *addr) | |
17182 | +{ | |
17183 | + struct vm_struct *p, *o; | |
17184 | + | |
17185 | + if ((void __force *)addr <= high_memory) | |
17186 | + return; | |
17187 | + | |
17188 | + /* | |
17189 | + * __ioremap special-cases the PCI/ISA range by not instantiating a | |
17190 | + * vm_area and by simply returning an address into the kernel mapping | |
17191 | + * of ISA space. So handle that here. | |
17192 | + */ | |
17193 | + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) | |
17194 | + return; | |
17195 | + | |
17196 | + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); | |
17197 | + | |
17198 | + /* Use the vm area unlocked, assuming the caller | |
17199 | + ensures there isn't another iounmap for the same address | |
17200 | + in parallel. Reuse of the virtual address is prevented by | |
17201 | + leaving it in the global lists until we're done with it. | |
17202 | + cpa takes care of the direct mappings. */ | |
17203 | + read_lock(&vmlist_lock); | |
17204 | + for (p = vmlist; p; p = p->next) { | |
17205 | + if (p->addr == addr) | |
17206 | + break; | |
17207 | + } | |
17208 | + read_unlock(&vmlist_lock); | |
17209 | + | |
17210 | + if (!p) { | |
17211 | + printk("iounmap: bad address %p\n", addr); | |
17212 | + dump_stack(); | |
17213 | + return; | |
17214 | + } | |
17215 | + | |
17216 | + /* Reset the direct mapping. Can block */ | |
17217 | + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { | |
17218 | + /* p->size includes the guard page, but cpa doesn't like that */ | |
17219 | + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), | |
17220 | + (p->size - PAGE_SIZE) >> PAGE_SHIFT, | |
17221 | + PAGE_KERNEL); | |
17222 | + global_flush_tlb(); | |
17223 | + } | |
17224 | + | |
17225 | + /* Finally remove it */ | |
17226 | + o = remove_vm_area((void *)addr); | |
17227 | + BUG_ON(p != o || o == NULL); | |
17228 | + kfree(p); | |
17229 | +} | |
17230 | +EXPORT_SYMBOL(iounmap); | |
17231 | + | |
17232 | +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) | |
17233 | +{ | |
17234 | + unsigned long offset, last_addr; | |
17235 | + unsigned int nrpages; | |
17236 | + enum fixed_addresses idx; | |
17237 | + | |
17238 | + /* Don't allow wraparound or zero size */ | |
17239 | + last_addr = phys_addr + size - 1; | |
17240 | + if (!size || last_addr < phys_addr) | |
17241 | + return NULL; | |
17242 | + | |
17243 | + /* | |
17244 | + * Don't remap the low PCI/ISA area, it's always mapped.. | |
17245 | + */ | |
17246 | + if (is_initial_xendomain() && | |
17247 | + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | |
17248 | + return isa_bus_to_virt(phys_addr); | |
17249 | + | |
17250 | + /* | |
17251 | + * Mappings have to be page-aligned | |
17252 | + */ | |
17253 | + offset = phys_addr & ~PAGE_MASK; | |
17254 | + phys_addr &= PAGE_MASK; | |
17255 | + size = PAGE_ALIGN(last_addr) - phys_addr; | |
17256 | + | |
17257 | + /* | |
17258 | + * Mappings have to fit in the FIX_BTMAP area. | |
17259 | + */ | |
17260 | + nrpages = size >> PAGE_SHIFT; | |
17261 | + if (nrpages > NR_FIX_BTMAPS) | |
17262 | + return NULL; | |
17263 | + | |
17264 | + /* | |
17265 | + * Ok, go for it.. | |
17266 | + */ | |
17267 | + idx = FIX_BTMAP_BEGIN; | |
17268 | + while (nrpages > 0) { | |
17269 | + set_fixmap(idx, phys_addr); | |
17270 | + phys_addr += PAGE_SIZE; | |
17271 | + --idx; | |
17272 | + --nrpages; | |
17273 | + } | |
17274 | + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); | |
17275 | +} | |
17276 | + | |
17277 | +void __init bt_iounmap(void *addr, unsigned long size) | |
17278 | +{ | |
17279 | + unsigned long virt_addr; | |
17280 | + unsigned long offset; | |
17281 | + unsigned int nrpages; | |
17282 | + enum fixed_addresses idx; | |
17283 | + | |
17284 | + virt_addr = (unsigned long)addr; | |
17285 | + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) | |
17286 | + return; | |
17287 | + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) | |
17288 | + return; | |
17289 | + offset = virt_addr & ~PAGE_MASK; | |
17290 | + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | |
17291 | + | |
17292 | + idx = FIX_BTMAP_BEGIN; | |
17293 | + while (nrpages > 0) { | |
17294 | + clear_fixmap(idx); | |
17295 | + --idx; | |
17296 | + --nrpages; | |
17297 | + } | |
17298 | +} | |
17299 | Index: head-2008-11-25/arch/x86/mm/pgtable_32-xen.c | |
17300 | =================================================================== | |
17301 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
17302 | +++ head-2008-11-25/arch/x86/mm/pgtable_32-xen.c 2007-10-09 11:48:25.000000000 +0200 | |
17303 | @@ -0,0 +1,725 @@ | |
17304 | +/* | |
17305 | + * linux/arch/i386/mm/pgtable.c | |
17306 | + */ | |
17307 | + | |
17308 | +#include <linux/sched.h> | |
17309 | +#include <linux/kernel.h> | |
17310 | +#include <linux/errno.h> | |
17311 | +#include <linux/mm.h> | |
17312 | +#include <linux/swap.h> | |
17313 | +#include <linux/smp.h> | |
17314 | +#include <linux/highmem.h> | |
17315 | +#include <linux/slab.h> | |
17316 | +#include <linux/pagemap.h> | |
17317 | +#include <linux/spinlock.h> | |
17318 | +#include <linux/module.h> | |
17319 | + | |
17320 | +#include <asm/system.h> | |
17321 | +#include <asm/pgtable.h> | |
17322 | +#include <asm/pgalloc.h> | |
17323 | +#include <asm/fixmap.h> | |
17324 | +#include <asm/e820.h> | |
17325 | +#include <asm/tlb.h> | |
17326 | +#include <asm/tlbflush.h> | |
17327 | +#include <asm/io.h> | |
17328 | +#include <asm/mmu_context.h> | |
17329 | + | |
17330 | +#include <xen/features.h> | |
17331 | +#include <asm/hypervisor.h> | |
17332 | + | |
17333 | +static void pgd_test_and_unpin(pgd_t *pgd); | |
17334 | + | |
17335 | +void show_mem(void) | |
17336 | +{ | |
17337 | + int total = 0, reserved = 0; | |
17338 | + int shared = 0, cached = 0; | |
17339 | + int highmem = 0; | |
17340 | + struct page *page; | |
17341 | + pg_data_t *pgdat; | |
17342 | + unsigned long i; | |
17343 | + unsigned long flags; | |
17344 | + | |
17345 | + printk(KERN_INFO "Mem-info:\n"); | |
17346 | + show_free_areas(); | |
17347 | + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | |
17348 | + for_each_online_pgdat(pgdat) { | |
17349 | + pgdat_resize_lock(pgdat, &flags); | |
17350 | + for (i = 0; i < pgdat->node_spanned_pages; ++i) { | |
17351 | + page = pgdat_page_nr(pgdat, i); | |
17352 | + total++; | |
17353 | + if (PageHighMem(page)) | |
17354 | + highmem++; | |
17355 | + if (PageReserved(page)) | |
17356 | + reserved++; | |
17357 | + else if (PageSwapCache(page)) | |
17358 | + cached++; | |
17359 | + else if (page_count(page)) | |
17360 | + shared += page_count(page) - 1; | |
17361 | + } | |
17362 | + pgdat_resize_unlock(pgdat, &flags); | |
17363 | + } | |
17364 | + printk(KERN_INFO "%d pages of RAM\n", total); | |
17365 | + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); | |
17366 | + printk(KERN_INFO "%d reserved pages\n", reserved); | |
17367 | + printk(KERN_INFO "%d pages shared\n", shared); | |
17368 | + printk(KERN_INFO "%d pages swap cached\n", cached); | |
17369 | + | |
17370 | + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); | |
17371 | + printk(KERN_INFO "%lu pages writeback\n", | |
17372 | + global_page_state(NR_WRITEBACK)); | |
17373 | + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); | |
17374 | + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); | |
17375 | + printk(KERN_INFO "%lu pages pagetables\n", | |
17376 | + global_page_state(NR_PAGETABLE)); | |
17377 | +} | |
17378 | + | |
17379 | +/* | |
17380 | + * Associate a large virtual page frame with a given physical page frame | |
17381 | + * and protection flags for that frame. pfn is for the base of the page, | |
17382 | + * vaddr is what the page gets mapped to - both must be properly aligned. | |
17383 | + * The pmd must already be instantiated. Assumes PAE mode. | |
17384 | + */ | |
17385 | +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | |
17386 | +{ | |
17387 | + pgd_t *pgd; | |
17388 | + pud_t *pud; | |
17389 | + pmd_t *pmd; | |
17390 | + | |
17391 | + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ | |
17392 | + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); | |
17393 | + return; /* BUG(); */ | |
17394 | + } | |
17395 | + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ | |
17396 | + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); | |
17397 | + return; /* BUG(); */ | |
17398 | + } | |
17399 | + pgd = swapper_pg_dir + pgd_index(vaddr); | |
17400 | + if (pgd_none(*pgd)) { | |
17401 | + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); | |
17402 | + return; /* BUG(); */ | |
17403 | + } | |
17404 | + pud = pud_offset(pgd, vaddr); | |
17405 | + pmd = pmd_offset(pud, vaddr); | |
17406 | + set_pmd(pmd, pfn_pmd(pfn, flags)); | |
17407 | + /* | |
17408 | + * It's enough to flush this one mapping. | |
17409 | + * (PGE mappings get flushed as well) | |
17410 | + */ | |
17411 | + __flush_tlb_one(vaddr); | |
17412 | +} | |
17413 | + | |
17414 | +static int nr_fixmaps = 0; | |
17415 | +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; | |
17416 | +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE); | |
17417 | +EXPORT_SYMBOL(__FIXADDR_TOP); | |
17418 | + | |
17419 | +void __init set_fixaddr_top(unsigned long top) | |
17420 | +{ | |
17421 | + BUG_ON(nr_fixmaps > 0); | |
17422 | + hypervisor_virt_start = top; | |
17423 | + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE; | |
17424 | +} | |
17425 | + | |
17426 | +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) | |
17427 | +{ | |
17428 | + unsigned long address = __fix_to_virt(idx); | |
17429 | + pte_t pte; | |
17430 | + | |
17431 | + if (idx >= __end_of_fixed_addresses) { | |
17432 | + BUG(); | |
17433 | + return; | |
17434 | + } | |
17435 | + switch (idx) { | |
17436 | + case FIX_WP_TEST: | |
17437 | + case FIX_VDSO: | |
17438 | + pte = pfn_pte(phys >> PAGE_SHIFT, flags); | |
17439 | + break; | |
17440 | + default: | |
17441 | + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags); | |
17442 | + break; | |
17443 | + } | |
17444 | + if (HYPERVISOR_update_va_mapping(address, pte, | |
17445 | + UVMF_INVLPG|UVMF_ALL)) | |
17446 | + BUG(); | |
17447 | + nr_fixmaps++; | |
17448 | +} | |
17449 | + | |
17450 | +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | |
17451 | +{ | |
17452 | + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | |
17453 | + if (pte) | |
17454 | + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); | |
17455 | + return pte; | |
17456 | +} | |
17457 | + | |
17458 | +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | |
17459 | +{ | |
17460 | + struct page *pte; | |
17461 | + | |
17462 | +#ifdef CONFIG_HIGHPTE | |
17463 | + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | |
17464 | +#else | |
17465 | + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
17466 | +#endif | |
17467 | + if (pte) { | |
17468 | + SetPageForeign(pte, pte_free); | |
17469 | + init_page_count(pte); | |
17470 | + } | |
17471 | + return pte; | |
17472 | +} | |
17473 | + | |
17474 | +void pte_free(struct page *pte) | |
17475 | +{ | |
17476 | + unsigned long pfn = page_to_pfn(pte); | |
17477 | + | |
17478 | + if (!PageHighMem(pte)) { | |
17479 | + unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT); | |
17480 | + | |
17481 | + if (!pte_write(*virt_to_ptep(va))) | |
17482 | + if (HYPERVISOR_update_va_mapping( | |
17483 | + va, pfn_pte(pfn, PAGE_KERNEL), 0)) | |
17484 | + BUG(); | |
17485 | + } else | |
17486 | + clear_bit(PG_pinned, &pte->flags); | |
17487 | + | |
17488 | + ClearPageForeign(pte); | |
17489 | + init_page_count(pte); | |
17490 | + | |
17491 | + __free_page(pte); | |
17492 | +} | |
17493 | + | |
17494 | +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) | |
17495 | +{ | |
17496 | + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | |
17497 | +} | |
17498 | + | |
17499 | +/* | |
17500 | + * List of all pgd's needed for non-PAE so it can invalidate entries | |
17501 | + * in both cached and uncached pgd's; not needed for PAE since the | |
17502 | + * kernel pmd is shared. If PAE were not to share the pmd a similar | |
17503 | + * tactic would be needed. This is essentially codepath-based locking | |
17504 | + * against pageattr.c; it is the unique case in which a valid change | |
17505 | + * of kernel pagetables can't be lazily synchronized by vmalloc faults. | |
17506 | + * vmalloc faults work because attached pagetables are never freed. | |
17507 | + * The locking scheme was chosen on the basis of manfred's | |
17508 | + * recommendations and having no core impact whatsoever. | |
17509 | + * -- wli | |
17510 | + */ | |
17511 | +DEFINE_SPINLOCK(pgd_lock); | |
17512 | +struct page *pgd_list; | |
17513 | + | |
17514 | +static inline void pgd_list_add(pgd_t *pgd) | |
17515 | +{ | |
17516 | + struct page *page = virt_to_page(pgd); | |
17517 | + page->index = (unsigned long)pgd_list; | |
17518 | + if (pgd_list) | |
17519 | + set_page_private(pgd_list, (unsigned long)&page->index); | |
17520 | + pgd_list = page; | |
17521 | + set_page_private(page, (unsigned long)&pgd_list); | |
17522 | +} | |
17523 | + | |
17524 | +static inline void pgd_list_del(pgd_t *pgd) | |
17525 | +{ | |
17526 | + struct page *next, **pprev, *page = virt_to_page(pgd); | |
17527 | + next = (struct page *)page->index; | |
17528 | + pprev = (struct page **)page_private(page); | |
17529 | + *pprev = next; | |
17530 | + if (next) | |
17531 | + set_page_private(next, (unsigned long)pprev); | |
17532 | +} | |
17533 | + | |
17534 | +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) | |
17535 | +{ | |
17536 | + unsigned long flags; | |
17537 | + | |
17538 | + if (PTRS_PER_PMD > 1) { | |
17539 | + if (HAVE_SHARED_KERNEL_PMD) | |
17540 | + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | |
17541 | + swapper_pg_dir + USER_PTRS_PER_PGD, | |
17542 | + KERNEL_PGD_PTRS); | |
17543 | + } else { | |
17544 | + spin_lock_irqsave(&pgd_lock, flags); | |
17545 | + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | |
17546 | + swapper_pg_dir + USER_PTRS_PER_PGD, | |
17547 | + KERNEL_PGD_PTRS); | |
17548 | + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | |
17549 | + pgd_list_add(pgd); | |
17550 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17551 | + } | |
17552 | +} | |
17553 | + | |
17554 | +/* never called when PTRS_PER_PMD > 1 */ | |
17555 | +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) | |
17556 | +{ | |
17557 | + unsigned long flags; /* can be called from interrupt context */ | |
17558 | + | |
17559 | + spin_lock_irqsave(&pgd_lock, flags); | |
17560 | + pgd_list_del(pgd); | |
17561 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17562 | + | |
17563 | + pgd_test_and_unpin(pgd); | |
17564 | +} | |
17565 | + | |
17566 | +pgd_t *pgd_alloc(struct mm_struct *mm) | |
17567 | +{ | |
17568 | + int i; | |
17569 | + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); | |
17570 | + pmd_t **pmd; | |
17571 | + unsigned long flags; | |
17572 | + | |
17573 | + pgd_test_and_unpin(pgd); | |
17574 | + | |
17575 | + if (PTRS_PER_PMD == 1 || !pgd) | |
17576 | + return pgd; | |
17577 | + | |
17578 | + if (HAVE_SHARED_KERNEL_PMD) { | |
17579 | + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { | |
17580 | + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | |
17581 | + if (!pmd) | |
17582 | + goto out_oom; | |
17583 | + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); | |
17584 | + } | |
17585 | + return pgd; | |
17586 | + } | |
17587 | + | |
17588 | + /* | |
17589 | + * We can race save/restore (if we sleep during a GFP_KERNEL memory | |
17590 | + * allocation). We therefore store virtual addresses of pmds as they | |
17591 | + * do not change across save/restore, and poke the machine addresses | |
17592 | + * into the pgdir under the pgd_lock. | |
17593 | + */ | |
17594 | + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); | |
17595 | + if (!pmd) { | |
17596 | + kmem_cache_free(pgd_cache, pgd); | |
17597 | + return NULL; | |
17598 | + } | |
17599 | + | |
17600 | + /* Allocate pmds, remember virtual addresses. */ | |
17601 | + for (i = 0; i < PTRS_PER_PGD; ++i) { | |
17602 | + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | |
17603 | + if (!pmd[i]) | |
17604 | + goto out_oom; | |
17605 | + } | |
17606 | + | |
17607 | + spin_lock_irqsave(&pgd_lock, flags); | |
17608 | + | |
17609 | + /* Protect against save/restore: move below 4GB under pgd_lock. */ | |
17610 | + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { | |
17611 | + int rc = xen_create_contiguous_region( | |
17612 | + (unsigned long)pgd, 0, 32); | |
17613 | + if (rc) { | |
17614 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17615 | + goto out_oom; | |
17616 | + } | |
17617 | + } | |
17618 | + | |
17619 | + /* Copy kernel pmd contents and write-protect the new pmds. */ | |
17620 | + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { | |
17621 | + unsigned long v = (unsigned long)i << PGDIR_SHIFT; | |
17622 | + pgd_t *kpgd = pgd_offset_k(v); | |
17623 | + pud_t *kpud = pud_offset(kpgd, v); | |
17624 | + pmd_t *kpmd = pmd_offset(kpud, v); | |
17625 | + memcpy(pmd[i], kpmd, PAGE_SIZE); | |
17626 | + make_lowmem_page_readonly( | |
17627 | + pmd[i], XENFEAT_writable_page_tables); | |
17628 | + } | |
17629 | + | |
17630 | + /* It is safe to poke machine addresses of pmds under the pmd_lock. */ | |
17631 | + for (i = 0; i < PTRS_PER_PGD; i++) | |
17632 | + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); | |
17633 | + | |
17634 | + /* Ensure this pgd gets picked up and pinned on save/restore. */ | |
17635 | + pgd_list_add(pgd); | |
17636 | + | |
17637 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17638 | + | |
17639 | + kfree(pmd); | |
17640 | + | |
17641 | + return pgd; | |
17642 | + | |
17643 | +out_oom: | |
17644 | + if (HAVE_SHARED_KERNEL_PMD) { | |
17645 | + for (i--; i >= 0; i--) | |
17646 | + kmem_cache_free(pmd_cache, | |
17647 | + (void *)__va(pgd_val(pgd[i])-1)); | |
17648 | + } else { | |
17649 | + for (i--; i >= 0; i--) | |
17650 | + kmem_cache_free(pmd_cache, pmd[i]); | |
17651 | + kfree(pmd); | |
17652 | + } | |
17653 | + kmem_cache_free(pgd_cache, pgd); | |
17654 | + return NULL; | |
17655 | +} | |
17656 | + | |
17657 | +void pgd_free(pgd_t *pgd) | |
17658 | +{ | |
17659 | + int i; | |
17660 | + | |
17661 | + /* | |
17662 | + * After this the pgd should not be pinned for the duration of this | |
17663 | + * function's execution. We should never sleep and thus never race: | |
17664 | + * 1. User pmds will not become write-protected under our feet due | |
17665 | + * to a concurrent mm_pin_all(). | |
17666 | + * 2. The machine addresses in PGD entries will not become invalid | |
17667 | + * due to a concurrent save/restore. | |
17668 | + */ | |
17669 | + pgd_test_and_unpin(pgd); | |
17670 | + | |
17671 | + /* in the PAE case user pgd entries are overwritten before usage */ | |
17672 | + if (PTRS_PER_PMD > 1) { | |
17673 | + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { | |
17674 | + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); | |
17675 | + kmem_cache_free(pmd_cache, pmd); | |
17676 | + } | |
17677 | + | |
17678 | + if (!HAVE_SHARED_KERNEL_PMD) { | |
17679 | + unsigned long flags; | |
17680 | + spin_lock_irqsave(&pgd_lock, flags); | |
17681 | + pgd_list_del(pgd); | |
17682 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17683 | + | |
17684 | + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { | |
17685 | + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); | |
17686 | + make_lowmem_page_writable( | |
17687 | + pmd, XENFEAT_writable_page_tables); | |
17688 | + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | |
17689 | + kmem_cache_free(pmd_cache, pmd); | |
17690 | + } | |
17691 | + | |
17692 | + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) | |
17693 | + xen_destroy_contiguous_region( | |
17694 | + (unsigned long)pgd, 0); | |
17695 | + } | |
17696 | + } | |
17697 | + | |
17698 | + /* in the non-PAE case, free_pgtables() clears user pgd entries */ | |
17699 | + kmem_cache_free(pgd_cache, pgd); | |
17700 | +} | |
17701 | + | |
17702 | +void make_lowmem_page_readonly(void *va, unsigned int feature) | |
17703 | +{ | |
17704 | + pte_t *pte; | |
17705 | + int rc; | |
17706 | + | |
17707 | + if (xen_feature(feature)) | |
17708 | + return; | |
17709 | + | |
17710 | + pte = virt_to_ptep(va); | |
17711 | + rc = HYPERVISOR_update_va_mapping( | |
17712 | + (unsigned long)va, pte_wrprotect(*pte), 0); | |
17713 | + BUG_ON(rc); | |
17714 | +} | |
17715 | + | |
17716 | +void make_lowmem_page_writable(void *va, unsigned int feature) | |
17717 | +{ | |
17718 | + pte_t *pte; | |
17719 | + int rc; | |
17720 | + | |
17721 | + if (xen_feature(feature)) | |
17722 | + return; | |
17723 | + | |
17724 | + pte = virt_to_ptep(va); | |
17725 | + rc = HYPERVISOR_update_va_mapping( | |
17726 | + (unsigned long)va, pte_mkwrite(*pte), 0); | |
17727 | + BUG_ON(rc); | |
17728 | +} | |
17729 | + | |
17730 | +void make_page_readonly(void *va, unsigned int feature) | |
17731 | +{ | |
17732 | + pte_t *pte; | |
17733 | + int rc; | |
17734 | + | |
17735 | + if (xen_feature(feature)) | |
17736 | + return; | |
17737 | + | |
17738 | + pte = virt_to_ptep(va); | |
17739 | + rc = HYPERVISOR_update_va_mapping( | |
17740 | + (unsigned long)va, pte_wrprotect(*pte), 0); | |
17741 | + if (rc) /* fallback? */ | |
17742 | + xen_l1_entry_update(pte, pte_wrprotect(*pte)); | |
17743 | + if ((unsigned long)va >= (unsigned long)high_memory) { | |
17744 | + unsigned long pfn = pte_pfn(*pte); | |
17745 | +#ifdef CONFIG_HIGHMEM | |
17746 | + if (pfn >= highstart_pfn) | |
17747 | + kmap_flush_unused(); /* flush stale writable kmaps */ | |
17748 | + else | |
17749 | +#endif | |
17750 | + make_lowmem_page_readonly( | |
17751 | + phys_to_virt(pfn << PAGE_SHIFT), feature); | |
17752 | + } | |
17753 | +} | |
17754 | + | |
17755 | +void make_page_writable(void *va, unsigned int feature) | |
17756 | +{ | |
17757 | + pte_t *pte; | |
17758 | + int rc; | |
17759 | + | |
17760 | + if (xen_feature(feature)) | |
17761 | + return; | |
17762 | + | |
17763 | + pte = virt_to_ptep(va); | |
17764 | + rc = HYPERVISOR_update_va_mapping( | |
17765 | + (unsigned long)va, pte_mkwrite(*pte), 0); | |
17766 | + if (rc) /* fallback? */ | |
17767 | + xen_l1_entry_update(pte, pte_mkwrite(*pte)); | |
17768 | + if ((unsigned long)va >= (unsigned long)high_memory) { | |
17769 | + unsigned long pfn = pte_pfn(*pte); | |
17770 | +#ifdef CONFIG_HIGHMEM | |
17771 | + if (pfn < highstart_pfn) | |
17772 | +#endif | |
17773 | + make_lowmem_page_writable( | |
17774 | + phys_to_virt(pfn << PAGE_SHIFT), feature); | |
17775 | + } | |
17776 | +} | |
17777 | + | |
17778 | +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) | |
17779 | +{ | |
17780 | + if (xen_feature(feature)) | |
17781 | + return; | |
17782 | + | |
17783 | + while (nr-- != 0) { | |
17784 | + make_page_readonly(va, feature); | |
17785 | + va = (void *)((unsigned long)va + PAGE_SIZE); | |
17786 | + } | |
17787 | +} | |
17788 | + | |
17789 | +void make_pages_writable(void *va, unsigned int nr, unsigned int feature) | |
17790 | +{ | |
17791 | + if (xen_feature(feature)) | |
17792 | + return; | |
17793 | + | |
17794 | + while (nr-- != 0) { | |
17795 | + make_page_writable(va, feature); | |
17796 | + va = (void *)((unsigned long)va + PAGE_SIZE); | |
17797 | + } | |
17798 | +} | |
17799 | + | |
17800 | +static void _pin_lock(struct mm_struct *mm, int lock) { | |
17801 | + if (lock) | |
17802 | + spin_lock(&mm->page_table_lock); | |
17803 | +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | |
17804 | + /* While mm->page_table_lock protects us against insertions and | |
17805 | + * removals of higher level page table pages, it doesn't protect | |
17806 | + * against updates of pte-s. Such updates, however, require the | |
17807 | + * pte pages to be in consistent state (unpinned+writable or | |
17808 | + * pinned+readonly). The pinning and attribute changes, however | |
17809 | + * cannot be done atomically, which is why such updates must be | |
17810 | + * prevented from happening concurrently. | |
17811 | + * Note that no pte lock can ever elsewhere be acquired nesting | |
17812 | + * with an already acquired one in the same mm, or with the mm's | |
17813 | + * page_table_lock already acquired, as that would break in the | |
17814 | + * non-split case (where all these are actually resolving to the | |
17815 | + * one page_table_lock). Thus acquiring all of them here is not | |
17816 | + * going to result in dead locks, and the order of acquires | |
17817 | + * doesn't matter. | |
17818 | + */ | |
17819 | + { | |
17820 | + pgd_t *pgd = mm->pgd; | |
17821 | + unsigned g; | |
17822 | + | |
17823 | + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { | |
17824 | + pud_t *pud; | |
17825 | + unsigned u; | |
17826 | + | |
17827 | + if (pgd_none(*pgd)) | |
17828 | + continue; | |
17829 | + pud = pud_offset(pgd, 0); | |
17830 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
17831 | + pmd_t *pmd; | |
17832 | + unsigned m; | |
17833 | + | |
17834 | + if (pud_none(*pud)) | |
17835 | + continue; | |
17836 | + pmd = pmd_offset(pud, 0); | |
17837 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
17838 | + spinlock_t *ptl; | |
17839 | + | |
17840 | + if (pmd_none(*pmd)) | |
17841 | + continue; | |
17842 | + ptl = pte_lockptr(0, pmd); | |
17843 | + if (lock) | |
17844 | + spin_lock(ptl); | |
17845 | + else | |
17846 | + spin_unlock(ptl); | |
17847 | + } | |
17848 | + } | |
17849 | + } | |
17850 | + } | |
17851 | +#endif | |
17852 | + if (!lock) | |
17853 | + spin_unlock(&mm->page_table_lock); | |
17854 | +} | |
17855 | +#define pin_lock(mm) _pin_lock(mm, 1) | |
17856 | +#define pin_unlock(mm) _pin_lock(mm, 0) | |
17857 | + | |
17858 | +#define PIN_BATCH 4 | |
17859 | +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); | |
17860 | + | |
17861 | +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, | |
17862 | + unsigned int cpu, unsigned seq) | |
17863 | +{ | |
17864 | + unsigned long pfn = page_to_pfn(page); | |
17865 | + | |
17866 | + if (PageHighMem(page)) { | |
17867 | + if (pgprot_val(flags) & _PAGE_RW) | |
17868 | + clear_bit(PG_pinned, &page->flags); | |
17869 | + else | |
17870 | + set_bit(PG_pinned, &page->flags); | |
17871 | + } else { | |
17872 | + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
17873 | + (unsigned long)__va(pfn << PAGE_SHIFT), | |
17874 | + pfn_pte(pfn, flags), 0); | |
17875 | + if (unlikely(++seq == PIN_BATCH)) { | |
17876 | + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
17877 | + PIN_BATCH, NULL))) | |
17878 | + BUG(); | |
17879 | + seq = 0; | |
17880 | + } | |
17881 | + } | |
17882 | + | |
17883 | + return seq; | |
17884 | +} | |
17885 | + | |
17886 | +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) | |
17887 | +{ | |
17888 | + pgd_t *pgd = pgd_base; | |
17889 | + pud_t *pud; | |
17890 | + pmd_t *pmd; | |
17891 | + int g, u, m; | |
17892 | + unsigned int cpu, seq; | |
17893 | + | |
17894 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
17895 | + return; | |
17896 | + | |
17897 | + cpu = get_cpu(); | |
17898 | + | |
17899 | + for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { | |
17900 | + if (pgd_none(*pgd)) | |
17901 | + continue; | |
17902 | + pud = pud_offset(pgd, 0); | |
17903 | + if (PTRS_PER_PUD > 1) /* not folded */ | |
17904 | + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); | |
17905 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
17906 | + if (pud_none(*pud)) | |
17907 | + continue; | |
17908 | + pmd = pmd_offset(pud, 0); | |
17909 | + if (PTRS_PER_PMD > 1) /* not folded */ | |
17910 | + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); | |
17911 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
17912 | + if (pmd_none(*pmd)) | |
17913 | + continue; | |
17914 | + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); | |
17915 | + } | |
17916 | + } | |
17917 | + } | |
17918 | + | |
17919 | + if (likely(seq != 0)) { | |
17920 | + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
17921 | + (unsigned long)pgd_base, | |
17922 | + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
17923 | + UVMF_TLB_FLUSH); | |
17924 | + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
17925 | + seq + 1, NULL))) | |
17926 | + BUG(); | |
17927 | + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, | |
17928 | + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), | |
17929 | + UVMF_TLB_FLUSH)) | |
17930 | + BUG(); | |
17931 | + | |
17932 | + put_cpu(); | |
17933 | +} | |
17934 | + | |
17935 | +static void __pgd_pin(pgd_t *pgd) | |
17936 | +{ | |
17937 | + pgd_walk(pgd, PAGE_KERNEL_RO); | |
17938 | + kmap_flush_unused(); | |
17939 | + xen_pgd_pin(__pa(pgd)); | |
17940 | + set_bit(PG_pinned, &virt_to_page(pgd)->flags); | |
17941 | +} | |
17942 | + | |
17943 | +static void __pgd_unpin(pgd_t *pgd) | |
17944 | +{ | |
17945 | + xen_pgd_unpin(__pa(pgd)); | |
17946 | + pgd_walk(pgd, PAGE_KERNEL); | |
17947 | + clear_bit(PG_pinned, &virt_to_page(pgd)->flags); | |
17948 | +} | |
17949 | + | |
17950 | +static void pgd_test_and_unpin(pgd_t *pgd) | |
17951 | +{ | |
17952 | + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags)) | |
17953 | + __pgd_unpin(pgd); | |
17954 | +} | |
17955 | + | |
17956 | +void mm_pin(struct mm_struct *mm) | |
17957 | +{ | |
17958 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
17959 | + return; | |
17960 | + pin_lock(mm); | |
17961 | + __pgd_pin(mm->pgd); | |
17962 | + pin_unlock(mm); | |
17963 | +} | |
17964 | + | |
17965 | +void mm_unpin(struct mm_struct *mm) | |
17966 | +{ | |
17967 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
17968 | + return; | |
17969 | + pin_lock(mm); | |
17970 | + __pgd_unpin(mm->pgd); | |
17971 | + pin_unlock(mm); | |
17972 | +} | |
17973 | + | |
17974 | +void mm_pin_all(void) | |
17975 | +{ | |
17976 | + struct page *page; | |
17977 | + unsigned long flags; | |
17978 | + | |
17979 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
17980 | + return; | |
17981 | + | |
17982 | + /* | |
17983 | + * Allow uninterrupted access to the pgd_list. Also protects | |
17984 | + * __pgd_pin() by disabling preemption. | |
17985 | + * All other CPUs must be at a safe point (e.g., in stop_machine | |
17986 | + * or offlined entirely). | |
17987 | + */ | |
17988 | + spin_lock_irqsave(&pgd_lock, flags); | |
17989 | + for (page = pgd_list; page; page = (struct page *)page->index) { | |
17990 | + if (!test_bit(PG_pinned, &page->flags)) | |
17991 | + __pgd_pin((pgd_t *)page_address(page)); | |
17992 | + } | |
17993 | + spin_unlock_irqrestore(&pgd_lock, flags); | |
17994 | +} | |
17995 | + | |
17996 | +void _arch_dup_mmap(struct mm_struct *mm) | |
17997 | +{ | |
17998 | + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) | |
17999 | + mm_pin(mm); | |
18000 | +} | |
18001 | + | |
18002 | +void _arch_exit_mmap(struct mm_struct *mm) | |
18003 | +{ | |
18004 | + struct task_struct *tsk = current; | |
18005 | + | |
18006 | + task_lock(tsk); | |
18007 | + | |
18008 | + /* | |
18009 | + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() | |
18010 | + * *much* faster this way, as no tlb flushes means bigger wrpt batches. | |
18011 | + */ | |
18012 | + if (tsk->active_mm == mm) { | |
18013 | + tsk->active_mm = &init_mm; | |
18014 | + atomic_inc(&init_mm.mm_count); | |
18015 | + | |
18016 | + switch_mm(mm, &init_mm, tsk); | |
18017 | + | |
18018 | + atomic_dec(&mm->mm_count); | |
18019 | + BUG_ON(atomic_read(&mm->mm_count) == 0); | |
18020 | + } | |
18021 | + | |
18022 | + task_unlock(tsk); | |
18023 | + | |
18024 | + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) && | |
18025 | + (atomic_read(&mm->mm_count) == 1) && | |
18026 | + !mm->context.has_foreign_mappings) | |
18027 | + mm_unpin(mm); | |
18028 | +} | |
18029 | Index: head-2008-11-25/arch/x86/oprofile/xenoprof.c | |
18030 | =================================================================== | |
18031 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
18032 | +++ head-2008-11-25/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100 | |
18033 | @@ -0,0 +1,179 @@ | |
18034 | +/** | |
18035 | + * @file xenoprof.c | |
18036 | + * | |
18037 | + * @remark Copyright 2002 OProfile authors | |
18038 | + * @remark Read the file COPYING | |
18039 | + * | |
18040 | + * @author John Levon <levon@movementarian.org> | |
18041 | + * | |
18042 | + * Modified by Aravind Menon and Jose Renato Santos for Xen | |
18043 | + * These modifications are: | |
18044 | + * Copyright (C) 2005 Hewlett-Packard Co. | |
18045 | + * | |
18046 | + * x86-specific part | |
18047 | + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> | |
18048 | + * VA Linux Systems Japan K.K. | |
18049 | + */ | |
18050 | + | |
18051 | +#include <linux/init.h> | |
18052 | +#include <linux/oprofile.h> | |
18053 | +#include <linux/sched.h> | |
18054 | +#include <asm/pgtable.h> | |
18055 | + | |
18056 | +#include <xen/driver_util.h> | |
18057 | +#include <xen/interface/xen.h> | |
18058 | +#include <xen/interface/xenoprof.h> | |
18059 | +#include <xen/xenoprof.h> | |
18060 | +#include "op_counter.h" | |
18061 | + | |
18062 | +static unsigned int num_events = 0; | |
18063 | + | |
18064 | +void __init xenoprof_arch_init_counter(struct xenoprof_init *init) | |
18065 | +{ | |
18066 | + num_events = init->num_events; | |
18067 | + /* just in case - make sure we do not overflow event list | |
18068 | + (i.e. counter_config list) */ | |
18069 | + if (num_events > OP_MAX_COUNTER) { | |
18070 | + num_events = OP_MAX_COUNTER; | |
18071 | + init->num_events = num_events; | |
18072 | + } | |
18073 | +} | |
18074 | + | |
18075 | +void xenoprof_arch_counter(void) | |
18076 | +{ | |
18077 | + int i; | |
18078 | + struct xenoprof_counter counter; | |
18079 | + | |
18080 | + for (i=0; i<num_events; i++) { | |
18081 | + counter.ind = i; | |
18082 | + counter.count = (uint64_t)counter_config[i].count; | |
18083 | + counter.enabled = (uint32_t)counter_config[i].enabled; | |
18084 | + counter.event = (uint32_t)counter_config[i].event; | |
18085 | + counter.kernel = (uint32_t)counter_config[i].kernel; | |
18086 | + counter.user = (uint32_t)counter_config[i].user; | |
18087 | + counter.unit_mask = (uint64_t)counter_config[i].unit_mask; | |
18088 | + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter, | |
18089 | + &counter)); | |
18090 | + } | |
18091 | +} | |
18092 | + | |
18093 | +void xenoprof_arch_start(void) | |
18094 | +{ | |
18095 | + /* nothing */ | |
18096 | +} | |
18097 | + | |
18098 | +void xenoprof_arch_stop(void) | |
18099 | +{ | |
18100 | + /* nothing */ | |
18101 | +} | |
18102 | + | |
18103 | +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf) | |
18104 | +{ | |
18105 | + if (sbuf->buffer) { | |
18106 | + vunmap(sbuf->buffer); | |
18107 | + sbuf->buffer = NULL; | |
18108 | + } | |
18109 | +} | |
18110 | + | |
18111 | +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer, | |
18112 | + struct xenoprof_shared_buffer * sbuf) | |
18113 | +{ | |
18114 | + int npages, ret; | |
18115 | + struct vm_struct *area; | |
18116 | + | |
18117 | + sbuf->buffer = NULL; | |
18118 | + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) ) | |
18119 | + return ret; | |
18120 | + | |
18121 | + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1; | |
18122 | + | |
18123 | + area = alloc_vm_area(npages * PAGE_SIZE); | |
18124 | + if (area == NULL) | |
18125 | + return -ENOMEM; | |
18126 | + | |
18127 | + if ( (ret = direct_kernel_remap_pfn_range( | |
18128 | + (unsigned long)area->addr, | |
18129 | + get_buffer->buf_gmaddr >> PAGE_SHIFT, | |
18130 | + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE), | |
18131 | + DOMID_SELF)) ) { | |
18132 | + vunmap(area->addr); | |
18133 | + return ret; | |
18134 | + } | |
18135 | + | |
18136 | + sbuf->buffer = area->addr; | |
18137 | + return ret; | |
18138 | +} | |
18139 | + | |
18140 | +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain, | |
18141 | + struct xenoprof_shared_buffer * sbuf) | |
18142 | +{ | |
18143 | + int ret; | |
18144 | + int npages; | |
18145 | + struct vm_struct *area; | |
18146 | + pgprot_t prot = __pgprot(_KERNPG_TABLE); | |
18147 | + | |
18148 | + sbuf->buffer = NULL; | |
18149 | + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain); | |
18150 | + if (ret) | |
18151 | + goto out; | |
18152 | + | |
18153 | + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1; | |
18154 | + | |
18155 | + area = alloc_vm_area(npages * PAGE_SIZE); | |
18156 | + if (area == NULL) { | |
18157 | + ret = -ENOMEM; | |
18158 | + goto out; | |
18159 | + } | |
18160 | + | |
18161 | + ret = direct_kernel_remap_pfn_range( | |
18162 | + (unsigned long)area->addr, | |
18163 | + pdomain->buf_gmaddr >> PAGE_SHIFT, | |
18164 | + npages * PAGE_SIZE, prot, DOMID_SELF); | |
18165 | + if (ret) { | |
18166 | + vunmap(area->addr); | |
18167 | + goto out; | |
18168 | + } | |
18169 | + sbuf->buffer = area->addr; | |
18170 | + | |
18171 | +out: | |
18172 | + return ret; | |
18173 | +} | |
18174 | + | |
18175 | +struct op_counter_config counter_config[OP_MAX_COUNTER]; | |
18176 | + | |
18177 | +int xenoprof_create_files(struct super_block * sb, struct dentry * root) | |
18178 | +{ | |
18179 | + unsigned int i; | |
18180 | + | |
18181 | + for (i = 0; i < num_events; ++i) { | |
18182 | + struct dentry * dir; | |
18183 | + char buf[2]; | |
18184 | + | |
18185 | + snprintf(buf, 2, "%d", i); | |
18186 | + dir = oprofilefs_mkdir(sb, root, buf); | |
18187 | + oprofilefs_create_ulong(sb, dir, "enabled", | |
18188 | + &counter_config[i].enabled); | |
18189 | + oprofilefs_create_ulong(sb, dir, "event", | |
18190 | + &counter_config[i].event); | |
18191 | + oprofilefs_create_ulong(sb, dir, "count", | |
18192 | + &counter_config[i].count); | |
18193 | + oprofilefs_create_ulong(sb, dir, "unit_mask", | |
18194 | + &counter_config[i].unit_mask); | |
18195 | + oprofilefs_create_ulong(sb, dir, "kernel", | |
18196 | + &counter_config[i].kernel); | |
18197 | + oprofilefs_create_ulong(sb, dir, "user", | |
18198 | + &counter_config[i].user); | |
18199 | + } | |
18200 | + | |
18201 | + return 0; | |
18202 | +} | |
18203 | + | |
18204 | +int __init oprofile_arch_init(struct oprofile_operations * ops) | |
18205 | +{ | |
18206 | + return xenoprofile_init(ops); | |
18207 | +} | |
18208 | + | |
18209 | +void oprofile_arch_exit(void) | |
18210 | +{ | |
18211 | + xenoprofile_exit(); | |
18212 | +} | |
18213 | Index: head-2008-11-25/arch/x86/pci/irq-xen.c | |
18214 | =================================================================== | |
18215 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
18216 | +++ head-2008-11-25/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100 | |
18217 | @@ -0,0 +1,1211 @@ | |
18218 | +/* | |
18219 | + * Low-Level PCI Support for PC -- Routing of Interrupts | |
18220 | + * | |
18221 | + * (c) 1999--2000 Martin Mares <mj@ucw.cz> | |
18222 | + */ | |
18223 | + | |
18224 | +#include <linux/types.h> | |
18225 | +#include <linux/kernel.h> | |
18226 | +#include <linux/pci.h> | |
18227 | +#include <linux/init.h> | |
18228 | +#include <linux/slab.h> | |
18229 | +#include <linux/interrupt.h> | |
18230 | +#include <linux/dmi.h> | |
18231 | +#include <asm/io.h> | |
18232 | +#include <asm/smp.h> | |
18233 | +#include <asm/io_apic.h> | |
18234 | +#include <linux/irq.h> | |
18235 | +#include <linux/acpi.h> | |
18236 | + | |
18237 | +#include "pci.h" | |
18238 | + | |
18239 | +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) | |
18240 | +#define PIRQ_VERSION 0x0100 | |
18241 | + | |
18242 | +static int broken_hp_bios_irq9; | |
18243 | +static int acer_tm360_irqrouting; | |
18244 | + | |
18245 | +static struct irq_routing_table *pirq_table; | |
18246 | + | |
18247 | +static int pirq_enable_irq(struct pci_dev *dev); | |
18248 | + | |
18249 | +/* | |
18250 | + * Never use: 0, 1, 2 (timer, keyboard, and cascade) | |
18251 | + * Avoid using: 13, 14 and 15 (FP error and IDE). | |
18252 | + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse) | |
18253 | + */ | |
18254 | +unsigned int pcibios_irq_mask = 0xfff8; | |
18255 | + | |
18256 | +static int pirq_penalty[16] = { | |
18257 | + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000, | |
18258 | + 0, 0, 0, 0, 1000, 100000, 100000, 100000 | |
18259 | +}; | |
18260 | + | |
18261 | +struct irq_router { | |
18262 | + char *name; | |
18263 | + u16 vendor, device; | |
18264 | + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); | |
18265 | + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); | |
18266 | +}; | |
18267 | + | |
18268 | +struct irq_router_handler { | |
18269 | + u16 vendor; | |
18270 | + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); | |
18271 | +}; | |
18272 | + | |
18273 | +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; | |
18274 | +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; | |
18275 | + | |
18276 | +/* | |
18277 | + * Check passed address for the PCI IRQ Routing Table signature | |
18278 | + * and perform checksum verification. | |
18279 | + */ | |
18280 | + | |
18281 | +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) | |
18282 | +{ | |
18283 | + struct irq_routing_table *rt; | |
18284 | + int i; | |
18285 | + u8 sum; | |
18286 | + | |
18287 | + rt = (struct irq_routing_table *) addr; | |
18288 | + if (rt->signature != PIRQ_SIGNATURE || | |
18289 | + rt->version != PIRQ_VERSION || | |
18290 | + rt->size % 16 || | |
18291 | + rt->size < sizeof(struct irq_routing_table)) | |
18292 | + return NULL; | |
18293 | + sum = 0; | |
18294 | + for (i=0; i < rt->size; i++) | |
18295 | + sum += addr[i]; | |
18296 | + if (!sum) { | |
18297 | + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); | |
18298 | + return rt; | |
18299 | + } | |
18300 | + return NULL; | |
18301 | +} | |
18302 | + | |
18303 | + | |
18304 | + | |
18305 | +/* | |
18306 | + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. | |
18307 | + */ | |
18308 | + | |
18309 | +static struct irq_routing_table * __init pirq_find_routing_table(void) | |
18310 | +{ | |
18311 | + u8 *addr; | |
18312 | + struct irq_routing_table *rt; | |
18313 | + | |
18314 | +#ifdef CONFIG_XEN | |
18315 | + if (!is_initial_xendomain()) | |
18316 | + return NULL; | |
18317 | +#endif | |
18318 | + if (pirq_table_addr) { | |
18319 | + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr)); | |
18320 | + if (rt) | |
18321 | + return rt; | |
18322 | + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); | |
18323 | + } | |
18324 | + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) { | |
18325 | + rt = pirq_check_routing_table(addr); | |
18326 | + if (rt) | |
18327 | + return rt; | |
18328 | + } | |
18329 | + return NULL; | |
18330 | +} | |
18331 | + | |
18332 | +/* | |
18333 | + * If we have a IRQ routing table, use it to search for peer host | |
18334 | + * bridges. It's a gross hack, but since there are no other known | |
18335 | + * ways how to get a list of buses, we have to go this way. | |
18336 | + */ | |
18337 | + | |
18338 | +static void __init pirq_peer_trick(void) | |
18339 | +{ | |
18340 | + struct irq_routing_table *rt = pirq_table; | |
18341 | + u8 busmap[256]; | |
18342 | + int i; | |
18343 | + struct irq_info *e; | |
18344 | + | |
18345 | + memset(busmap, 0, sizeof(busmap)); | |
18346 | + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { | |
18347 | + e = &rt->slots[i]; | |
18348 | +#ifdef DEBUG | |
18349 | + { | |
18350 | + int j; | |
18351 | + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); | |
18352 | + for(j=0; j<4; j++) | |
18353 | + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); | |
18354 | + DBG("\n"); | |
18355 | + } | |
18356 | +#endif | |
18357 | + busmap[e->bus] = 1; | |
18358 | + } | |
18359 | + for(i = 1; i < 256; i++) { | |
18360 | + if (!busmap[i] || pci_find_bus(0, i)) | |
18361 | + continue; | |
18362 | + if (pci_scan_bus(i, &pci_root_ops, NULL)) | |
18363 | + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i); | |
18364 | + } | |
18365 | + pcibios_last_bus = -1; | |
18366 | +} | |
18367 | + | |
18368 | +/* | |
18369 | + * Code for querying and setting of IRQ routes on various interrupt routers. | |
18370 | + */ | |
18371 | + | |
18372 | +void eisa_set_level_irq(unsigned int irq) | |
18373 | +{ | |
18374 | + unsigned char mask = 1 << (irq & 7); | |
18375 | + unsigned int port = 0x4d0 + (irq >> 3); | |
18376 | + unsigned char val; | |
18377 | + static u16 eisa_irq_mask; | |
18378 | + | |
18379 | + if (irq >= 16 || (1 << irq) & eisa_irq_mask) | |
18380 | + return; | |
18381 | + | |
18382 | + eisa_irq_mask |= (1 << irq); | |
18383 | + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); | |
18384 | + val = inb(port); | |
18385 | + if (!(val & mask)) { | |
18386 | + DBG(KERN_DEBUG " -> edge"); | |
18387 | + outb(val | mask, port); | |
18388 | + } | |
18389 | +} | |
18390 | + | |
18391 | +/* | |
18392 | + * Common IRQ routing practice: nybbles in config space, | |
18393 | + * offset by some magic constant. | |
18394 | + */ | |
18395 | +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) | |
18396 | +{ | |
18397 | + u8 x; | |
18398 | + unsigned reg = offset + (nr >> 1); | |
18399 | + | |
18400 | + pci_read_config_byte(router, reg, &x); | |
18401 | + return (nr & 1) ? (x >> 4) : (x & 0xf); | |
18402 | +} | |
18403 | + | |
18404 | +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) | |
18405 | +{ | |
18406 | + u8 x; | |
18407 | + unsigned reg = offset + (nr >> 1); | |
18408 | + | |
18409 | + pci_read_config_byte(router, reg, &x); | |
18410 | + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val); | |
18411 | + pci_write_config_byte(router, reg, x); | |
18412 | +} | |
18413 | + | |
18414 | +/* | |
18415 | + * ALI pirq entries are damn ugly, and completely undocumented. | |
18416 | + * This has been figured out from pirq tables, and it's not a pretty | |
18417 | + * picture. | |
18418 | + */ | |
18419 | +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18420 | +{ | |
18421 | + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; | |
18422 | + | |
18423 | + return irqmap[read_config_nybble(router, 0x48, pirq-1)]; | |
18424 | +} | |
18425 | + | |
18426 | +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18427 | +{ | |
18428 | + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; | |
18429 | + unsigned int val = irqmap[irq]; | |
18430 | + | |
18431 | + if (val) { | |
18432 | + write_config_nybble(router, 0x48, pirq-1, val); | |
18433 | + return 1; | |
18434 | + } | |
18435 | + return 0; | |
18436 | +} | |
18437 | + | |
18438 | +/* | |
18439 | + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is | |
18440 | + * just a pointer to the config space. | |
18441 | + */ | |
18442 | +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18443 | +{ | |
18444 | + u8 x; | |
18445 | + | |
18446 | + pci_read_config_byte(router, pirq, &x); | |
18447 | + return (x < 16) ? x : 0; | |
18448 | +} | |
18449 | + | |
18450 | +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18451 | +{ | |
18452 | + pci_write_config_byte(router, pirq, irq); | |
18453 | + return 1; | |
18454 | +} | |
18455 | + | |
18456 | +/* | |
18457 | + * The VIA pirq rules are nibble-based, like ALI, | |
18458 | + * but without the ugly irq number munging. | |
18459 | + * However, PIRQD is in the upper instead of lower 4 bits. | |
18460 | + */ | |
18461 | +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18462 | +{ | |
18463 | + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); | |
18464 | +} | |
18465 | + | |
18466 | +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18467 | +{ | |
18468 | + write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq); | |
18469 | + return 1; | |
18470 | +} | |
18471 | + | |
18472 | +/* | |
18473 | + * The VIA pirq rules are nibble-based, like ALI, | |
18474 | + * but without the ugly irq number munging. | |
18475 | + * However, for 82C586, nibble map is different . | |
18476 | + */ | |
18477 | +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18478 | +{ | |
18479 | + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; | |
18480 | + return read_config_nybble(router, 0x55, pirqmap[pirq-1]); | |
18481 | +} | |
18482 | + | |
18483 | +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18484 | +{ | |
18485 | + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; | |
18486 | + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); | |
18487 | + return 1; | |
18488 | +} | |
18489 | + | |
18490 | +/* | |
18491 | + * ITE 8330G pirq rules are nibble-based | |
18492 | + * FIXME: pirqmap may be { 1, 0, 3, 2 }, | |
18493 | + * 2+3 are both mapped to irq 9 on my system | |
18494 | + */ | |
18495 | +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18496 | +{ | |
18497 | + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; | |
18498 | + return read_config_nybble(router,0x43, pirqmap[pirq-1]); | |
18499 | +} | |
18500 | + | |
18501 | +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18502 | +{ | |
18503 | + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; | |
18504 | + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); | |
18505 | + return 1; | |
18506 | +} | |
18507 | + | |
18508 | +/* | |
18509 | + * OPTI: high four bits are nibble pointer.. | |
18510 | + * I wonder what the low bits do? | |
18511 | + */ | |
18512 | +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18513 | +{ | |
18514 | + return read_config_nybble(router, 0xb8, pirq >> 4); | |
18515 | +} | |
18516 | + | |
18517 | +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18518 | +{ | |
18519 | + write_config_nybble(router, 0xb8, pirq >> 4, irq); | |
18520 | + return 1; | |
18521 | +} | |
18522 | + | |
18523 | +/* | |
18524 | + * Cyrix: nibble offset 0x5C | |
18525 | + * 0x5C bits 7:4 is INTB bits 3:0 is INTA | |
18526 | + * 0x5D bits 7:4 is INTD bits 3:0 is INTC | |
18527 | + */ | |
18528 | +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18529 | +{ | |
18530 | + return read_config_nybble(router, 0x5C, (pirq-1)^1); | |
18531 | +} | |
18532 | + | |
18533 | +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18534 | +{ | |
18535 | + write_config_nybble(router, 0x5C, (pirq-1)^1, irq); | |
18536 | + return 1; | |
18537 | +} | |
18538 | + | |
18539 | +/* | |
18540 | + * PIRQ routing for SiS 85C503 router used in several SiS chipsets. | |
18541 | + * We have to deal with the following issues here: | |
18542 | + * - vendors have different ideas about the meaning of link values | |
18543 | + * - some onboard devices (integrated in the chipset) have special | |
18544 | + * links and are thus routed differently (i.e. not via PCI INTA-INTD) | |
18545 | + * - different revision of the router have a different layout for | |
18546 | + * the routing registers, particularly for the onchip devices | |
18547 | + * | |
18548 | + * For all routing registers the common thing is we have one byte | |
18549 | + * per routeable link which is defined as: | |
18550 | + * bit 7 IRQ mapping enabled (0) or disabled (1) | |
18551 | + * bits [6:4] reserved (sometimes used for onchip devices) | |
18552 | + * bits [3:0] IRQ to map to | |
18553 | + * allowed: 3-7, 9-12, 14-15 | |
18554 | + * reserved: 0, 1, 2, 8, 13 | |
18555 | + * | |
18556 | + * The config-space registers located at 0x41/0x42/0x43/0x44 are | |
18557 | + * always used to route the normal PCI INT A/B/C/D respectively. | |
18558 | + * Apparently there are systems implementing PCI routing table using | |
18559 | + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. | |
18560 | + * We try our best to handle both link mappings. | |
18561 | + * | |
18562 | + * Currently (2003-05-21) it appears most SiS chipsets follow the | |
18563 | + * definition of routing registers from the SiS-5595 southbridge. | |
18564 | + * According to the SiS 5595 datasheets the revision id's of the | |
18565 | + * router (ISA-bridge) should be 0x01 or 0xb0. | |
18566 | + * | |
18567 | + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1. | |
18568 | + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets. | |
18569 | + * They seem to work with the current routing code. However there is | |
18570 | + * some concern because of the two USB-OHCI HCs (original SiS 5595 | |
18571 | + * had only one). YMMV. | |
18572 | + * | |
18573 | + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1: | |
18574 | + * | |
18575 | + * 0x61: IDEIRQ: | |
18576 | + * bits [6:5] must be written 01 | |
18577 | + * bit 4 channel-select primary (0), secondary (1) | |
18578 | + * | |
18579 | + * 0x62: USBIRQ: | |
18580 | + * bit 6 OHCI function disabled (0), enabled (1) | |
18581 | + * | |
18582 | + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved | |
18583 | + * | |
18584 | + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved | |
18585 | + * | |
18586 | + * We support USBIRQ (in addition to INTA-INTD) and keep the | |
18587 | + * IDE, ACPI and DAQ routing untouched as set by the BIOS. | |
18588 | + * | |
18589 | + * Currently the only reported exception is the new SiS 65x chipset | |
18590 | + * which includes the SiS 69x southbridge. Here we have the 85C503 | |
18591 | + * router revision 0x04 and there are changes in the register layout | |
18592 | + * mostly related to the different USB HCs with USB 2.0 support. | |
18593 | + * | |
18594 | + * Onchip routing for router rev-id 0x04 (try-and-error observation) | |
18595 | + * | |
18596 | + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs | |
18597 | + * bit 6-4 are probably unused, not like 5595 | |
18598 | + */ | |
18599 | + | |
18600 | +#define PIRQ_SIS_IRQ_MASK 0x0f | |
18601 | +#define PIRQ_SIS_IRQ_DISABLE 0x80 | |
18602 | +#define PIRQ_SIS_USB_ENABLE 0x40 | |
18603 | + | |
18604 | +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18605 | +{ | |
18606 | + u8 x; | |
18607 | + int reg; | |
18608 | + | |
18609 | + reg = pirq; | |
18610 | + if (reg >= 0x01 && reg <= 0x04) | |
18611 | + reg += 0x40; | |
18612 | + pci_read_config_byte(router, reg, &x); | |
18613 | + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); | |
18614 | +} | |
18615 | + | |
18616 | +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18617 | +{ | |
18618 | + u8 x; | |
18619 | + int reg; | |
18620 | + | |
18621 | + reg = pirq; | |
18622 | + if (reg >= 0x01 && reg <= 0x04) | |
18623 | + reg += 0x40; | |
18624 | + pci_read_config_byte(router, reg, &x); | |
18625 | + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); | |
18626 | + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; | |
18627 | + pci_write_config_byte(router, reg, x); | |
18628 | + return 1; | |
18629 | +} | |
18630 | + | |
18631 | + | |
18632 | +/* | |
18633 | + * VLSI: nibble offset 0x74 - educated guess due to routing table and | |
18634 | + * config space of VLSI 82C534 PCI-bridge/router (1004:0102) | |
18635 | + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard | |
18636 | + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6 | |
18637 | + * for the busbridge to the docking station. | |
18638 | + */ | |
18639 | + | |
18640 | +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18641 | +{ | |
18642 | + if (pirq > 8) { | |
18643 | + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); | |
18644 | + return 0; | |
18645 | + } | |
18646 | + return read_config_nybble(router, 0x74, pirq-1); | |
18647 | +} | |
18648 | + | |
18649 | +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18650 | +{ | |
18651 | + if (pirq > 8) { | |
18652 | + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); | |
18653 | + return 0; | |
18654 | + } | |
18655 | + write_config_nybble(router, 0x74, pirq-1, irq); | |
18656 | + return 1; | |
18657 | +} | |
18658 | + | |
18659 | +/* | |
18660 | + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index | |
18661 | + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register | |
18662 | + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect | |
18663 | + * register is a straight binary coding of desired PIC IRQ (low nibble). | |
18664 | + * | |
18665 | + * The 'link' value in the PIRQ table is already in the correct format | |
18666 | + * for the Index register. There are some special index values: | |
18667 | + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, | |
18668 | + * and 0x03 for SMBus. | |
18669 | + */ | |
18670 | +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18671 | +{ | |
18672 | + outb_p(pirq, 0xc00); | |
18673 | + return inb(0xc01) & 0xf; | |
18674 | +} | |
18675 | + | |
18676 | +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18677 | +{ | |
18678 | + outb_p(pirq, 0xc00); | |
18679 | + outb_p(irq, 0xc01); | |
18680 | + return 1; | |
18681 | +} | |
18682 | + | |
18683 | +/* Support for AMD756 PCI IRQ Routing | |
18684 | + * Jhon H. Caicedo <jhcaiced@osso.org.co> | |
18685 | + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced) | |
18686 | + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced) | |
18687 | + * The AMD756 pirq rules are nibble-based | |
18688 | + * offset 0x56 0-3 PIRQA 4-7 PIRQB | |
18689 | + * offset 0x57 0-3 PIRQC 4-7 PIRQD | |
18690 | + */ | |
18691 | +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) | |
18692 | +{ | |
18693 | + u8 irq; | |
18694 | + irq = 0; | |
18695 | + if (pirq <= 4) | |
18696 | + { | |
18697 | + irq = read_config_nybble(router, 0x56, pirq - 1); | |
18698 | + } | |
18699 | + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", | |
18700 | + dev->vendor, dev->device, pirq, irq); | |
18701 | + return irq; | |
18702 | +} | |
18703 | + | |
18704 | +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18705 | +{ | |
18706 | + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", | |
18707 | + dev->vendor, dev->device, pirq, irq); | |
18708 | + if (pirq <= 4) | |
18709 | + { | |
18710 | + write_config_nybble(router, 0x56, pirq - 1, irq); | |
18711 | + } | |
18712 | + return 1; | |
18713 | +} | |
18714 | + | |
18715 | +#ifdef CONFIG_PCI_BIOS | |
18716 | + | |
18717 | +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | |
18718 | +{ | |
18719 | + struct pci_dev *bridge; | |
18720 | + int pin = pci_get_interrupt_pin(dev, &bridge); | |
18721 | + return pcibios_set_irq_routing(bridge, pin, irq); | |
18722 | +} | |
18723 | + | |
18724 | +#endif | |
18725 | + | |
18726 | +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18727 | +{ | |
18728 | + static struct pci_device_id __initdata pirq_440gx[] = { | |
18729 | + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, | |
18730 | + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, | |
18731 | + { }, | |
18732 | + }; | |
18733 | + | |
18734 | + /* 440GX has a proprietary PIRQ router -- don't use it */ | |
18735 | + if (pci_dev_present(pirq_440gx)) | |
18736 | + return 0; | |
18737 | + | |
18738 | + switch(device) | |
18739 | + { | |
18740 | + case PCI_DEVICE_ID_INTEL_82371FB_0: | |
18741 | + case PCI_DEVICE_ID_INTEL_82371SB_0: | |
18742 | + case PCI_DEVICE_ID_INTEL_82371AB_0: | |
18743 | + case PCI_DEVICE_ID_INTEL_82371MX: | |
18744 | + case PCI_DEVICE_ID_INTEL_82443MX_0: | |
18745 | + case PCI_DEVICE_ID_INTEL_82801AA_0: | |
18746 | + case PCI_DEVICE_ID_INTEL_82801AB_0: | |
18747 | + case PCI_DEVICE_ID_INTEL_82801BA_0: | |
18748 | + case PCI_DEVICE_ID_INTEL_82801BA_10: | |
18749 | + case PCI_DEVICE_ID_INTEL_82801CA_0: | |
18750 | + case PCI_DEVICE_ID_INTEL_82801CA_12: | |
18751 | + case PCI_DEVICE_ID_INTEL_82801DB_0: | |
18752 | + case PCI_DEVICE_ID_INTEL_82801E_0: | |
18753 | + case PCI_DEVICE_ID_INTEL_82801EB_0: | |
18754 | + case PCI_DEVICE_ID_INTEL_ESB_1: | |
18755 | + case PCI_DEVICE_ID_INTEL_ICH6_0: | |
18756 | + case PCI_DEVICE_ID_INTEL_ICH6_1: | |
18757 | + case PCI_DEVICE_ID_INTEL_ICH7_0: | |
18758 | + case PCI_DEVICE_ID_INTEL_ICH7_1: | |
18759 | + case PCI_DEVICE_ID_INTEL_ICH7_30: | |
18760 | + case PCI_DEVICE_ID_INTEL_ICH7_31: | |
18761 | + case PCI_DEVICE_ID_INTEL_ESB2_0: | |
18762 | + case PCI_DEVICE_ID_INTEL_ICH8_0: | |
18763 | + case PCI_DEVICE_ID_INTEL_ICH8_1: | |
18764 | + case PCI_DEVICE_ID_INTEL_ICH8_2: | |
18765 | + case PCI_DEVICE_ID_INTEL_ICH8_3: | |
18766 | + case PCI_DEVICE_ID_INTEL_ICH8_4: | |
18767 | + case PCI_DEVICE_ID_INTEL_ICH9_0: | |
18768 | + case PCI_DEVICE_ID_INTEL_ICH9_1: | |
18769 | + case PCI_DEVICE_ID_INTEL_ICH9_2: | |
18770 | + case PCI_DEVICE_ID_INTEL_ICH9_3: | |
18771 | + case PCI_DEVICE_ID_INTEL_ICH9_4: | |
18772 | + case PCI_DEVICE_ID_INTEL_ICH9_5: | |
18773 | + r->name = "PIIX/ICH"; | |
18774 | + r->get = pirq_piix_get; | |
18775 | + r->set = pirq_piix_set; | |
18776 | + return 1; | |
18777 | + } | |
18778 | + return 0; | |
18779 | +} | |
18780 | + | |
18781 | +static __init int via_router_probe(struct irq_router *r, | |
18782 | + struct pci_dev *router, u16 device) | |
18783 | +{ | |
18784 | + /* FIXME: We should move some of the quirk fixup stuff here */ | |
18785 | + | |
18786 | + /* | |
18787 | + * work arounds for some buggy BIOSes | |
18788 | + */ | |
18789 | + if (device == PCI_DEVICE_ID_VIA_82C586_0) { | |
18790 | + switch(router->device) { | |
18791 | + case PCI_DEVICE_ID_VIA_82C686: | |
18792 | + /* | |
18793 | + * Asus k7m bios wrongly reports 82C686A | |
18794 | + * as 586-compatible | |
18795 | + */ | |
18796 | + device = PCI_DEVICE_ID_VIA_82C686; | |
18797 | + break; | |
18798 | + case PCI_DEVICE_ID_VIA_8235: | |
18799 | + /** | |
18800 | + * Asus a7v-x bios wrongly reports 8235 | |
18801 | + * as 586-compatible | |
18802 | + */ | |
18803 | + device = PCI_DEVICE_ID_VIA_8235; | |
18804 | + break; | |
18805 | + } | |
18806 | + } | |
18807 | + | |
18808 | + switch(device) { | |
18809 | + case PCI_DEVICE_ID_VIA_82C586_0: | |
18810 | + r->name = "VIA"; | |
18811 | + r->get = pirq_via586_get; | |
18812 | + r->set = pirq_via586_set; | |
18813 | + return 1; | |
18814 | + case PCI_DEVICE_ID_VIA_82C596: | |
18815 | + case PCI_DEVICE_ID_VIA_82C686: | |
18816 | + case PCI_DEVICE_ID_VIA_8231: | |
18817 | + case PCI_DEVICE_ID_VIA_8233A: | |
18818 | + case PCI_DEVICE_ID_VIA_8235: | |
18819 | + case PCI_DEVICE_ID_VIA_8237: | |
18820 | + /* FIXME: add new ones for 8233/5 */ | |
18821 | + r->name = "VIA"; | |
18822 | + r->get = pirq_via_get; | |
18823 | + r->set = pirq_via_set; | |
18824 | + return 1; | |
18825 | + } | |
18826 | + return 0; | |
18827 | +} | |
18828 | + | |
18829 | +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18830 | +{ | |
18831 | + switch(device) | |
18832 | + { | |
18833 | + case PCI_DEVICE_ID_VLSI_82C534: | |
18834 | + r->name = "VLSI 82C534"; | |
18835 | + r->get = pirq_vlsi_get; | |
18836 | + r->set = pirq_vlsi_set; | |
18837 | + return 1; | |
18838 | + } | |
18839 | + return 0; | |
18840 | +} | |
18841 | + | |
18842 | + | |
18843 | +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18844 | +{ | |
18845 | + switch(device) | |
18846 | + { | |
18847 | + case PCI_DEVICE_ID_SERVERWORKS_OSB4: | |
18848 | + case PCI_DEVICE_ID_SERVERWORKS_CSB5: | |
18849 | + r->name = "ServerWorks"; | |
18850 | + r->get = pirq_serverworks_get; | |
18851 | + r->set = pirq_serverworks_set; | |
18852 | + return 1; | |
18853 | + } | |
18854 | + return 0; | |
18855 | +} | |
18856 | + | |
18857 | +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18858 | +{ | |
18859 | + if (device != PCI_DEVICE_ID_SI_503) | |
18860 | + return 0; | |
18861 | + | |
18862 | + r->name = "SIS"; | |
18863 | + r->get = pirq_sis_get; | |
18864 | + r->set = pirq_sis_set; | |
18865 | + return 1; | |
18866 | +} | |
18867 | + | |
18868 | +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18869 | +{ | |
18870 | + switch(device) | |
18871 | + { | |
18872 | + case PCI_DEVICE_ID_CYRIX_5520: | |
18873 | + r->name = "NatSemi"; | |
18874 | + r->get = pirq_cyrix_get; | |
18875 | + r->set = pirq_cyrix_set; | |
18876 | + return 1; | |
18877 | + } | |
18878 | + return 0; | |
18879 | +} | |
18880 | + | |
18881 | +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18882 | +{ | |
18883 | + switch(device) | |
18884 | + { | |
18885 | + case PCI_DEVICE_ID_OPTI_82C700: | |
18886 | + r->name = "OPTI"; | |
18887 | + r->get = pirq_opti_get; | |
18888 | + r->set = pirq_opti_set; | |
18889 | + return 1; | |
18890 | + } | |
18891 | + return 0; | |
18892 | +} | |
18893 | + | |
18894 | +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18895 | +{ | |
18896 | + switch(device) | |
18897 | + { | |
18898 | + case PCI_DEVICE_ID_ITE_IT8330G_0: | |
18899 | + r->name = "ITE"; | |
18900 | + r->get = pirq_ite_get; | |
18901 | + r->set = pirq_ite_set; | |
18902 | + return 1; | |
18903 | + } | |
18904 | + return 0; | |
18905 | +} | |
18906 | + | |
18907 | +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18908 | +{ | |
18909 | + switch(device) | |
18910 | + { | |
18911 | + case PCI_DEVICE_ID_AL_M1533: | |
18912 | + case PCI_DEVICE_ID_AL_M1563: | |
18913 | + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); | |
18914 | + r->name = "ALI"; | |
18915 | + r->get = pirq_ali_get; | |
18916 | + r->set = pirq_ali_set; | |
18917 | + return 1; | |
18918 | + } | |
18919 | + return 0; | |
18920 | +} | |
18921 | + | |
18922 | +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) | |
18923 | +{ | |
18924 | + switch(device) | |
18925 | + { | |
18926 | + case PCI_DEVICE_ID_AMD_VIPER_740B: | |
18927 | + r->name = "AMD756"; | |
18928 | + break; | |
18929 | + case PCI_DEVICE_ID_AMD_VIPER_7413: | |
18930 | + r->name = "AMD766"; | |
18931 | + break; | |
18932 | + case PCI_DEVICE_ID_AMD_VIPER_7443: | |
18933 | + r->name = "AMD768"; | |
18934 | + break; | |
18935 | + default: | |
18936 | + return 0; | |
18937 | + } | |
18938 | + r->get = pirq_amd756_get; | |
18939 | + r->set = pirq_amd756_set; | |
18940 | + return 1; | |
18941 | +} | |
18942 | + | |
18943 | +static __initdata struct irq_router_handler pirq_routers[] = { | |
18944 | + { PCI_VENDOR_ID_INTEL, intel_router_probe }, | |
18945 | + { PCI_VENDOR_ID_AL, ali_router_probe }, | |
18946 | + { PCI_VENDOR_ID_ITE, ite_router_probe }, | |
18947 | + { PCI_VENDOR_ID_VIA, via_router_probe }, | |
18948 | + { PCI_VENDOR_ID_OPTI, opti_router_probe }, | |
18949 | + { PCI_VENDOR_ID_SI, sis_router_probe }, | |
18950 | + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe }, | |
18951 | + { PCI_VENDOR_ID_VLSI, vlsi_router_probe }, | |
18952 | + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe }, | |
18953 | + { PCI_VENDOR_ID_AMD, amd_router_probe }, | |
18954 | + /* Someone with docs needs to add the ATI Radeon IGP */ | |
18955 | + { 0, NULL } | |
18956 | +}; | |
18957 | +static struct irq_router pirq_router; | |
18958 | +static struct pci_dev *pirq_router_dev; | |
18959 | + | |
18960 | + | |
18961 | +/* | |
18962 | + * FIXME: should we have an option to say "generic for | |
18963 | + * chipset" ? | |
18964 | + */ | |
18965 | + | |
18966 | +static void __init pirq_find_router(struct irq_router *r) | |
18967 | +{ | |
18968 | + struct irq_routing_table *rt = pirq_table; | |
18969 | + struct irq_router_handler *h; | |
18970 | + | |
18971 | +#ifdef CONFIG_PCI_BIOS | |
18972 | + if (!rt->signature) { | |
18973 | + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n"); | |
18974 | + r->set = pirq_bios_set; | |
18975 | + r->name = "BIOS"; | |
18976 | + return; | |
18977 | + } | |
18978 | +#endif | |
18979 | + | |
18980 | + /* Default unless a driver reloads it */ | |
18981 | + r->name = "default"; | |
18982 | + r->get = NULL; | |
18983 | + r->set = NULL; | |
18984 | + | |
18985 | + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", | |
18986 | + rt->rtr_vendor, rt->rtr_device); | |
18987 | + | |
18988 | + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); | |
18989 | + if (!pirq_router_dev) { | |
18990 | + DBG(KERN_DEBUG "PCI: Interrupt router not found at " | |
18991 | + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); | |
18992 | + return; | |
18993 | + } | |
18994 | + | |
18995 | + for( h = pirq_routers; h->vendor; h++) { | |
18996 | + /* First look for a router match */ | |
18997 | + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) | |
18998 | + break; | |
18999 | + /* Fall back to a device match */ | |
19000 | + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) | |
19001 | + break; | |
19002 | + } | |
19003 | + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", | |
19004 | + pirq_router.name, | |
19005 | + pirq_router_dev->vendor, | |
19006 | + pirq_router_dev->device, | |
19007 | + pci_name(pirq_router_dev)); | |
19008 | +} | |
19009 | + | |
19010 | +static struct irq_info *pirq_get_info(struct pci_dev *dev) | |
19011 | +{ | |
19012 | + struct irq_routing_table *rt = pirq_table; | |
19013 | + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); | |
19014 | + struct irq_info *info; | |
19015 | + | |
19016 | + for (info = rt->slots; entries--; info++) | |
19017 | + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) | |
19018 | + return info; | |
19019 | + return NULL; | |
19020 | +} | |
19021 | + | |
19022 | +static int pcibios_lookup_irq(struct pci_dev *dev, int assign) | |
19023 | +{ | |
19024 | + u8 pin; | |
19025 | + struct irq_info *info; | |
19026 | + int i, pirq, newirq; | |
19027 | + int irq = 0; | |
19028 | + u32 mask; | |
19029 | + struct irq_router *r = &pirq_router; | |
19030 | + struct pci_dev *dev2 = NULL; | |
19031 | + char *msg = NULL; | |
19032 | + | |
19033 | + /* Find IRQ pin */ | |
19034 | + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); | |
19035 | + if (!pin) { | |
19036 | + DBG(KERN_DEBUG " -> no interrupt pin\n"); | |
19037 | + return 0; | |
19038 | + } | |
19039 | + pin = pin - 1; | |
19040 | + | |
19041 | + /* Find IRQ routing entry */ | |
19042 | + | |
19043 | + if (!pirq_table) | |
19044 | + return 0; | |
19045 | + | |
19046 | + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); | |
19047 | + info = pirq_get_info(dev); | |
19048 | + if (!info) { | |
19049 | + DBG(" -> not found in routing table\n" KERN_DEBUG); | |
19050 | + return 0; | |
19051 | + } | |
19052 | + pirq = info->irq[pin].link; | |
19053 | + mask = info->irq[pin].bitmap; | |
19054 | + if (!pirq) { | |
19055 | + DBG(" -> not routed\n" KERN_DEBUG); | |
19056 | + return 0; | |
19057 | + } | |
19058 | + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); | |
19059 | + mask &= pcibios_irq_mask; | |
19060 | + | |
19061 | + /* Work around broken HP Pavilion Notebooks which assign USB to | |
19062 | + IRQ 9 even though it is actually wired to IRQ 11 */ | |
19063 | + | |
19064 | + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) { | |
19065 | + dev->irq = 11; | |
19066 | + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11); | |
19067 | + r->set(pirq_router_dev, dev, pirq, 11); | |
19068 | + } | |
19069 | + | |
19070 | + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ | |
19071 | + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { | |
19072 | + pirq = 0x68; | |
19073 | + mask = 0x400; | |
19074 | + dev->irq = r->get(pirq_router_dev, dev, pirq); | |
19075 | + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); | |
19076 | + } | |
19077 | + | |
19078 | + /* | |
19079 | + * Find the best IRQ to assign: use the one | |
19080 | + * reported by the device if possible. | |
19081 | + */ | |
19082 | + newirq = dev->irq; | |
19083 | + if (newirq && !((1 << newirq) & mask)) { | |
19084 | + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; | |
19085 | + else printk("\n" KERN_WARNING | |
19086 | + "PCI: IRQ %i for device %s doesn't match PIRQ mask " | |
19087 | + "- try pci=usepirqmask\n" KERN_DEBUG, newirq, | |
19088 | + pci_name(dev)); | |
19089 | + } | |
19090 | + if (!newirq && assign) { | |
19091 | + for (i = 0; i < 16; i++) { | |
19092 | + if (!(mask & (1 << i))) | |
19093 | + continue; | |
19094 | + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) | |
19095 | + newirq = i; | |
19096 | + } | |
19097 | + } | |
19098 | + DBG(" -> newirq=%d", newirq); | |
19099 | + | |
19100 | + /* Check if it is hardcoded */ | |
19101 | + if ((pirq & 0xf0) == 0xf0) { | |
19102 | + irq = pirq & 0xf; | |
19103 | + DBG(" -> hardcoded IRQ %d\n", irq); | |
19104 | + msg = "Hardcoded"; | |
19105 | + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ | |
19106 | + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { | |
19107 | + DBG(" -> got IRQ %d\n", irq); | |
19108 | + msg = "Found"; | |
19109 | + eisa_set_level_irq(irq); | |
19110 | + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { | |
19111 | + DBG(" -> assigning IRQ %d", newirq); | |
19112 | + if (r->set(pirq_router_dev, dev, pirq, newirq)) { | |
19113 | + eisa_set_level_irq(newirq); | |
19114 | + DBG(" ... OK\n"); | |
19115 | + msg = "Assigned"; | |
19116 | + irq = newirq; | |
19117 | + } | |
19118 | + } | |
19119 | + | |
19120 | + if (!irq) { | |
19121 | + DBG(" ... failed\n"); | |
19122 | + if (newirq && mask == (1 << newirq)) { | |
19123 | + msg = "Guessed"; | |
19124 | + irq = newirq; | |
19125 | + } else | |
19126 | + return 0; | |
19127 | + } | |
19128 | + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); | |
19129 | + | |
19130 | + /* Update IRQ for all devices with the same pirq value */ | |
19131 | + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { | |
19132 | + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); | |
19133 | + if (!pin) | |
19134 | + continue; | |
19135 | + pin--; | |
19136 | + info = pirq_get_info(dev2); | |
19137 | + if (!info) | |
19138 | + continue; | |
19139 | + if (info->irq[pin].link == pirq) { | |
19140 | + /* We refuse to override the dev->irq information. Give a warning! */ | |
19141 | + if ( dev2->irq && dev2->irq != irq && \ | |
19142 | + (!(pci_probe & PCI_USE_PIRQ_MASK) || \ | |
19143 | + ((1 << dev2->irq) & mask)) ) { | |
19144 | +#ifndef CONFIG_PCI_MSI | |
19145 | + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", | |
19146 | + pci_name(dev2), dev2->irq, irq); | |
19147 | +#endif | |
19148 | + continue; | |
19149 | + } | |
19150 | + dev2->irq = irq; | |
19151 | + pirq_penalty[irq]++; | |
19152 | + if (dev != dev2) | |
19153 | + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); | |
19154 | + } | |
19155 | + } | |
19156 | + return 1; | |
19157 | +} | |
19158 | + | |
19159 | +static void __init pcibios_fixup_irqs(void) | |
19160 | +{ | |
19161 | + struct pci_dev *dev = NULL; | |
19162 | + u8 pin; | |
19163 | + | |
19164 | + DBG(KERN_DEBUG "PCI: IRQ fixup\n"); | |
19165 | + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | |
19166 | + /* | |
19167 | + * If the BIOS has set an out of range IRQ number, just ignore it. | |
19168 | + * Also keep track of which IRQ's are already in use. | |
19169 | + */ | |
19170 | + if (dev->irq >= 16) { | |
19171 | + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); | |
19172 | + dev->irq = 0; | |
19173 | + } | |
19174 | + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ | |
19175 | + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) | |
19176 | + pirq_penalty[dev->irq] = 0; | |
19177 | + pirq_penalty[dev->irq]++; | |
19178 | + } | |
19179 | + | |
19180 | + dev = NULL; | |
19181 | + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | |
19182 | + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); | |
19183 | +#ifdef CONFIG_X86_IO_APIC | |
19184 | + /* | |
19185 | + * Recalculate IRQ numbers if we use the I/O APIC. | |
19186 | + */ | |
19187 | + if (io_apic_assign_pci_irqs) | |
19188 | + { | |
19189 | + int irq; | |
19190 | + | |
19191 | + if (pin) { | |
19192 | + pin--; /* interrupt pins are numbered starting from 1 */ | |
19193 | + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); | |
19194 | + /* | |
19195 | + * Busses behind bridges are typically not listed in the MP-table. | |
19196 | + * In this case we have to look up the IRQ based on the parent bus, | |
19197 | + * parent slot, and pin number. The SMP code detects such bridged | |
19198 | + * busses itself so we should get into this branch reliably. | |
19199 | + */ | |
19200 | + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ | |
19201 | + struct pci_dev * bridge = dev->bus->self; | |
19202 | + | |
19203 | + pin = (pin + PCI_SLOT(dev->devfn)) % 4; | |
19204 | + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, | |
19205 | + PCI_SLOT(bridge->devfn), pin); | |
19206 | + if (irq >= 0) | |
19207 | + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", | |
19208 | + pci_name(bridge), 'A' + pin, irq); | |
19209 | + } | |
19210 | + if (irq >= 0) { | |
19211 | + if (use_pci_vector() && | |
19212 | + !platform_legacy_irq(irq)) | |
19213 | + irq = IO_APIC_VECTOR(irq); | |
19214 | + | |
19215 | + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", | |
19216 | + pci_name(dev), 'A' + pin, irq); | |
19217 | + dev->irq = irq; | |
19218 | + } | |
19219 | + } | |
19220 | + } | |
19221 | +#endif | |
19222 | + /* | |
19223 | + * Still no IRQ? Try to lookup one... | |
19224 | + */ | |
19225 | + if (pin && !dev->irq) | |
19226 | + pcibios_lookup_irq(dev, 0); | |
19227 | + } | |
19228 | +} | |
19229 | + | |
19230 | +/* | |
19231 | + * Work around broken HP Pavilion Notebooks which assign USB to | |
19232 | + * IRQ 9 even though it is actually wired to IRQ 11 | |
19233 | + */ | |
19234 | +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) | |
19235 | +{ | |
19236 | + if (!broken_hp_bios_irq9) { | |
19237 | + broken_hp_bios_irq9 = 1; | |
19238 | + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); | |
19239 | + } | |
19240 | + return 0; | |
19241 | +} | |
19242 | + | |
19243 | +/* | |
19244 | + * Work around broken Acer TravelMate 360 Notebooks which assign | |
19245 | + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 | |
19246 | + */ | |
19247 | +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) | |
19248 | +{ | |
19249 | + if (!acer_tm360_irqrouting) { | |
19250 | + acer_tm360_irqrouting = 1; | |
19251 | + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); | |
19252 | + } | |
19253 | + return 0; | |
19254 | +} | |
19255 | + | |
19256 | +static struct dmi_system_id __initdata pciirq_dmi_table[] = { | |
19257 | + { | |
19258 | + .callback = fix_broken_hp_bios_irq9, | |
19259 | + .ident = "HP Pavilion N5400 Series Laptop", | |
19260 | + .matches = { | |
19261 | + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), | |
19262 | + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), | |
19263 | + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), | |
19264 | + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), | |
19265 | + }, | |
19266 | + }, | |
19267 | + { | |
19268 | + .callback = fix_acer_tm360_irqrouting, | |
19269 | + .ident = "Acer TravelMate 36x Laptop", | |
19270 | + .matches = { | |
19271 | + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), | |
19272 | + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), | |
19273 | + }, | |
19274 | + }, | |
19275 | + { } | |
19276 | +}; | |
19277 | + | |
19278 | +static int __init pcibios_irq_init(void) | |
19279 | +{ | |
19280 | + DBG(KERN_DEBUG "PCI: IRQ init\n"); | |
19281 | + | |
19282 | + if (pcibios_enable_irq || raw_pci_ops == NULL) | |
19283 | + return 0; | |
19284 | + | |
19285 | + dmi_check_system(pciirq_dmi_table); | |
19286 | + | |
19287 | + pirq_table = pirq_find_routing_table(); | |
19288 | + | |
19289 | +#ifdef CONFIG_PCI_BIOS | |
19290 | + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) | |
19291 | + pirq_table = pcibios_get_irq_routing_table(); | |
19292 | +#endif | |
19293 | + if (pirq_table) { | |
19294 | + pirq_peer_trick(); | |
19295 | + pirq_find_router(&pirq_router); | |
19296 | + if (pirq_table->exclusive_irqs) { | |
19297 | + int i; | |
19298 | + for (i=0; i<16; i++) | |
19299 | + if (!(pirq_table->exclusive_irqs & (1 << i))) | |
19300 | + pirq_penalty[i] += 100; | |
19301 | + } | |
19302 | + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ | |
19303 | + if (io_apic_assign_pci_irqs) | |
19304 | + pirq_table = NULL; | |
19305 | + } | |
19306 | + | |
19307 | + pcibios_enable_irq = pirq_enable_irq; | |
19308 | + | |
19309 | + pcibios_fixup_irqs(); | |
19310 | + return 0; | |
19311 | +} | |
19312 | + | |
19313 | +subsys_initcall(pcibios_irq_init); | |
19314 | + | |
19315 | + | |
19316 | +static void pirq_penalize_isa_irq(int irq, int active) | |
19317 | +{ | |
19318 | + /* | |
19319 | + * If any ISAPnP device reports an IRQ in its list of possible | |
19320 | + * IRQ's, we try to avoid assigning it to PCI devices. | |
19321 | + */ | |
19322 | + if (irq < 16) { | |
19323 | + if (active) | |
19324 | + pirq_penalty[irq] += 1000; | |
19325 | + else | |
19326 | + pirq_penalty[irq] += 100; | |
19327 | + } | |
19328 | +} | |
19329 | + | |
19330 | +void pcibios_penalize_isa_irq(int irq, int active) | |
19331 | +{ | |
19332 | +#ifdef CONFIG_ACPI | |
19333 | + if (!acpi_noirq) | |
19334 | + acpi_penalize_isa_irq(irq, active); | |
19335 | + else | |
19336 | +#endif | |
19337 | + pirq_penalize_isa_irq(irq, active); | |
19338 | +} | |
19339 | + | |
19340 | +static int pirq_enable_irq(struct pci_dev *dev) | |
19341 | +{ | |
19342 | + u8 pin; | |
19343 | + struct pci_dev *temp_dev; | |
19344 | + | |
19345 | + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); | |
19346 | + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { | |
19347 | + char *msg = ""; | |
19348 | + | |
19349 | + pin--; /* interrupt pins are numbered starting from 1 */ | |
19350 | + | |
19351 | + if (io_apic_assign_pci_irqs) { | |
19352 | + int irq; | |
19353 | + | |
19354 | + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); | |
19355 | + /* | |
19356 | + * Busses behind bridges are typically not listed in the MP-table. | |
19357 | + * In this case we have to look up the IRQ based on the parent bus, | |
19358 | + * parent slot, and pin number. The SMP code detects such bridged | |
19359 | + * busses itself so we should get into this branch reliably. | |
19360 | + */ | |
19361 | + temp_dev = dev; | |
19362 | + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ | |
19363 | + struct pci_dev * bridge = dev->bus->self; | |
19364 | + | |
19365 | + pin = (pin + PCI_SLOT(dev->devfn)) % 4; | |
19366 | + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, | |
19367 | + PCI_SLOT(bridge->devfn), pin); | |
19368 | + if (irq >= 0) | |
19369 | + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", | |
19370 | + pci_name(bridge), 'A' + pin, irq); | |
19371 | + dev = bridge; | |
19372 | + } | |
19373 | + dev = temp_dev; | |
19374 | + if (irq >= 0) { | |
19375 | +#ifdef CONFIG_PCI_MSI | |
19376 | + if (!platform_legacy_irq(irq)) | |
19377 | + irq = IO_APIC_VECTOR(irq); | |
19378 | +#endif | |
19379 | + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", | |
19380 | + pci_name(dev), 'A' + pin, irq); | |
19381 | + dev->irq = irq; | |
19382 | + return 0; | |
19383 | + } else | |
19384 | + msg = " Probably buggy MP table."; | |
19385 | + } else if (pci_probe & PCI_BIOS_IRQ_SCAN) | |
19386 | + msg = ""; | |
19387 | + else | |
19388 | + msg = " Please try using pci=biosirq."; | |
19389 | + | |
19390 | + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ | |
19391 | + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) | |
19392 | + return 0; | |
19393 | + | |
19394 | + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", | |
19395 | + 'A' + pin, pci_name(dev), msg); | |
19396 | + } | |
19397 | + return 0; | |
19398 | +} | |
19399 | + | |
19400 | +int pci_vector_resources(int last, int nr_released) | |
19401 | +{ | |
19402 | + int count = nr_released; | |
19403 | + | |
19404 | + int next = last; | |
19405 | + int offset = (last % 8); | |
19406 | + | |
19407 | + while (next < FIRST_SYSTEM_VECTOR) { | |
19408 | + next += 8; | |
19409 | +#ifdef CONFIG_X86_64 | |
19410 | + if (next == IA32_SYSCALL_VECTOR) | |
19411 | + continue; | |
19412 | +#else | |
19413 | + if (next == SYSCALL_VECTOR) | |
19414 | + continue; | |
19415 | +#endif | |
19416 | + count++; | |
19417 | + if (next >= FIRST_SYSTEM_VECTOR) { | |
19418 | + if (offset%8) { | |
19419 | + next = FIRST_DEVICE_VECTOR + offset; | |
19420 | + offset++; | |
19421 | + continue; | |
19422 | + } | |
19423 | + count--; | |
19424 | + } | |
19425 | + } | |
19426 | + | |
19427 | + return count; | |
19428 | +} | |
19429 | Index: head-2008-11-25/arch/x86/pci/pcifront.c | |
19430 | =================================================================== | |
19431 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
19432 | +++ head-2008-11-25/arch/x86/pci/pcifront.c 2007-06-12 13:12:49.000000000 +0200 | |
19433 | @@ -0,0 +1,55 @@ | |
19434 | +/* | |
19435 | + * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core | |
19436 | + * to support the Xen PCI Frontend's operation | |
19437 | + * | |
19438 | + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> | |
19439 | + */ | |
19440 | +#include <linux/module.h> | |
19441 | +#include <linux/init.h> | |
19442 | +#include <linux/pci.h> | |
19443 | +#include <asm/acpi.h> | |
19444 | +#include "pci.h" | |
19445 | + | |
19446 | +static int pcifront_enable_irq(struct pci_dev *dev) | |
19447 | +{ | |
19448 | + u8 irq; | |
19449 | + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); | |
19450 | + dev->irq = irq; | |
19451 | + | |
19452 | + return 0; | |
19453 | +} | |
19454 | + | |
19455 | +extern u8 pci_cache_line_size; | |
19456 | + | |
19457 | +static int __init pcifront_x86_stub_init(void) | |
19458 | +{ | |
19459 | + struct cpuinfo_x86 *c = &boot_cpu_data; | |
19460 | + | |
19461 | + /* Only install our method if we haven't found real hardware already */ | |
19462 | + if (raw_pci_ops) | |
19463 | + return 0; | |
19464 | + | |
19465 | + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); | |
19466 | + | |
19467 | + /* Copied from arch/i386/pci/common.c */ | |
19468 | + pci_cache_line_size = 32 >> 2; | |
19469 | + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) | |
19470 | + pci_cache_line_size = 64 >> 2; /* K7 & K8 */ | |
19471 | + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) | |
19472 | + pci_cache_line_size = 128 >> 2; /* P4 */ | |
19473 | + | |
19474 | + /* On x86, we need to disable the normal IRQ routing table and | |
19475 | + * just ask the backend | |
19476 | + */ | |
19477 | + pcibios_enable_irq = pcifront_enable_irq; | |
19478 | + pcibios_disable_irq = NULL; | |
19479 | + | |
19480 | +#ifdef CONFIG_ACPI | |
19481 | + /* Keep ACPI out of the picture */ | |
19482 | + acpi_noirq = 1; | |
19483 | +#endif | |
19484 | + | |
19485 | + return 0; | |
19486 | +} | |
19487 | + | |
19488 | +arch_initcall(pcifront_x86_stub_init); | |
19489 | Index: head-2008-11-25/arch/x86/ia32/ia32entry-xen.S | |
19490 | =================================================================== | |
19491 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
19492 | +++ head-2008-11-25/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200 | |
19493 | @@ -0,0 +1,666 @@ | |
19494 | +/* | |
19495 | + * Compatibility mode system call entry point for x86-64. | |
19496 | + * | |
19497 | + * Copyright 2000-2002 Andi Kleen, SuSE Labs. | |
19498 | + */ | |
19499 | + | |
19500 | +#include <asm/dwarf2.h> | |
19501 | +#include <asm/calling.h> | |
19502 | +#include <asm/asm-offsets.h> | |
19503 | +#include <asm/current.h> | |
19504 | +#include <asm/errno.h> | |
19505 | +#include <asm/ia32_unistd.h> | |
19506 | +#include <asm/thread_info.h> | |
19507 | +#include <asm/segment.h> | |
19508 | +#include <asm/vsyscall32.h> | |
19509 | +#include <asm/irqflags.h> | |
19510 | +#include <linux/linkage.h> | |
19511 | + | |
19512 | +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) | |
19513 | + | |
19514 | + .macro IA32_ARG_FIXUP noebp=0 | |
19515 | + movl %edi,%r8d | |
19516 | + .if \noebp | |
19517 | + .else | |
19518 | + movl %ebp,%r9d | |
19519 | + .endif | |
19520 | + xchg %ecx,%esi | |
19521 | + movl %ebx,%edi | |
19522 | + movl %edx,%edx /* zero extension */ | |
19523 | + .endm | |
19524 | + | |
19525 | + /* clobbers %eax */ | |
19526 | + .macro CLEAR_RREGS | |
19527 | + xorl %eax,%eax | |
19528 | + movq %rax,R11(%rsp) | |
19529 | + movq %rax,R10(%rsp) | |
19530 | + movq %rax,R9(%rsp) | |
19531 | + movq %rax,R8(%rsp) | |
19532 | + .endm | |
19533 | + | |
19534 | + .macro LOAD_ARGS32 offset | |
19535 | + movl \offset(%rsp),%r11d | |
19536 | + movl \offset+8(%rsp),%r10d | |
19537 | + movl \offset+16(%rsp),%r9d | |
19538 | + movl \offset+24(%rsp),%r8d | |
19539 | + movl \offset+40(%rsp),%ecx | |
19540 | + movl \offset+48(%rsp),%edx | |
19541 | + movl \offset+56(%rsp),%esi | |
19542 | + movl \offset+64(%rsp),%edi | |
19543 | + movl \offset+72(%rsp),%eax | |
19544 | + .endm | |
19545 | + | |
19546 | + .macro CFI_STARTPROC32 simple | |
19547 | + CFI_STARTPROC \simple | |
19548 | + CFI_UNDEFINED r8 | |
19549 | + CFI_UNDEFINED r9 | |
19550 | + CFI_UNDEFINED r10 | |
19551 | + CFI_UNDEFINED r11 | |
19552 | + CFI_UNDEFINED r12 | |
19553 | + CFI_UNDEFINED r13 | |
19554 | + CFI_UNDEFINED r14 | |
19555 | + CFI_UNDEFINED r15 | |
19556 | + .endm | |
19557 | + | |
19558 | +/* | |
19559 | + * 32bit SYSENTER instruction entry. | |
19560 | + * | |
19561 | + * Arguments: | |
19562 | + * %eax System call number. | |
19563 | + * %ebx Arg1 | |
19564 | + * %ecx Arg2 | |
19565 | + * %edx Arg3 | |
19566 | + * %esi Arg4 | |
19567 | + * %edi Arg5 | |
19568 | + * %ebp user stack | |
19569 | + * 0(%ebp) Arg6 | |
19570 | + * | |
19571 | + * Interrupts on. | |
19572 | + * | |
19573 | + * This is purely a fast path. For anything complicated we use the int 0x80 | |
19574 | + * path below. Set up a complete hardware stack frame to share code | |
19575 | + * with the int 0x80 path. | |
19576 | + */ | |
19577 | +ENTRY(ia32_sysenter_target) | |
19578 | + CFI_STARTPROC32 simple | |
19579 | + CFI_DEF_CFA rsp,SS+8-RIP+16 | |
19580 | + /*CFI_REL_OFFSET ss,SS-RIP+16*/ | |
19581 | + CFI_REL_OFFSET rsp,RSP-RIP+16 | |
19582 | + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ | |
19583 | + /*CFI_REL_OFFSET cs,CS-RIP+16*/ | |
19584 | + CFI_REL_OFFSET rip,RIP-RIP+16 | |
19585 | + CFI_REL_OFFSET r11,8 | |
19586 | + CFI_REL_OFFSET rcx,0 | |
19587 | + movq 8(%rsp),%r11 | |
19588 | + CFI_RESTORE r11 | |
19589 | + popq %rcx | |
19590 | + CFI_ADJUST_CFA_OFFSET -8 | |
19591 | + CFI_RESTORE rcx | |
19592 | + movl %ebp,%ebp /* zero extension */ | |
19593 | + movl %eax,%eax | |
19594 | + movl $__USER32_DS,40(%rsp) | |
19595 | + movq %rbp,32(%rsp) | |
19596 | + movl $__USER32_CS,16(%rsp) | |
19597 | + movl $VSYSCALL32_SYSEXIT,8(%rsp) | |
19598 | + movq %rax,(%rsp) | |
19599 | + cld | |
19600 | + SAVE_ARGS 0,0,0 | |
19601 | + /* no need to do an access_ok check here because rbp has been | |
19602 | + 32bit zero extended */ | |
19603 | +1: movl (%rbp),%r9d | |
19604 | + .section __ex_table,"a" | |
19605 | + .quad 1b,ia32_badarg | |
19606 | + .previous | |
19607 | + GET_THREAD_INFO(%r10) | |
19608 | + orl $TS_COMPAT,threadinfo_status(%r10) | |
19609 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | |
19610 | + jnz sysenter_tracesys | |
19611 | +sysenter_do_call: | |
19612 | + cmpl $(IA32_NR_syscalls-1),%eax | |
19613 | + ja ia32_badsys | |
19614 | + IA32_ARG_FIXUP 1 | |
19615 | + call *ia32_sys_call_table(,%rax,8) | |
19616 | + movq %rax,RAX-ARGOFFSET(%rsp) | |
19617 | + jmp int_ret_from_sys_call | |
19618 | + | |
19619 | +sysenter_tracesys: | |
19620 | + SAVE_REST | |
19621 | + CLEAR_RREGS | |
19622 | + movq $-ENOSYS,RAX(%rsp) /* really needed? */ | |
19623 | + movq %rsp,%rdi /* &pt_regs -> arg1 */ | |
19624 | + call syscall_trace_enter | |
19625 | + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | |
19626 | + RESTORE_REST | |
19627 | + movl %ebp, %ebp | |
19628 | + /* no need to do an access_ok check here because rbp has been | |
19629 | + 32bit zero extended */ | |
19630 | +1: movl (%rbp),%r9d | |
19631 | + .section __ex_table,"a" | |
19632 | + .quad 1b,ia32_badarg | |
19633 | + .previous | |
19634 | + jmp sysenter_do_call | |
19635 | + CFI_ENDPROC | |
19636 | +ENDPROC(ia32_sysenter_target) | |
19637 | + | |
19638 | +/* | |
19639 | + * 32bit SYSCALL instruction entry. | |
19640 | + * | |
19641 | + * Arguments: | |
19642 | + * %eax System call number. | |
19643 | + * %ebx Arg1 | |
19644 | + * %ecx return EIP | |
19645 | + * %edx Arg3 | |
19646 | + * %esi Arg4 | |
19647 | + * %edi Arg5 | |
19648 | + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] | |
19649 | + * %esp user stack | |
19650 | + * 0(%esp) Arg6 | |
19651 | + * | |
19652 | + * Interrupts on. | |
19653 | + * | |
19654 | + * This is purely a fast path. For anything complicated we use the int 0x80 | |
19655 | + * path below. Set up a complete hardware stack frame to share code | |
19656 | + * with the int 0x80 path. | |
19657 | + */ | |
19658 | +ENTRY(ia32_cstar_target) | |
19659 | + CFI_STARTPROC32 simple | |
19660 | + CFI_DEF_CFA rsp,SS+8-RIP+16 | |
19661 | + /*CFI_REL_OFFSET ss,SS-RIP+16*/ | |
19662 | + CFI_REL_OFFSET rsp,RSP-RIP+16 | |
19663 | + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ | |
19664 | + /*CFI_REL_OFFSET cs,CS-RIP+16*/ | |
19665 | + CFI_REL_OFFSET rip,RIP-RIP+16 | |
19666 | + movl %eax,%eax /* zero extension */ | |
19667 | + movl RSP-RIP+16(%rsp),%r8d | |
19668 | + SAVE_ARGS -8,1,1 | |
19669 | + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | |
19670 | + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ | |
19671 | + movl %ebp,%ecx | |
19672 | + movl $__USER32_CS,CS-ARGOFFSET(%rsp) | |
19673 | + movl $__USER32_DS,SS-ARGOFFSET(%rsp) | |
19674 | + /* no need to do an access_ok check here because r8 has been | |
19675 | + 32bit zero extended */ | |
19676 | + /* hardware stack frame is complete now */ | |
19677 | +1: movl (%r8),%r9d | |
19678 | + .section __ex_table,"a" | |
19679 | + .quad 1b,ia32_badarg | |
19680 | + .previous | |
19681 | + GET_THREAD_INFO(%r10) | |
19682 | + orl $TS_COMPAT,threadinfo_status(%r10) | |
19683 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | |
19684 | + jnz cstar_tracesys | |
19685 | +cstar_do_call: | |
19686 | + cmpl $IA32_NR_syscalls-1,%eax | |
19687 | + ja ia32_badsys | |
19688 | + IA32_ARG_FIXUP 1 | |
19689 | + call *ia32_sys_call_table(,%rax,8) | |
19690 | + movq %rax,RAX-ARGOFFSET(%rsp) | |
19691 | + jmp int_ret_from_sys_call | |
19692 | + | |
19693 | +cstar_tracesys: | |
19694 | + SAVE_REST | |
19695 | + CLEAR_RREGS | |
19696 | + movq $-ENOSYS,RAX(%rsp) /* really needed? */ | |
19697 | + movq %rsp,%rdi /* &pt_regs -> arg1 */ | |
19698 | + call syscall_trace_enter | |
19699 | + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | |
19700 | + RESTORE_REST | |
19701 | + movl RSP-ARGOFFSET(%rsp), %r8d | |
19702 | + /* no need to do an access_ok check here because r8 has been | |
19703 | + 32bit zero extended */ | |
19704 | +1: movl (%r8),%r9d | |
19705 | + .section __ex_table,"a" | |
19706 | + .quad 1b,ia32_badarg | |
19707 | + .previous | |
19708 | + jmp cstar_do_call | |
19709 | +END(ia32_cstar_target) | |
19710 | + | |
19711 | +ia32_badarg: | |
19712 | + movq $-EFAULT,%rax | |
19713 | + jmp ia32_sysret | |
19714 | + CFI_ENDPROC | |
19715 | + | |
19716 | +/* | |
19717 | + * Emulated IA32 system calls via int 0x80. | |
19718 | + * | |
19719 | + * Arguments: | |
19720 | + * %eax System call number. | |
19721 | + * %ebx Arg1 | |
19722 | + * %ecx Arg2 | |
19723 | + * %edx Arg3 | |
19724 | + * %esi Arg4 | |
19725 | + * %edi Arg5 | |
19726 | + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] | |
19727 | + * | |
19728 | + * Notes: | |
19729 | + * Uses the same stack frame as the x86-64 version. | |
19730 | + * All registers except %eax must be saved (but ptrace may violate that) | |
19731 | + * Arguments are zero extended. For system calls that want sign extension and | |
19732 | + * take long arguments a wrapper is needed. Most calls can just be called | |
19733 | + * directly. | |
19734 | + * Assumes it is only called from user space and entered with interrupts on. | |
19735 | + */ | |
19736 | + | |
19737 | +ENTRY(ia32_syscall) | |
19738 | + CFI_STARTPROC simple | |
19739 | + CFI_DEF_CFA rsp,SS+8-RIP+16 | |
19740 | + /*CFI_REL_OFFSET ss,SS-RIP+16*/ | |
19741 | + CFI_REL_OFFSET rsp,RSP-RIP+16 | |
19742 | + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ | |
19743 | + /*CFI_REL_OFFSET cs,CS-RIP+16*/ | |
19744 | + CFI_REL_OFFSET rip,RIP-RIP+16 | |
19745 | + CFI_REL_OFFSET r11,8 | |
19746 | + CFI_REL_OFFSET rcx,0 | |
19747 | + movq 8(%rsp),%r11 | |
19748 | + CFI_RESTORE r11 | |
19749 | + popq %rcx | |
19750 | + CFI_ADJUST_CFA_OFFSET -8 | |
19751 | + CFI_RESTORE rcx | |
19752 | + movl %eax,%eax | |
19753 | + movq %rax,(%rsp) | |
19754 | + cld | |
19755 | + /* note the registers are not zero extended to the sf. | |
19756 | + this could be a problem. */ | |
19757 | + SAVE_ARGS 0,0,1 | |
19758 | + GET_THREAD_INFO(%r10) | |
19759 | + orl $TS_COMPAT,threadinfo_status(%r10) | |
19760 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | |
19761 | + jnz ia32_tracesys | |
19762 | +ia32_do_syscall: | |
19763 | + cmpl $(IA32_NR_syscalls-1),%eax | |
19764 | + ja ia32_badsys | |
19765 | + IA32_ARG_FIXUP | |
19766 | + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative | |
19767 | +ia32_sysret: | |
19768 | + movq %rax,RAX-ARGOFFSET(%rsp) | |
19769 | + jmp int_ret_from_sys_call | |
19770 | + | |
19771 | +ia32_tracesys: | |
19772 | + SAVE_REST | |
19773 | + movq $-ENOSYS,RAX(%rsp) /* really needed? */ | |
19774 | + movq %rsp,%rdi /* &pt_regs -> arg1 */ | |
19775 | + call syscall_trace_enter | |
19776 | + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | |
19777 | + RESTORE_REST | |
19778 | + jmp ia32_do_syscall | |
19779 | +END(ia32_syscall) | |
19780 | + | |
19781 | +ia32_badsys: | |
19782 | + movq $0,ORIG_RAX-ARGOFFSET(%rsp) | |
19783 | + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | |
19784 | + jmp int_ret_from_sys_call | |
19785 | + | |
19786 | +quiet_ni_syscall: | |
19787 | + movq $-ENOSYS,%rax | |
19788 | + ret | |
19789 | + CFI_ENDPROC | |
19790 | + | |
19791 | + .macro PTREGSCALL label, func, arg | |
19792 | + .globl \label | |
19793 | +\label: | |
19794 | + leaq \func(%rip),%rax | |
19795 | + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | |
19796 | + jmp ia32_ptregs_common | |
19797 | + .endm | |
19798 | + | |
19799 | + CFI_STARTPROC32 | |
19800 | + | |
19801 | + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi | |
19802 | + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi | |
19803 | + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx | |
19804 | + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx | |
19805 | + PTREGSCALL stub32_execve, sys32_execve, %rcx | |
19806 | + PTREGSCALL stub32_fork, sys_fork, %rdi | |
19807 | + PTREGSCALL stub32_clone, sys32_clone, %rdx | |
19808 | + PTREGSCALL stub32_vfork, sys_vfork, %rdi | |
19809 | + PTREGSCALL stub32_iopl, sys_iopl, %rsi | |
19810 | + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx | |
19811 | + | |
19812 | +ENTRY(ia32_ptregs_common) | |
19813 | + popq %r11 | |
19814 | + CFI_ENDPROC | |
19815 | + CFI_STARTPROC32 simple | |
19816 | + CFI_DEF_CFA rsp,SS+8-ARGOFFSET | |
19817 | + CFI_REL_OFFSET rax,RAX-ARGOFFSET | |
19818 | + CFI_REL_OFFSET rcx,RCX-ARGOFFSET | |
19819 | + CFI_REL_OFFSET rdx,RDX-ARGOFFSET | |
19820 | + CFI_REL_OFFSET rsi,RSI-ARGOFFSET | |
19821 | + CFI_REL_OFFSET rdi,RDI-ARGOFFSET | |
19822 | + CFI_REL_OFFSET rip,RIP-ARGOFFSET | |
19823 | +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ | |
19824 | +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | |
19825 | + CFI_REL_OFFSET rsp,RSP-ARGOFFSET | |
19826 | +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ | |
19827 | + SAVE_REST | |
19828 | + call *%rax | |
19829 | + RESTORE_REST | |
19830 | + jmp ia32_sysret /* misbalances the return cache */ | |
19831 | + CFI_ENDPROC | |
19832 | +END(ia32_ptregs_common) | |
19833 | + | |
19834 | + .section .rodata,"a" | |
19835 | + .align 8 | |
19836 | +ia32_sys_call_table: | |
19837 | + .quad sys_restart_syscall | |
19838 | + .quad sys_exit | |
19839 | + .quad stub32_fork | |
19840 | + .quad sys_read | |
19841 | + .quad sys_write | |
19842 | + .quad compat_sys_open /* 5 */ | |
19843 | + .quad sys_close | |
19844 | + .quad sys32_waitpid | |
19845 | + .quad sys_creat | |
19846 | + .quad sys_link | |
19847 | + .quad sys_unlink /* 10 */ | |
19848 | + .quad stub32_execve | |
19849 | + .quad sys_chdir | |
19850 | + .quad compat_sys_time | |
19851 | + .quad sys_mknod | |
19852 | + .quad sys_chmod /* 15 */ | |
19853 | + .quad sys_lchown16 | |
19854 | + .quad quiet_ni_syscall /* old break syscall holder */ | |
19855 | + .quad sys_stat | |
19856 | + .quad sys32_lseek | |
19857 | + .quad sys_getpid /* 20 */ | |
19858 | + .quad compat_sys_mount /* mount */ | |
19859 | + .quad sys_oldumount /* old_umount */ | |
19860 | + .quad sys_setuid16 | |
19861 | + .quad sys_getuid16 | |
19862 | + .quad compat_sys_stime /* stime */ /* 25 */ | |
19863 | + .quad sys32_ptrace /* ptrace */ | |
19864 | + .quad sys_alarm | |
19865 | + .quad sys_fstat /* (old)fstat */ | |
19866 | + .quad sys_pause | |
19867 | + .quad compat_sys_utime /* 30 */ | |
19868 | + .quad quiet_ni_syscall /* old stty syscall holder */ | |
19869 | + .quad quiet_ni_syscall /* old gtty syscall holder */ | |
19870 | + .quad sys_access | |
19871 | + .quad sys_nice | |
19872 | + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ | |
19873 | + .quad sys_sync | |
19874 | + .quad sys32_kill | |
19875 | + .quad sys_rename | |
19876 | + .quad sys_mkdir | |
19877 | + .quad sys_rmdir /* 40 */ | |
19878 | + .quad sys_dup | |
19879 | + .quad sys32_pipe | |
19880 | + .quad compat_sys_times | |
19881 | + .quad quiet_ni_syscall /* old prof syscall holder */ | |
19882 | + .quad sys_brk /* 45 */ | |
19883 | + .quad sys_setgid16 | |
19884 | + .quad sys_getgid16 | |
19885 | + .quad sys_signal | |
19886 | + .quad sys_geteuid16 | |
19887 | + .quad sys_getegid16 /* 50 */ | |
19888 | + .quad sys_acct | |
19889 | + .quad sys_umount /* new_umount */ | |
19890 | + .quad quiet_ni_syscall /* old lock syscall holder */ | |
19891 | + .quad compat_sys_ioctl | |
19892 | + .quad compat_sys_fcntl64 /* 55 */ | |
19893 | + .quad quiet_ni_syscall /* old mpx syscall holder */ | |
19894 | + .quad sys_setpgid | |
19895 | + .quad quiet_ni_syscall /* old ulimit syscall holder */ | |
19896 | + .quad sys32_olduname | |
19897 | + .quad sys_umask /* 60 */ | |
19898 | + .quad sys_chroot | |
19899 | + .quad sys32_ustat | |
19900 | + .quad sys_dup2 | |
19901 | + .quad sys_getppid | |
19902 | + .quad sys_getpgrp /* 65 */ | |
19903 | + .quad sys_setsid | |
19904 | + .quad sys32_sigaction | |
19905 | + .quad sys_sgetmask | |
19906 | + .quad sys_ssetmask | |
19907 | + .quad sys_setreuid16 /* 70 */ | |
19908 | + .quad sys_setregid16 | |
19909 | + .quad stub32_sigsuspend | |
19910 | + .quad compat_sys_sigpending | |
19911 | + .quad sys_sethostname | |
19912 | + .quad compat_sys_setrlimit /* 75 */ | |
19913 | + .quad compat_sys_old_getrlimit /* old_getrlimit */ | |
19914 | + .quad compat_sys_getrusage | |
19915 | + .quad sys32_gettimeofday | |
19916 | + .quad sys32_settimeofday | |
19917 | + .quad sys_getgroups16 /* 80 */ | |
19918 | + .quad sys_setgroups16 | |
19919 | + .quad sys32_old_select | |
19920 | + .quad sys_symlink | |
19921 | + .quad sys_lstat | |
19922 | + .quad sys_readlink /* 85 */ | |
19923 | +#ifdef CONFIG_IA32_AOUT | |
19924 | + .quad sys_uselib | |
19925 | +#else | |
19926 | + .quad quiet_ni_syscall | |
19927 | +#endif | |
19928 | + .quad sys_swapon | |
19929 | + .quad sys_reboot | |
19930 | + .quad compat_sys_old_readdir | |
19931 | + .quad sys32_mmap /* 90 */ | |
19932 | + .quad sys_munmap | |
19933 | + .quad sys_truncate | |
19934 | + .quad sys_ftruncate | |
19935 | + .quad sys_fchmod | |
19936 | + .quad sys_fchown16 /* 95 */ | |
19937 | + .quad sys_getpriority | |
19938 | + .quad sys_setpriority | |
19939 | + .quad quiet_ni_syscall /* old profil syscall holder */ | |
19940 | + .quad compat_sys_statfs | |
19941 | + .quad compat_sys_fstatfs /* 100 */ | |
19942 | + .quad sys_ioperm | |
19943 | + .quad compat_sys_socketcall | |
19944 | + .quad sys_syslog | |
19945 | + .quad compat_sys_setitimer | |
19946 | + .quad compat_sys_getitimer /* 105 */ | |
19947 | + .quad compat_sys_newstat | |
19948 | + .quad compat_sys_newlstat | |
19949 | + .quad compat_sys_newfstat | |
19950 | + .quad sys32_uname | |
19951 | + .quad stub32_iopl /* 110 */ | |
19952 | + .quad sys_vhangup | |
19953 | + .quad quiet_ni_syscall /* old "idle" system call */ | |
19954 | + .quad sys32_vm86_warning /* vm86old */ | |
19955 | + .quad compat_sys_wait4 | |
19956 | + .quad sys_swapoff /* 115 */ | |
19957 | + .quad sys32_sysinfo | |
19958 | + .quad sys32_ipc | |
19959 | + .quad sys_fsync | |
19960 | + .quad stub32_sigreturn | |
19961 | + .quad stub32_clone /* 120 */ | |
19962 | + .quad sys_setdomainname | |
19963 | + .quad sys_uname | |
19964 | + .quad sys_modify_ldt | |
19965 | + .quad compat_sys_adjtimex | |
19966 | + .quad sys32_mprotect /* 125 */ | |
19967 | + .quad compat_sys_sigprocmask | |
19968 | + .quad quiet_ni_syscall /* create_module */ | |
19969 | + .quad sys_init_module | |
19970 | + .quad sys_delete_module | |
19971 | + .quad quiet_ni_syscall /* 130 get_kernel_syms */ | |
19972 | + .quad sys_quotactl | |
19973 | + .quad sys_getpgid | |
19974 | + .quad sys_fchdir | |
19975 | + .quad quiet_ni_syscall /* bdflush */ | |
19976 | + .quad sys_sysfs /* 135 */ | |
19977 | + .quad sys_personality | |
19978 | + .quad quiet_ni_syscall /* for afs_syscall */ | |
19979 | + .quad sys_setfsuid16 | |
19980 | + .quad sys_setfsgid16 | |
19981 | + .quad sys_llseek /* 140 */ | |
19982 | + .quad compat_sys_getdents | |
19983 | + .quad compat_sys_select | |
19984 | + .quad sys_flock | |
19985 | + .quad sys_msync | |
19986 | + .quad compat_sys_readv /* 145 */ | |
19987 | + .quad compat_sys_writev | |
19988 | + .quad sys_getsid | |
19989 | + .quad sys_fdatasync | |
19990 | + .quad sys32_sysctl /* sysctl */ | |
19991 | + .quad sys_mlock /* 150 */ | |
19992 | + .quad sys_munlock | |
19993 | + .quad sys_mlockall | |
19994 | + .quad sys_munlockall | |
19995 | + .quad sys_sched_setparam | |
19996 | + .quad sys_sched_getparam /* 155 */ | |
19997 | + .quad sys_sched_setscheduler | |
19998 | + .quad sys_sched_getscheduler | |
19999 | + .quad sys_sched_yield | |
20000 | + .quad sys_sched_get_priority_max | |
20001 | + .quad sys_sched_get_priority_min /* 160 */ | |
20002 | + .quad sys_sched_rr_get_interval | |
20003 | + .quad compat_sys_nanosleep | |
20004 | + .quad sys_mremap | |
20005 | + .quad sys_setresuid16 | |
20006 | + .quad sys_getresuid16 /* 165 */ | |
20007 | + .quad sys32_vm86_warning /* vm86 */ | |
20008 | + .quad quiet_ni_syscall /* query_module */ | |
20009 | + .quad sys_poll | |
20010 | + .quad compat_sys_nfsservctl | |
20011 | + .quad sys_setresgid16 /* 170 */ | |
20012 | + .quad sys_getresgid16 | |
20013 | + .quad sys_prctl | |
20014 | + .quad stub32_rt_sigreturn | |
20015 | + .quad sys32_rt_sigaction | |
20016 | + .quad sys32_rt_sigprocmask /* 175 */ | |
20017 | + .quad sys32_rt_sigpending | |
20018 | + .quad compat_sys_rt_sigtimedwait | |
20019 | + .quad sys32_rt_sigqueueinfo | |
20020 | + .quad stub32_rt_sigsuspend | |
20021 | + .quad sys32_pread /* 180 */ | |
20022 | + .quad sys32_pwrite | |
20023 | + .quad sys_chown16 | |
20024 | + .quad sys_getcwd | |
20025 | + .quad sys_capget | |
20026 | + .quad sys_capset | |
20027 | + .quad stub32_sigaltstack | |
20028 | + .quad sys32_sendfile | |
20029 | + .quad quiet_ni_syscall /* streams1 */ | |
20030 | + .quad quiet_ni_syscall /* streams2 */ | |
20031 | + .quad stub32_vfork /* 190 */ | |
20032 | + .quad compat_sys_getrlimit | |
20033 | + .quad sys32_mmap2 | |
20034 | + .quad sys32_truncate64 | |
20035 | + .quad sys32_ftruncate64 | |
20036 | + .quad sys32_stat64 /* 195 */ | |
20037 | + .quad sys32_lstat64 | |
20038 | + .quad sys32_fstat64 | |
20039 | + .quad sys_lchown | |
20040 | + .quad sys_getuid | |
20041 | + .quad sys_getgid /* 200 */ | |
20042 | + .quad sys_geteuid | |
20043 | + .quad sys_getegid | |
20044 | + .quad sys_setreuid | |
20045 | + .quad sys_setregid | |
20046 | + .quad sys_getgroups /* 205 */ | |
20047 | + .quad sys_setgroups | |
20048 | + .quad sys_fchown | |
20049 | + .quad sys_setresuid | |
20050 | + .quad sys_getresuid | |
20051 | + .quad sys_setresgid /* 210 */ | |
20052 | + .quad sys_getresgid | |
20053 | + .quad sys_chown | |
20054 | + .quad sys_setuid | |
20055 | + .quad sys_setgid | |
20056 | + .quad sys_setfsuid /* 215 */ | |
20057 | + .quad sys_setfsgid | |
20058 | + .quad sys_pivot_root | |
20059 | + .quad sys_mincore | |
20060 | + .quad sys_madvise | |
20061 | + .quad compat_sys_getdents64 /* 220 getdents64 */ | |
20062 | + .quad compat_sys_fcntl64 | |
20063 | + .quad quiet_ni_syscall /* tux */ | |
20064 | + .quad quiet_ni_syscall /* security */ | |
20065 | + .quad sys_gettid | |
20066 | + .quad sys_readahead /* 225 */ | |
20067 | + .quad sys_setxattr | |
20068 | + .quad sys_lsetxattr | |
20069 | + .quad sys_fsetxattr | |
20070 | + .quad sys_getxattr | |
20071 | + .quad sys_lgetxattr /* 230 */ | |
20072 | + .quad sys_fgetxattr | |
20073 | + .quad sys_listxattr | |
20074 | + .quad sys_llistxattr | |
20075 | + .quad sys_flistxattr | |
20076 | + .quad sys_removexattr /* 235 */ | |
20077 | + .quad sys_lremovexattr | |
20078 | + .quad sys_fremovexattr | |
20079 | + .quad sys_tkill | |
20080 | + .quad sys_sendfile64 | |
20081 | + .quad compat_sys_futex /* 240 */ | |
20082 | + .quad compat_sys_sched_setaffinity | |
20083 | + .quad compat_sys_sched_getaffinity | |
20084 | + .quad sys32_set_thread_area | |
20085 | + .quad sys32_get_thread_area | |
20086 | + .quad compat_sys_io_setup /* 245 */ | |
20087 | + .quad sys_io_destroy | |
20088 | + .quad compat_sys_io_getevents | |
20089 | + .quad compat_sys_io_submit | |
20090 | + .quad sys_io_cancel | |
20091 | + .quad sys_fadvise64 /* 250 */ | |
20092 | + .quad quiet_ni_syscall /* free_huge_pages */ | |
20093 | + .quad sys_exit_group | |
20094 | + .quad sys32_lookup_dcookie | |
20095 | + .quad sys_epoll_create | |
20096 | + .quad sys_epoll_ctl /* 255 */ | |
20097 | + .quad sys_epoll_wait | |
20098 | + .quad sys_remap_file_pages | |
20099 | + .quad sys_set_tid_address | |
20100 | + .quad compat_sys_timer_create | |
20101 | + .quad compat_sys_timer_settime /* 260 */ | |
20102 | + .quad compat_sys_timer_gettime | |
20103 | + .quad sys_timer_getoverrun | |
20104 | + .quad sys_timer_delete | |
20105 | + .quad compat_sys_clock_settime | |
20106 | + .quad compat_sys_clock_gettime /* 265 */ | |
20107 | + .quad compat_sys_clock_getres | |
20108 | + .quad compat_sys_clock_nanosleep | |
20109 | + .quad compat_sys_statfs64 | |
20110 | + .quad compat_sys_fstatfs64 | |
20111 | + .quad sys_tgkill /* 270 */ | |
20112 | + .quad compat_sys_utimes | |
20113 | + .quad sys32_fadvise64_64 | |
20114 | + .quad quiet_ni_syscall /* sys_vserver */ | |
20115 | + .quad sys_mbind | |
20116 | + .quad compat_sys_get_mempolicy /* 275 */ | |
20117 | + .quad sys_set_mempolicy | |
20118 | + .quad compat_sys_mq_open | |
20119 | + .quad sys_mq_unlink | |
20120 | + .quad compat_sys_mq_timedsend | |
20121 | + .quad compat_sys_mq_timedreceive /* 280 */ | |
20122 | + .quad compat_sys_mq_notify | |
20123 | + .quad compat_sys_mq_getsetattr | |
20124 | + .quad compat_sys_kexec_load /* reserved for kexec */ | |
20125 | + .quad compat_sys_waitid | |
20126 | + .quad quiet_ni_syscall /* 285: sys_altroot */ | |
20127 | + .quad sys_add_key | |
20128 | + .quad sys_request_key | |
20129 | + .quad sys_keyctl | |
20130 | + .quad sys_ioprio_set | |
20131 | + .quad sys_ioprio_get /* 290 */ | |
20132 | + .quad sys_inotify_init | |
20133 | + .quad sys_inotify_add_watch | |
20134 | + .quad sys_inotify_rm_watch | |
20135 | + .quad sys_migrate_pages | |
20136 | + .quad compat_sys_openat /* 295 */ | |
20137 | + .quad sys_mkdirat | |
20138 | + .quad sys_mknodat | |
20139 | + .quad sys_fchownat | |
20140 | + .quad compat_sys_futimesat | |
20141 | + .quad sys32_fstatat /* 300 */ | |
20142 | + .quad sys_unlinkat | |
20143 | + .quad sys_renameat | |
20144 | + .quad sys_linkat | |
20145 | + .quad sys_symlinkat | |
20146 | + .quad sys_readlinkat /* 305 */ | |
20147 | + .quad sys_fchmodat | |
20148 | + .quad sys_faccessat | |
20149 | + .quad quiet_ni_syscall /* pselect6 for now */ | |
20150 | + .quad quiet_ni_syscall /* ppoll for now */ | |
20151 | + .quad sys_unshare /* 310 */ | |
20152 | + .quad compat_sys_set_robust_list | |
20153 | + .quad compat_sys_get_robust_list | |
20154 | + .quad sys_splice | |
20155 | + .quad sys_sync_file_range | |
20156 | + .quad sys_tee | |
20157 | + .quad compat_sys_vmsplice | |
20158 | + .quad compat_sys_move_pages | |
20159 | +ia32_syscall_end: | |
20160 | Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c | |
20161 | =================================================================== | |
20162 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
20163 | +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200 | |
20164 | @@ -0,0 +1,146 @@ | |
20165 | +/* | |
20166 | + * acpi.c - Architecture-Specific Low-Level ACPI Support | |
20167 | + * | |
20168 | + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> | |
20169 | + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> | |
20170 | + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> | |
20171 | + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) | |
20172 | + * Copyright (C) 2003 Pavel Machek, SuSE Labs | |
20173 | + * | |
20174 | + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
20175 | + * | |
20176 | + * This program is free software; you can redistribute it and/or modify | |
20177 | + * it under the terms of the GNU General Public License as published by | |
20178 | + * the Free Software Foundation; either version 2 of the License, or | |
20179 | + * (at your option) any later version. | |
20180 | + * | |
20181 | + * This program is distributed in the hope that it will be useful, | |
20182 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20183 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20184 | + * GNU General Public License for more details. | |
20185 | + * | |
20186 | + * You should have received a copy of the GNU General Public License | |
20187 | + * along with this program; if not, write to the Free Software | |
20188 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
20189 | + * | |
20190 | + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
20191 | + */ | |
20192 | + | |
20193 | +#include <linux/kernel.h> | |
20194 | +#include <linux/init.h> | |
20195 | +#include <linux/types.h> | |
20196 | +#include <linux/stddef.h> | |
20197 | +#include <linux/slab.h> | |
20198 | +#include <linux/pci.h> | |
20199 | +#include <linux/bootmem.h> | |
20200 | +#include <linux/acpi.h> | |
20201 | +#include <linux/cpumask.h> | |
20202 | + | |
20203 | +#include <asm/mpspec.h> | |
20204 | +#include <asm/io.h> | |
20205 | +#include <asm/apic.h> | |
20206 | +#include <asm/apicdef.h> | |
20207 | +#include <asm/page.h> | |
20208 | +#include <asm/pgtable.h> | |
20209 | +#include <asm/pgalloc.h> | |
20210 | +#include <asm/io_apic.h> | |
20211 | +#include <asm/proto.h> | |
20212 | +#include <asm/tlbflush.h> | |
20213 | + | |
20214 | +/* -------------------------------------------------------------------------- | |
20215 | + Low-Level Sleep Support | |
20216 | + -------------------------------------------------------------------------- */ | |
20217 | + | |
20218 | +#ifdef CONFIG_ACPI_SLEEP | |
20219 | + | |
20220 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
20221 | +/* address in low memory of the wakeup routine. */ | |
20222 | +unsigned long acpi_wakeup_address = 0; | |
20223 | +unsigned long acpi_video_flags; | |
20224 | +extern char wakeup_start, wakeup_end; | |
20225 | + | |
20226 | +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); | |
20227 | + | |
20228 | +static pgd_t low_ptr; | |
20229 | + | |
20230 | +static void init_low_mapping(void) | |
20231 | +{ | |
20232 | + pgd_t *slot0 = pgd_offset(current->mm, 0UL); | |
20233 | + low_ptr = *slot0; | |
20234 | + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); | |
20235 | + WARN_ON(num_online_cpus() != 1); | |
20236 | + local_flush_tlb(); | |
20237 | +} | |
20238 | +#endif | |
20239 | + | |
20240 | +/** | |
20241 | + * acpi_save_state_mem - save kernel state | |
20242 | + * | |
20243 | + * Create an identity mapped page table and copy the wakeup routine to | |
20244 | + * low memory. | |
20245 | + */ | |
20246 | +int acpi_save_state_mem(void) | |
20247 | +{ | |
20248 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
20249 | + init_low_mapping(); | |
20250 | + | |
20251 | + memcpy((void *)acpi_wakeup_address, &wakeup_start, | |
20252 | + &wakeup_end - &wakeup_start); | |
20253 | + acpi_copy_wakeup_routine(acpi_wakeup_address); | |
20254 | +#endif | |
20255 | + return 0; | |
20256 | +} | |
20257 | + | |
20258 | +/* | |
20259 | + * acpi_restore_state | |
20260 | + */ | |
20261 | +void acpi_restore_state_mem(void) | |
20262 | +{ | |
20263 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
20264 | + set_pgd(pgd_offset(current->mm, 0UL), low_ptr); | |
20265 | + local_flush_tlb(); | |
20266 | +#endif | |
20267 | +} | |
20268 | + | |
20269 | +/** | |
20270 | + * acpi_reserve_bootmem - do _very_ early ACPI initialisation | |
20271 | + * | |
20272 | + * We allocate a page in low memory for the wakeup | |
20273 | + * routine for when we come back from a sleep state. The | |
20274 | + * runtime allocator allows specification of <16M pages, but not | |
20275 | + * <1M pages. | |
20276 | + */ | |
20277 | +void __init acpi_reserve_bootmem(void) | |
20278 | +{ | |
20279 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
20280 | + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); | |
20281 | + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) | |
20282 | + printk(KERN_CRIT | |
20283 | + "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); | |
20284 | +#endif | |
20285 | +} | |
20286 | + | |
20287 | +#ifndef CONFIG_ACPI_PV_SLEEP | |
20288 | +static int __init acpi_sleep_setup(char *str) | |
20289 | +{ | |
20290 | + while ((str != NULL) && (*str != '\0')) { | |
20291 | + if (strncmp(str, "s3_bios", 7) == 0) | |
20292 | + acpi_video_flags = 1; | |
20293 | + if (strncmp(str, "s3_mode", 7) == 0) | |
20294 | + acpi_video_flags |= 2; | |
20295 | + str = strchr(str, ','); | |
20296 | + if (str != NULL) | |
20297 | + str += strspn(str, ", \t"); | |
20298 | + } | |
20299 | + | |
20300 | + return 1; | |
20301 | +} | |
20302 | + | |
20303 | +__setup("acpi_sleep=", acpi_sleep_setup); | |
20304 | +#endif /* CONFIG_ACPI_PV_SLEEP */ | |
20305 | + | |
20306 | +#endif /*CONFIG_ACPI_SLEEP */ | |
20307 | + | |
20308 | +void acpi_pci_link_exit(void) | |
20309 | +{ | |
20310 | +} | |
20311 | Index: head-2008-11-25/arch/x86/kernel/apic_64-xen.c | |
20312 | =================================================================== | |
20313 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
20314 | +++ head-2008-11-25/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
20315 | @@ -0,0 +1,197 @@ | |
20316 | +/* | |
20317 | + * Local APIC handling, local APIC timers | |
20318 | + * | |
20319 | + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | |
20320 | + * | |
20321 | + * Fixes | |
20322 | + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | |
20323 | + * thanks to Eric Gilmore | |
20324 | + * and Rolf G. Tews | |
20325 | + * for testing these extensively. | |
20326 | + * Maciej W. Rozycki : Various updates and fixes. | |
20327 | + * Mikael Pettersson : Power Management for UP-APIC. | |
20328 | + * Pavel Machek and | |
20329 | + * Mikael Pettersson : PM converted to driver model. | |
20330 | + */ | |
20331 | + | |
20332 | +#include <linux/init.h> | |
20333 | + | |
20334 | +#include <linux/mm.h> | |
20335 | +#include <linux/delay.h> | |
20336 | +#include <linux/bootmem.h> | |
20337 | +#include <linux/smp_lock.h> | |
20338 | +#include <linux/interrupt.h> | |
20339 | +#include <linux/mc146818rtc.h> | |
20340 | +#include <linux/kernel_stat.h> | |
20341 | +#include <linux/sysdev.h> | |
20342 | +#include <linux/module.h> | |
20343 | + | |
20344 | +#include <asm/atomic.h> | |
20345 | +#include <asm/smp.h> | |
20346 | +#include <asm/mtrr.h> | |
20347 | +#include <asm/mpspec.h> | |
20348 | +#include <asm/desc.h> | |
20349 | +#include <asm/arch_hooks.h> | |
20350 | +#include <asm/hpet.h> | |
20351 | +#include <asm/idle.h> | |
20352 | + | |
20353 | +int apic_verbosity; | |
20354 | + | |
20355 | +/* | |
20356 | + * 'what should we do if we get a hw irq event on an illegal vector'. | |
20357 | + * each architecture has to answer this themselves. | |
20358 | + */ | |
20359 | +void ack_bad_irq(unsigned int irq) | |
20360 | +{ | |
20361 | + printk("unexpected IRQ trap at vector %02x\n", irq); | |
20362 | + /* | |
20363 | + * Currently unexpected vectors happen only on SMP and APIC. | |
20364 | + * We _must_ ack these because every local APIC has only N | |
20365 | + * irq slots per priority level, and a 'hanging, unacked' IRQ | |
20366 | + * holds up an irq slot - in excessive cases (when multiple | |
20367 | + * unexpected vectors occur) that might lock up the APIC | |
20368 | + * completely. | |
20369 | + * But don't ack when the APIC is disabled. -AK | |
20370 | + */ | |
20371 | + if (!disable_apic) | |
20372 | + ack_APIC_irq(); | |
20373 | +} | |
20374 | + | |
20375 | +int setup_profiling_timer(unsigned int multiplier) | |
20376 | +{ | |
20377 | + return -EINVAL; | |
20378 | +} | |
20379 | + | |
20380 | +void smp_local_timer_interrupt(struct pt_regs *regs) | |
20381 | +{ | |
20382 | + profile_tick(CPU_PROFILING, regs); | |
20383 | +#ifndef CONFIG_XEN | |
20384 | +#ifdef CONFIG_SMP | |
20385 | + update_process_times(user_mode(regs)); | |
20386 | +#endif | |
20387 | +#endif | |
20388 | + /* | |
20389 | + * We take the 'long' return path, and there every subsystem | |
20390 | + * grabs the appropriate locks (kernel lock/ irq lock). | |
20391 | + * | |
20392 | + * we might want to decouple profiling from the 'long path', | |
20393 | + * and do the profiling totally in assembly. | |
20394 | + * | |
20395 | + * Currently this isn't too much of an issue (performance wise), | |
20396 | + * we can take more than 100K local irqs per second on a 100 MHz P5. | |
20397 | + */ | |
20398 | +} | |
20399 | + | |
20400 | +/* | |
20401 | + * Local APIC timer interrupt. This is the most natural way for doing | |
20402 | + * local interrupts, but local timer interrupts can be emulated by | |
20403 | + * broadcast interrupts too. [in case the hw doesn't support APIC timers] | |
20404 | + * | |
20405 | + * [ if a single-CPU system runs an SMP kernel then we call the local | |
20406 | + * interrupt as well. Thus we cannot inline the local irq ... ] | |
20407 | + */ | |
20408 | +void smp_apic_timer_interrupt(struct pt_regs *regs) | |
20409 | +{ | |
20410 | + /* | |
20411 | + * the NMI deadlock-detector uses this. | |
20412 | + */ | |
20413 | + add_pda(apic_timer_irqs, 1); | |
20414 | + | |
20415 | + /* | |
20416 | + * NOTE! We'd better ACK the irq immediately, | |
20417 | + * because timer handling can be slow. | |
20418 | + */ | |
20419 | + ack_APIC_irq(); | |
20420 | + /* | |
20421 | + * update_process_times() expects us to have done irq_enter(). | |
20422 | + * Besides, if we don't timer interrupts ignore the global | |
20423 | + * interrupt lock, which is the WrongThing (tm) to do. | |
20424 | + */ | |
20425 | + exit_idle(); | |
20426 | + irq_enter(); | |
20427 | + smp_local_timer_interrupt(regs); | |
20428 | + irq_exit(); | |
20429 | +} | |
20430 | + | |
20431 | +/* | |
20432 | + * This interrupt should _never_ happen with our APIC/SMP architecture | |
20433 | + */ | |
20434 | +asmlinkage void smp_spurious_interrupt(void) | |
20435 | +{ | |
20436 | + unsigned int v; | |
20437 | + exit_idle(); | |
20438 | + irq_enter(); | |
20439 | + /* | |
20440 | + * Check if this really is a spurious interrupt and ACK it | |
20441 | + * if it is a vectored one. Just in case... | |
20442 | + * Spurious interrupts should not be ACKed. | |
20443 | + */ | |
20444 | + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | |
20445 | + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | |
20446 | + ack_APIC_irq(); | |
20447 | + | |
20448 | +#if 0 | |
20449 | + static unsigned long last_warning; | |
20450 | + static unsigned long skipped; | |
20451 | + | |
20452 | + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ | |
20453 | + if (time_before(last_warning+30*HZ,jiffies)) { | |
20454 | + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", | |
20455 | + smp_processor_id(), skipped); | |
20456 | + last_warning = jiffies; | |
20457 | + skipped = 0; | |
20458 | + } else { | |
20459 | + skipped++; | |
20460 | + } | |
20461 | +#endif | |
20462 | + irq_exit(); | |
20463 | +} | |
20464 | + | |
20465 | +/* | |
20466 | + * This interrupt should never happen with our APIC/SMP architecture | |
20467 | + */ | |
20468 | + | |
20469 | +asmlinkage void smp_error_interrupt(void) | |
20470 | +{ | |
20471 | + unsigned int v, v1; | |
20472 | + | |
20473 | + exit_idle(); | |
20474 | + irq_enter(); | |
20475 | + /* First tickle the hardware, only then report what went on. -- REW */ | |
20476 | + v = apic_read(APIC_ESR); | |
20477 | + apic_write(APIC_ESR, 0); | |
20478 | + v1 = apic_read(APIC_ESR); | |
20479 | + ack_APIC_irq(); | |
20480 | + atomic_inc(&irq_err_count); | |
20481 | + | |
20482 | + /* Here is what the APIC error bits mean: | |
20483 | + 0: Send CS error | |
20484 | + 1: Receive CS error | |
20485 | + 2: Send accept error | |
20486 | + 3: Receive accept error | |
20487 | + 4: Reserved | |
20488 | + 5: Send illegal vector | |
20489 | + 6: Received illegal vector | |
20490 | + 7: Illegal register address | |
20491 | + */ | |
20492 | + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", | |
20493 | + smp_processor_id(), v , v1); | |
20494 | + irq_exit(); | |
20495 | +} | |
20496 | + | |
20497 | +int disable_apic; | |
20498 | + | |
20499 | +/* | |
20500 | + * This initializes the IO-APIC and APIC hardware if this is | |
20501 | + * a UP kernel. | |
20502 | + */ | |
20503 | +int __init APIC_init_uniprocessor (void) | |
20504 | +{ | |
20505 | +#ifdef CONFIG_X86_IO_APIC | |
20506 | + if (smp_found_config) | |
20507 | + if (!skip_ioapic_setup && nr_ioapics) | |
20508 | + setup_IO_APIC(); | |
20509 | +#endif | |
20510 | + | |
20511 | + return 1; | |
20512 | +} | |
20513 | Index: head-2008-11-25/arch/x86/kernel/e820_64-xen.c | |
20514 | =================================================================== | |
20515 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
20516 | +++ head-2008-11-25/arch/x86/kernel/e820_64-xen.c 2008-04-22 19:56:27.000000000 +0200 | |
20517 | @@ -0,0 +1,798 @@ | |
20518 | +/* | |
20519 | + * Handle the memory map. | |
20520 | + * The functions here do the job until bootmem takes over. | |
20521 | + * | |
20522 | + * Getting sanitize_e820_map() in sync with i386 version by applying change: | |
20523 | + * - Provisions for empty E820 memory regions (reported by certain BIOSes). | |
20524 | + * Alex Achenbach <xela@slit.de>, December 2002. | |
20525 | + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | |
20526 | + * | |
20527 | + */ | |
20528 | +#include <linux/kernel.h> | |
20529 | +#include <linux/types.h> | |
20530 | +#include <linux/init.h> | |
20531 | +#include <linux/bootmem.h> | |
20532 | +#include <linux/ioport.h> | |
20533 | +#include <linux/string.h> | |
20534 | +#include <linux/kexec.h> | |
20535 | +#include <linux/module.h> | |
20536 | + | |
20537 | +#include <asm/pgtable.h> | |
20538 | +#include <asm/page.h> | |
20539 | +#include <asm/e820.h> | |
20540 | +#include <asm/proto.h> | |
20541 | +#include <asm/bootsetup.h> | |
20542 | +#include <asm/sections.h> | |
20543 | +#include <xen/interface/memory.h> | |
20544 | + | |
20545 | +/* | |
20546 | + * PFN of last memory page. | |
20547 | + */ | |
20548 | +unsigned long end_pfn; | |
20549 | +EXPORT_SYMBOL(end_pfn); | |
20550 | + | |
20551 | +/* | |
20552 | + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | |
20553 | + * The direct mapping extends to end_pfn_map, so that we can directly access | |
20554 | + * apertures, ACPI and other tables without having to play with fixmaps. | |
20555 | + */ | |
20556 | +unsigned long end_pfn_map; | |
20557 | + | |
20558 | +/* | |
20559 | + * Last pfn which the user wants to use. | |
20560 | + */ | |
20561 | +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; | |
20562 | + | |
20563 | +extern struct resource code_resource, data_resource; | |
20564 | + | |
20565 | +#ifdef CONFIG_XEN | |
20566 | +extern struct e820map machine_e820; | |
20567 | +#endif | |
20568 | + | |
20569 | +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ | |
20570 | +static inline int bad_addr(unsigned long *addrp, unsigned long size) | |
20571 | +{ | |
20572 | + unsigned long addr = *addrp, last = addr + size; | |
20573 | + | |
20574 | +#ifndef CONFIG_XEN | |
20575 | + /* various gunk below that needed for SMP startup */ | |
20576 | + if (addr < 0x8000) { | |
20577 | + *addrp = 0x8000; | |
20578 | + return 1; | |
20579 | + } | |
20580 | + | |
20581 | + /* direct mapping tables of the kernel */ | |
20582 | + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | |
20583 | + *addrp = table_end << PAGE_SHIFT; | |
20584 | + return 1; | |
20585 | + } | |
20586 | + | |
20587 | + /* initrd */ | |
20588 | +#ifdef CONFIG_BLK_DEV_INITRD | |
20589 | + if (LOADER_TYPE && INITRD_START && last >= INITRD_START && | |
20590 | + addr < INITRD_START+INITRD_SIZE) { | |
20591 | + *addrp = INITRD_START + INITRD_SIZE; | |
20592 | + return 1; | |
20593 | + } | |
20594 | +#endif | |
20595 | + /* kernel code + 640k memory hole (later should not be needed, but | |
20596 | + be paranoid for now) */ | |
20597 | + if (last >= 640*1024 && addr < 1024*1024) { | |
20598 | + *addrp = 1024*1024; | |
20599 | + return 1; | |
20600 | + } | |
20601 | + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { | |
20602 | + *addrp = __pa_symbol(&_end); | |
20603 | + return 1; | |
20604 | + } | |
20605 | + | |
20606 | + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { | |
20607 | + *addrp = ebda_addr + ebda_size; | |
20608 | + return 1; | |
20609 | + } | |
20610 | + | |
20611 | + /* XXX ramdisk image here? */ | |
20612 | +#else | |
20613 | + if (last < (table_end<<PAGE_SHIFT)) { | |
20614 | + *addrp = table_end << PAGE_SHIFT; | |
20615 | + return 1; | |
20616 | + } | |
20617 | +#endif | |
20618 | + return 0; | |
20619 | +} | |
20620 | + | |
20621 | +/* | |
20622 | + * This function checks if any part of the range <start,end> is mapped | |
20623 | + * with type. | |
20624 | + */ | |
20625 | +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | |
20626 | +{ | |
20627 | + int i; | |
20628 | + | |
20629 | +#ifndef CONFIG_XEN | |
20630 | + for (i = 0; i < e820.nr_map; i++) { | |
20631 | + struct e820entry *ei = &e820.map[i]; | |
20632 | +#else | |
20633 | + if (!is_initial_xendomain()) | |
20634 | + return 0; | |
20635 | + for (i = 0; i < machine_e820.nr_map; i++) { | |
20636 | + const struct e820entry *ei = &machine_e820.map[i]; | |
20637 | +#endif | |
20638 | + | |
20639 | + if (type && ei->type != type) | |
20640 | + continue; | |
20641 | + if (ei->addr >= end || ei->addr + ei->size <= start) | |
20642 | + continue; | |
20643 | + return 1; | |
20644 | + } | |
20645 | + return 0; | |
20646 | +} | |
20647 | +EXPORT_SYMBOL_GPL(e820_any_mapped); | |
20648 | + | |
20649 | +/* | |
20650 | + * This function checks if the entire range <start,end> is mapped with type. | |
20651 | + * | |
20652 | + * Note: this function only works correct if the e820 table is sorted and | |
20653 | + * not-overlapping, which is the case | |
20654 | + */ | |
20655 | +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) | |
20656 | +{ | |
20657 | + int i; | |
20658 | + | |
20659 | +#ifndef CONFIG_XEN | |
20660 | + for (i = 0; i < e820.nr_map; i++) { | |
20661 | + struct e820entry *ei = &e820.map[i]; | |
20662 | +#else | |
20663 | + if (!is_initial_xendomain()) | |
20664 | + return 0; | |
20665 | + for (i = 0; i < machine_e820.nr_map; i++) { | |
20666 | + const struct e820entry *ei = &machine_e820.map[i]; | |
20667 | +#endif | |
20668 | + | |
20669 | + if (type && ei->type != type) | |
20670 | + continue; | |
20671 | + /* is the region (part) in overlap with the current region ?*/ | |
20672 | + if (ei->addr >= end || ei->addr + ei->size <= start) | |
20673 | + continue; | |
20674 | + | |
20675 | + /* if the region is at the beginning of <start,end> we move | |
20676 | + * start to the end of the region since it's ok until there | |
20677 | + */ | |
20678 | + if (ei->addr <= start) | |
20679 | + start = ei->addr + ei->size; | |
20680 | + /* if start is now at or beyond end, we're done, full coverage */ | |
20681 | + if (start >= end) | |
20682 | + return 1; /* we're done */ | |
20683 | + } | |
20684 | + return 0; | |
20685 | +} | |
20686 | + | |
20687 | +/* | |
20688 | + * Find a free area in a specific range. | |
20689 | + */ | |
20690 | +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | |
20691 | +{ | |
20692 | + int i; | |
20693 | + for (i = 0; i < e820.nr_map; i++) { | |
20694 | + struct e820entry *ei = &e820.map[i]; | |
20695 | + unsigned long addr = ei->addr, last; | |
20696 | + if (ei->type != E820_RAM) | |
20697 | + continue; | |
20698 | + if (addr < start) | |
20699 | + addr = start; | |
20700 | + if (addr > ei->addr + ei->size) | |
20701 | + continue; | |
20702 | + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) | |
20703 | + ; | |
20704 | + last = addr + size; | |
20705 | + if (last > ei->addr + ei->size) | |
20706 | + continue; | |
20707 | + if (last > end) | |
20708 | + continue; | |
20709 | + return addr; | |
20710 | + } | |
20711 | + return -1UL; | |
20712 | +} | |
20713 | + | |
20714 | +/* | |
20715 | + * Free bootmem based on the e820 table for a node. | |
20716 | + */ | |
20717 | +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) | |
20718 | +{ | |
20719 | + int i; | |
20720 | + for (i = 0; i < e820.nr_map; i++) { | |
20721 | + struct e820entry *ei = &e820.map[i]; | |
20722 | + unsigned long last, addr; | |
20723 | + | |
20724 | + if (ei->type != E820_RAM || | |
20725 | + ei->addr+ei->size <= start || | |
20726 | + ei->addr >= end) | |
20727 | + continue; | |
20728 | + | |
20729 | + addr = round_up(ei->addr, PAGE_SIZE); | |
20730 | + if (addr < start) | |
20731 | + addr = start; | |
20732 | + | |
20733 | + last = round_down(ei->addr + ei->size, PAGE_SIZE); | |
20734 | + if (last >= end) | |
20735 | + last = end; | |
20736 | + | |
20737 | + if (last > addr && last-addr >= PAGE_SIZE) | |
20738 | + free_bootmem_node(pgdat, addr, last-addr); | |
20739 | + } | |
20740 | +} | |
20741 | + | |
20742 | +/* | |
20743 | + * Find the highest page frame number we have available | |
20744 | + */ | |
20745 | +unsigned long __init e820_end_of_ram(void) | |
20746 | +{ | |
20747 | + int i; | |
20748 | + unsigned long end_pfn = 0; | |
20749 | + | |
20750 | + for (i = 0; i < e820.nr_map; i++) { | |
20751 | + struct e820entry *ei = &e820.map[i]; | |
20752 | + unsigned long start, end; | |
20753 | + | |
20754 | + start = round_up(ei->addr, PAGE_SIZE); | |
20755 | + end = round_down(ei->addr + ei->size, PAGE_SIZE); | |
20756 | + if (start >= end) | |
20757 | + continue; | |
20758 | + if (ei->type == E820_RAM) { | |
20759 | + if (end > end_pfn<<PAGE_SHIFT) | |
20760 | + end_pfn = end>>PAGE_SHIFT; | |
20761 | + } else { | |
20762 | + if (end > end_pfn_map<<PAGE_SHIFT) | |
20763 | + end_pfn_map = end>>PAGE_SHIFT; | |
20764 | + } | |
20765 | + } | |
20766 | + | |
20767 | + if (end_pfn > end_pfn_map) | |
20768 | + end_pfn_map = end_pfn; | |
20769 | + if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | |
20770 | + end_pfn_map = MAXMEM>>PAGE_SHIFT; | |
20771 | + if (end_pfn > end_user_pfn) | |
20772 | + end_pfn = end_user_pfn; | |
20773 | + if (end_pfn > end_pfn_map) | |
20774 | + end_pfn = end_pfn_map; | |
20775 | + | |
20776 | + return end_pfn; | |
20777 | +} | |
20778 | + | |
20779 | +/* | |
20780 | + * Compute how much memory is missing in a range. | |
20781 | + * Unlike the other functions in this file the arguments are in page numbers. | |
20782 | + */ | |
20783 | +unsigned long __init | |
20784 | +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) | |
20785 | +{ | |
20786 | + unsigned long ram = 0; | |
20787 | + unsigned long start = start_pfn << PAGE_SHIFT; | |
20788 | + unsigned long end = end_pfn << PAGE_SHIFT; | |
20789 | + int i; | |
20790 | + for (i = 0; i < e820.nr_map; i++) { | |
20791 | + struct e820entry *ei = &e820.map[i]; | |
20792 | + unsigned long last, addr; | |
20793 | + | |
20794 | + if (ei->type != E820_RAM || | |
20795 | + ei->addr+ei->size <= start || | |
20796 | + ei->addr >= end) | |
20797 | + continue; | |
20798 | + | |
20799 | + addr = round_up(ei->addr, PAGE_SIZE); | |
20800 | + if (addr < start) | |
20801 | + addr = start; | |
20802 | + | |
20803 | + last = round_down(ei->addr + ei->size, PAGE_SIZE); | |
20804 | + if (last >= end) | |
20805 | + last = end; | |
20806 | + | |
20807 | + if (last > addr) | |
20808 | + ram += last - addr; | |
20809 | + } | |
20810 | + return ((end - start) - ram) >> PAGE_SHIFT; | |
20811 | +} | |
20812 | + | |
20813 | +/* | |
20814 | + * Mark e820 reserved areas as busy for the resource manager. | |
20815 | + */ | |
20816 | +void __init e820_reserve_resources(struct e820entry *e820, int nr_map) | |
20817 | +{ | |
20818 | + int i; | |
20819 | + for (i = 0; i < nr_map; i++) { | |
20820 | + struct resource *res; | |
20821 | + res = alloc_bootmem_low(sizeof(struct resource)); | |
20822 | + switch (e820[i].type) { | |
20823 | + case E820_RAM: res->name = "System RAM"; break; | |
20824 | + case E820_ACPI: res->name = "ACPI Tables"; break; | |
20825 | + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | |
20826 | + default: res->name = "reserved"; | |
20827 | + } | |
20828 | + res->start = e820[i].addr; | |
20829 | + res->end = res->start + e820[i].size - 1; | |
20830 | + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | |
20831 | + request_resource(&iomem_resource, res); | |
20832 | + if (e820[i].type == E820_RAM) { | |
20833 | + /* | |
20834 | + * We don't know which RAM region contains kernel data, | |
20835 | + * so we try it repeatedly and let the resource manager | |
20836 | + * test it. | |
20837 | + */ | |
20838 | +#ifndef CONFIG_XEN | |
20839 | + request_resource(res, &code_resource); | |
20840 | + request_resource(res, &data_resource); | |
20841 | +#endif | |
20842 | +#ifdef CONFIG_KEXEC | |
20843 | + if (crashk_res.start != crashk_res.end) | |
20844 | + request_resource(res, &crashk_res); | |
20845 | +#ifdef CONFIG_XEN | |
20846 | + xen_machine_kexec_register_resources(res); | |
20847 | +#endif | |
20848 | +#endif | |
20849 | + } | |
20850 | + } | |
20851 | +} | |
20852 | + | |
20853 | +/* | |
20854 | + * Add a memory region to the kernel e820 map. | |
20855 | + */ | |
20856 | +void __init add_memory_region(unsigned long start, unsigned long size, int type) | |
20857 | +{ | |
20858 | + int x = e820.nr_map; | |
20859 | + | |
20860 | + if (x == E820MAX) { | |
20861 | + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | |
20862 | + return; | |
20863 | + } | |
20864 | + | |
20865 | + e820.map[x].addr = start; | |
20866 | + e820.map[x].size = size; | |
20867 | + e820.map[x].type = type; | |
20868 | + e820.nr_map++; | |
20869 | +} | |
20870 | + | |
20871 | +void __init e820_print_map(char *who) | |
20872 | +{ | |
20873 | + int i; | |
20874 | + | |
20875 | + for (i = 0; i < e820.nr_map; i++) { | |
20876 | + printk(" %s: %016Lx - %016Lx ", who, | |
20877 | + (unsigned long long) e820.map[i].addr, | |
20878 | + (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | |
20879 | + switch (e820.map[i].type) { | |
20880 | + case E820_RAM: printk("(usable)\n"); | |
20881 | + break; | |
20882 | + case E820_RESERVED: | |
20883 | + printk("(reserved)\n"); | |
20884 | + break; | |
20885 | + case E820_ACPI: | |
20886 | + printk("(ACPI data)\n"); | |
20887 | + break; | |
20888 | + case E820_NVS: | |
20889 | + printk("(ACPI NVS)\n"); | |
20890 | + break; | |
20891 | + default: printk("type %u\n", e820.map[i].type); | |
20892 | + break; | |
20893 | + } | |
20894 | + } | |
20895 | +} | |
20896 | + | |
20897 | +/* | |
20898 | + * Sanitize the BIOS e820 map. | |
20899 | + * | |
20900 | + * Some e820 responses include overlapping entries. The following | |
20901 | + * replaces the original e820 map with a new one, removing overlaps. | |
20902 | + * | |
20903 | + */ | |
20904 | +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | |
20905 | +{ | |
20906 | + struct change_member { | |
20907 | + struct e820entry *pbios; /* pointer to original bios entry */ | |
20908 | + unsigned long long addr; /* address for this change point */ | |
20909 | + }; | |
20910 | + static struct change_member change_point_list[2*E820MAX] __initdata; | |
20911 | + static struct change_member *change_point[2*E820MAX] __initdata; | |
20912 | + static struct e820entry *overlap_list[E820MAX] __initdata; | |
20913 | + static struct e820entry new_bios[E820MAX] __initdata; | |
20914 | + struct change_member *change_tmp; | |
20915 | + unsigned long current_type, last_type; | |
20916 | + unsigned long long last_addr; | |
20917 | + int chgidx, still_changing; | |
20918 | + int overlap_entries; | |
20919 | + int new_bios_entry; | |
20920 | + int old_nr, new_nr, chg_nr; | |
20921 | + int i; | |
20922 | + | |
20923 | + /* | |
20924 | + Visually we're performing the following (1,2,3,4 = memory types)... | |
20925 | + | |
20926 | + Sample memory map (w/overlaps): | |
20927 | + ____22__________________ | |
20928 | + ______________________4_ | |
20929 | + ____1111________________ | |
20930 | + _44_____________________ | |
20931 | + 11111111________________ | |
20932 | + ____________________33__ | |
20933 | + ___________44___________ | |
20934 | + __________33333_________ | |
20935 | + ______________22________ | |
20936 | + ___________________2222_ | |
20937 | + _________111111111______ | |
20938 | + _____________________11_ | |
20939 | + _________________4______ | |
20940 | + | |
20941 | + Sanitized equivalent (no overlap): | |
20942 | + 1_______________________ | |
20943 | + _44_____________________ | |
20944 | + ___1____________________ | |
20945 | + ____22__________________ | |
20946 | + ______11________________ | |
20947 | + _________1______________ | |
20948 | + __________3_____________ | |
20949 | + ___________44___________ | |
20950 | + _____________33_________ | |
20951 | + _______________2________ | |
20952 | + ________________1_______ | |
20953 | + _________________4______ | |
20954 | + ___________________2____ | |
20955 | + ____________________33__ | |
20956 | + ______________________4_ | |
20957 | + */ | |
20958 | + | |
20959 | + /* if there's only one memory region, don't bother */ | |
20960 | + if (*pnr_map < 2) | |
20961 | + return -1; | |
20962 | + | |
20963 | + old_nr = *pnr_map; | |
20964 | + | |
20965 | + /* bail out if we find any unreasonable addresses in bios map */ | |
20966 | + for (i=0; i<old_nr; i++) | |
20967 | + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | |
20968 | + return -1; | |
20969 | + | |
20970 | + /* create pointers for initial change-point information (for sorting) */ | |
20971 | + for (i=0; i < 2*old_nr; i++) | |
20972 | + change_point[i] = &change_point_list[i]; | |
20973 | + | |
20974 | + /* record all known change-points (starting and ending addresses), | |
20975 | + omitting those that are for empty memory regions */ | |
20976 | + chgidx = 0; | |
20977 | + for (i=0; i < old_nr; i++) { | |
20978 | + if (biosmap[i].size != 0) { | |
20979 | + change_point[chgidx]->addr = biosmap[i].addr; | |
20980 | + change_point[chgidx++]->pbios = &biosmap[i]; | |
20981 | + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | |
20982 | + change_point[chgidx++]->pbios = &biosmap[i]; | |
20983 | + } | |
20984 | + } | |
20985 | + chg_nr = chgidx; | |
20986 | + | |
20987 | + /* sort change-point list by memory addresses (low -> high) */ | |
20988 | + still_changing = 1; | |
20989 | + while (still_changing) { | |
20990 | + still_changing = 0; | |
20991 | + for (i=1; i < chg_nr; i++) { | |
20992 | + /* if <current_addr> > <last_addr>, swap */ | |
20993 | + /* or, if current=<start_addr> & last=<end_addr>, swap */ | |
20994 | + if ((change_point[i]->addr < change_point[i-1]->addr) || | |
20995 | + ((change_point[i]->addr == change_point[i-1]->addr) && | |
20996 | + (change_point[i]->addr == change_point[i]->pbios->addr) && | |
20997 | + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | |
20998 | + ) | |
20999 | + { | |
21000 | + change_tmp = change_point[i]; | |
21001 | + change_point[i] = change_point[i-1]; | |
21002 | + change_point[i-1] = change_tmp; | |
21003 | + still_changing=1; | |
21004 | + } | |
21005 | + } | |
21006 | + } | |
21007 | + | |
21008 | + /* create a new bios memory map, removing overlaps */ | |
21009 | + overlap_entries=0; /* number of entries in the overlap table */ | |
21010 | + new_bios_entry=0; /* index for creating new bios map entries */ | |
21011 | + last_type = 0; /* start with undefined memory type */ | |
21012 | + last_addr = 0; /* start with 0 as last starting address */ | |
21013 | + /* loop through change-points, determining affect on the new bios map */ | |
21014 | + for (chgidx=0; chgidx < chg_nr; chgidx++) | |
21015 | + { | |
21016 | + /* keep track of all overlapping bios entries */ | |
21017 | + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | |
21018 | + { | |
21019 | + /* add map entry to overlap list (> 1 entry implies an overlap) */ | |
21020 | + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | |
21021 | + } | |
21022 | + else | |
21023 | + { | |
21024 | + /* remove entry from list (order independent, so swap with last) */ | |
21025 | + for (i=0; i<overlap_entries; i++) | |
21026 | + { | |
21027 | + if (overlap_list[i] == change_point[chgidx]->pbios) | |
21028 | + overlap_list[i] = overlap_list[overlap_entries-1]; | |
21029 | + } | |
21030 | + overlap_entries--; | |
21031 | + } | |
21032 | + /* if there are overlapping entries, decide which "type" to use */ | |
21033 | + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | |
21034 | + current_type = 0; | |
21035 | + for (i=0; i<overlap_entries; i++) | |
21036 | + if (overlap_list[i]->type > current_type) | |
21037 | + current_type = overlap_list[i]->type; | |
21038 | + /* continue building up new bios map based on this information */ | |
21039 | + if (current_type != last_type) { | |
21040 | + if (last_type != 0) { | |
21041 | + new_bios[new_bios_entry].size = | |
21042 | + change_point[chgidx]->addr - last_addr; | |
21043 | + /* move forward only if the new size was non-zero */ | |
21044 | + if (new_bios[new_bios_entry].size != 0) | |
21045 | + if (++new_bios_entry >= E820MAX) | |
21046 | + break; /* no more space left for new bios entries */ | |
21047 | + } | |
21048 | + if (current_type != 0) { | |
21049 | + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | |
21050 | + new_bios[new_bios_entry].type = current_type; | |
21051 | + last_addr=change_point[chgidx]->addr; | |
21052 | + } | |
21053 | + last_type = current_type; | |
21054 | + } | |
21055 | + } | |
21056 | + new_nr = new_bios_entry; /* retain count for new bios entries */ | |
21057 | + | |
21058 | + /* copy new bios mapping into original location */ | |
21059 | + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | |
21060 | + *pnr_map = new_nr; | |
21061 | + | |
21062 | + return 0; | |
21063 | +} | |
21064 | + | |
21065 | +/* | |
21066 | + * Copy the BIOS e820 map into a safe place. | |
21067 | + * | |
21068 | + * Sanity-check it while we're at it.. | |
21069 | + * | |
21070 | + * If we're lucky and live on a modern system, the setup code | |
21071 | + * will have given us a memory map that we can use to properly | |
21072 | + * set up memory. If we aren't, we'll fake a memory map. | |
21073 | + * | |
21074 | + * We check to see that the memory map contains at least 2 elements | |
21075 | + * before we'll use it, because the detection code in setup.S may | |
21076 | + * not be perfect and most every PC known to man has two memory | |
21077 | + * regions: one from 0 to 640k, and one from 1mb up. (The IBM | |
21078 | + * thinkpad 560x, for example, does not cooperate with the memory | |
21079 | + * detection code.) | |
21080 | + */ | |
21081 | +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | |
21082 | +{ | |
21083 | +#ifndef CONFIG_XEN | |
21084 | + /* Only one memory region (or negative)? Ignore it */ | |
21085 | + if (nr_map < 2) | |
21086 | + return -1; | |
21087 | +#else | |
21088 | + BUG_ON(nr_map < 1); | |
21089 | +#endif | |
21090 | + | |
21091 | + do { | |
21092 | + unsigned long start = biosmap->addr; | |
21093 | + unsigned long size = biosmap->size; | |
21094 | + unsigned long end = start + size; | |
21095 | + unsigned long type = biosmap->type; | |
21096 | + | |
21097 | + /* Overflow in 64 bits? Ignore the memory map. */ | |
21098 | + if (start > end) | |
21099 | + return -1; | |
21100 | + | |
21101 | +#ifndef CONFIG_XEN | |
21102 | + /* | |
21103 | + * Some BIOSes claim RAM in the 640k - 1M region. | |
21104 | + * Not right. Fix it up. | |
21105 | + * | |
21106 | + * This should be removed on Hammer which is supposed to not | |
21107 | + * have non e820 covered ISA mappings there, but I had some strange | |
21108 | + * problems so it stays for now. -AK | |
21109 | + */ | |
21110 | + if (type == E820_RAM) { | |
21111 | + if (start < 0x100000ULL && end > 0xA0000ULL) { | |
21112 | + if (start < 0xA0000ULL) | |
21113 | + add_memory_region(start, 0xA0000ULL-start, type); | |
21114 | + if (end <= 0x100000ULL) | |
21115 | + continue; | |
21116 | + start = 0x100000ULL; | |
21117 | + size = end - start; | |
21118 | + } | |
21119 | + } | |
21120 | +#endif | |
21121 | + | |
21122 | + add_memory_region(start, size, type); | |
21123 | + } while (biosmap++,--nr_map); | |
21124 | + | |
21125 | +#ifdef CONFIG_XEN | |
21126 | + if (is_initial_xendomain()) { | |
21127 | + struct xen_memory_map memmap; | |
21128 | + | |
21129 | + memmap.nr_entries = E820MAX; | |
21130 | + set_xen_guest_handle(memmap.buffer, machine_e820.map); | |
21131 | + | |
21132 | + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) | |
21133 | + BUG(); | |
21134 | + machine_e820.nr_map = memmap.nr_entries; | |
21135 | + } else | |
21136 | + machine_e820 = e820; | |
21137 | +#endif | |
21138 | + | |
21139 | + return 0; | |
21140 | +} | |
21141 | + | |
21142 | +#ifndef CONFIG_XEN | |
21143 | +void __init setup_memory_region(void) | |
21144 | +{ | |
21145 | + char *who = "BIOS-e820"; | |
21146 | + | |
21147 | + /* | |
21148 | + * Try to copy the BIOS-supplied E820-map. | |
21149 | + * | |
21150 | + * Otherwise fake a memory map; one section from 0k->640k, | |
21151 | + * the next section from 1mb->appropriate_mem_k | |
21152 | + */ | |
21153 | + sanitize_e820_map(E820_MAP, &E820_MAP_NR); | |
21154 | + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { | |
21155 | + unsigned long mem_size; | |
21156 | + | |
21157 | + /* compare results from other methods and take the greater */ | |
21158 | + if (ALT_MEM_K < EXT_MEM_K) { | |
21159 | + mem_size = EXT_MEM_K; | |
21160 | + who = "BIOS-88"; | |
21161 | + } else { | |
21162 | + mem_size = ALT_MEM_K; | |
21163 | + who = "BIOS-e801"; | |
21164 | + } | |
21165 | + | |
21166 | + e820.nr_map = 0; | |
21167 | + add_memory_region(0, LOWMEMSIZE(), E820_RAM); | |
21168 | + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); | |
21169 | + } | |
21170 | + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
21171 | + e820_print_map(who); | |
21172 | +} | |
21173 | + | |
21174 | +#else /* CONFIG_XEN */ | |
21175 | + | |
21176 | +void __init setup_memory_region(void) | |
21177 | +{ | |
21178 | + int rc; | |
21179 | + struct xen_memory_map memmap; | |
21180 | + /* | |
21181 | + * This is rather large for a stack variable but this early in | |
21182 | + * the boot process we know we have plenty slack space. | |
21183 | + */ | |
21184 | + struct e820entry map[E820MAX]; | |
21185 | + | |
21186 | + memmap.nr_entries = E820MAX; | |
21187 | + set_xen_guest_handle(memmap.buffer, map); | |
21188 | + | |
21189 | + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); | |
21190 | + if ( rc == -ENOSYS ) { | |
21191 | + memmap.nr_entries = 1; | |
21192 | + map[0].addr = 0ULL; | |
21193 | + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT; | |
21194 | + /* 8MB slack (to balance backend allocations). */ | |
21195 | + map[0].size += 8 << 20; | |
21196 | + map[0].type = E820_RAM; | |
21197 | + rc = 0; | |
21198 | + } | |
21199 | + BUG_ON(rc); | |
21200 | + | |
21201 | + sanitize_e820_map(map, (char *)&memmap.nr_entries); | |
21202 | + | |
21203 | + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0); | |
21204 | + | |
21205 | + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | |
21206 | + e820_print_map("Xen"); | |
21207 | +} | |
21208 | +#endif | |
21209 | + | |
21210 | +void __init parse_memopt(char *p, char **from) | |
21211 | +{ | |
21212 | + int i; | |
21213 | + unsigned long current_end; | |
21214 | + unsigned long end; | |
21215 | + | |
21216 | + end_user_pfn = memparse(p, from); | |
21217 | + end_user_pfn >>= PAGE_SHIFT; | |
21218 | + | |
21219 | + end = end_user_pfn<<PAGE_SHIFT; | |
21220 | + i = e820.nr_map-1; | |
21221 | + current_end = e820.map[i].addr + e820.map[i].size; | |
21222 | + | |
21223 | + if (current_end < end) { | |
21224 | + /* | |
21225 | + * The e820 map ends before our requested size so | |
21226 | + * extend the final entry to the requested address. | |
21227 | + */ | |
21228 | + if (e820.map[i].type == E820_RAM) | |
21229 | + e820.map[i].size = end - e820.map[i].addr; | |
21230 | + else | |
21231 | + add_memory_region(current_end, end - current_end, E820_RAM); | |
21232 | + } | |
21233 | +} | |
21234 | + | |
21235 | +void __init parse_memmapopt(char *p, char **from) | |
21236 | +{ | |
21237 | + unsigned long long start_at, mem_size; | |
21238 | + | |
21239 | + mem_size = memparse(p, from); | |
21240 | + p = *from; | |
21241 | + if (*p == '@') { | |
21242 | + start_at = memparse(p+1, from); | |
21243 | + add_memory_region(start_at, mem_size, E820_RAM); | |
21244 | + } else if (*p == '#') { | |
21245 | + start_at = memparse(p+1, from); | |
21246 | + add_memory_region(start_at, mem_size, E820_ACPI); | |
21247 | + } else if (*p == '$') { | |
21248 | + start_at = memparse(p+1, from); | |
21249 | + add_memory_region(start_at, mem_size, E820_RESERVED); | |
21250 | + } else { | |
21251 | + end_user_pfn = (mem_size >> PAGE_SHIFT); | |
21252 | + } | |
21253 | + p = *from; | |
21254 | +} | |
21255 | + | |
21256 | +unsigned long pci_mem_start = 0xaeedbabe; | |
21257 | +EXPORT_SYMBOL(pci_mem_start); | |
21258 | + | |
21259 | +/* | |
21260 | + * Search for the biggest gap in the low 32 bits of the e820 | |
21261 | + * memory space. We pass this space to PCI to assign MMIO resources | |
21262 | + * for hotplug or unconfigured devices in. | |
21263 | + * Hopefully the BIOS let enough space left. | |
21264 | + */ | |
21265 | +__init void e820_setup_gap(struct e820entry *e820, int nr_map) | |
21266 | +{ | |
21267 | + unsigned long gapstart, gapsize, round; | |
21268 | + unsigned long last; | |
21269 | + int i; | |
21270 | + int found = 0; | |
21271 | + | |
21272 | + last = 0x100000000ull; | |
21273 | + gapstart = 0x10000000; | |
21274 | + gapsize = 0x400000; | |
21275 | + i = nr_map; | |
21276 | + while (--i >= 0) { | |
21277 | + unsigned long long start = e820[i].addr; | |
21278 | + unsigned long long end = start + e820[i].size; | |
21279 | + | |
21280 | + /* | |
21281 | + * Since "last" is at most 4GB, we know we'll | |
21282 | + * fit in 32 bits if this condition is true | |
21283 | + */ | |
21284 | + if (last > end) { | |
21285 | + unsigned long gap = last - end; | |
21286 | + | |
21287 | + if (gap > gapsize) { | |
21288 | + gapsize = gap; | |
21289 | + gapstart = end; | |
21290 | + found = 1; | |
21291 | + } | |
21292 | + } | |
21293 | + if (start < last) | |
21294 | + last = start; | |
21295 | + } | |
21296 | + | |
21297 | + if (!found) { | |
21298 | + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; | |
21299 | + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" | |
21300 | + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); | |
21301 | + } | |
21302 | + | |
21303 | + /* | |
21304 | + * See how much we want to round up: start off with | |
21305 | + * rounding to the next 1MB area. | |
21306 | + */ | |
21307 | + round = 0x100000; | |
21308 | + while ((gapsize >> 4) > round) | |
21309 | + round += round; | |
21310 | + /* Fun with two's complement */ | |
21311 | + pci_mem_start = (gapstart + round) & -round; | |
21312 | + | |
21313 | + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | |
21314 | + pci_mem_start, gapstart, gapsize); | |
21315 | +} | |
21316 | Index: head-2008-11-25/arch/x86/kernel/early_printk-xen.c | |
21317 | =================================================================== | |
21318 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
21319 | +++ head-2008-11-25/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
21320 | @@ -0,0 +1,302 @@ | |
21321 | +#include <linux/console.h> | |
21322 | +#include <linux/kernel.h> | |
21323 | +#include <linux/init.h> | |
21324 | +#include <linux/string.h> | |
21325 | +#include <linux/screen_info.h> | |
21326 | +#include <asm/io.h> | |
21327 | +#include <asm/processor.h> | |
21328 | +#include <asm/fcntl.h> | |
21329 | + | |
21330 | +/* Simple VGA output */ | |
21331 | + | |
21332 | +#ifdef __i386__ | |
21333 | +#include <asm/setup.h> | |
21334 | +#define VGABASE (__ISA_IO_base + 0xb8000) | |
21335 | +#else | |
21336 | +#include <asm/bootsetup.h> | |
21337 | +#define VGABASE ((void __iomem *)0xffffffff800b8000UL) | |
21338 | +#endif | |
21339 | + | |
21340 | +#ifndef CONFIG_XEN | |
21341 | +static int max_ypos = 25, max_xpos = 80; | |
21342 | +static int current_ypos = 25, current_xpos = 0; | |
21343 | + | |
21344 | +static void early_vga_write(struct console *con, const char *str, unsigned n) | |
21345 | +{ | |
21346 | + char c; | |
21347 | + int i, k, j; | |
21348 | + | |
21349 | + while ((c = *str++) != '\0' && n-- > 0) { | |
21350 | + if (current_ypos >= max_ypos) { | |
21351 | + /* scroll 1 line up */ | |
21352 | + for (k = 1, j = 0; k < max_ypos; k++, j++) { | |
21353 | + for (i = 0; i < max_xpos; i++) { | |
21354 | + writew(readw(VGABASE+2*(max_xpos*k+i)), | |
21355 | + VGABASE + 2*(max_xpos*j + i)); | |
21356 | + } | |
21357 | + } | |
21358 | + for (i = 0; i < max_xpos; i++) | |
21359 | + writew(0x720, VGABASE + 2*(max_xpos*j + i)); | |
21360 | + current_ypos = max_ypos-1; | |
21361 | + } | |
21362 | + if (c == '\n') { | |
21363 | + current_xpos = 0; | |
21364 | + current_ypos++; | |
21365 | + } else if (c != '\r') { | |
21366 | + writew(((0x7 << 8) | (unsigned short) c), | |
21367 | + VGABASE + 2*(max_xpos*current_ypos + | |
21368 | + current_xpos++)); | |
21369 | + if (current_xpos >= max_xpos) { | |
21370 | + current_xpos = 0; | |
21371 | + current_ypos++; | |
21372 | + } | |
21373 | + } | |
21374 | + } | |
21375 | +} | |
21376 | + | |
21377 | +static struct console early_vga_console = { | |
21378 | + .name = "earlyvga", | |
21379 | + .write = early_vga_write, | |
21380 | + .flags = CON_PRINTBUFFER, | |
21381 | + .index = -1, | |
21382 | +}; | |
21383 | + | |
21384 | +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ | |
21385 | + | |
21386 | +static int early_serial_base = 0x3f8; /* ttyS0 */ | |
21387 | + | |
21388 | +#define XMTRDY 0x20 | |
21389 | + | |
21390 | +#define DLAB 0x80 | |
21391 | + | |
21392 | +#define TXR 0 /* Transmit register (WRITE) */ | |
21393 | +#define RXR 0 /* Receive register (READ) */ | |
21394 | +#define IER 1 /* Interrupt Enable */ | |
21395 | +#define IIR 2 /* Interrupt ID */ | |
21396 | +#define FCR 2 /* FIFO control */ | |
21397 | +#define LCR 3 /* Line control */ | |
21398 | +#define MCR 4 /* Modem control */ | |
21399 | +#define LSR 5 /* Line Status */ | |
21400 | +#define MSR 6 /* Modem Status */ | |
21401 | +#define DLL 0 /* Divisor Latch Low */ | |
21402 | +#define DLH 1 /* Divisor latch High */ | |
21403 | + | |
21404 | +static int early_serial_putc(unsigned char ch) | |
21405 | +{ | |
21406 | + unsigned timeout = 0xffff; | |
21407 | + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | |
21408 | + cpu_relax(); | |
21409 | + outb(ch, early_serial_base + TXR); | |
21410 | + return timeout ? 0 : -1; | |
21411 | +} | |
21412 | + | |
21413 | +static void early_serial_write(struct console *con, const char *s, unsigned n) | |
21414 | +{ | |
21415 | + while (*s && n-- > 0) { | |
21416 | + early_serial_putc(*s); | |
21417 | + if (*s == '\n') | |
21418 | + early_serial_putc('\r'); | |
21419 | + s++; | |
21420 | + } | |
21421 | +} | |
21422 | + | |
21423 | +#define DEFAULT_BAUD 9600 | |
21424 | + | |
21425 | +static __init void early_serial_init(char *s) | |
21426 | +{ | |
21427 | + unsigned char c; | |
21428 | + unsigned divisor; | |
21429 | + unsigned baud = DEFAULT_BAUD; | |
21430 | + char *e; | |
21431 | + | |
21432 | + if (*s == ',') | |
21433 | + ++s; | |
21434 | + | |
21435 | + if (*s) { | |
21436 | + unsigned port; | |
21437 | + if (!strncmp(s,"0x",2)) { | |
21438 | + early_serial_base = simple_strtoul(s, &e, 16); | |
21439 | + } else { | |
21440 | + static int bases[] = { 0x3f8, 0x2f8 }; | |
21441 | + | |
21442 | + if (!strncmp(s,"ttyS",4)) | |
21443 | + s += 4; | |
21444 | + port = simple_strtoul(s, &e, 10); | |
21445 | + if (port > 1 || s == e) | |
21446 | + port = 0; | |
21447 | + early_serial_base = bases[port]; | |
21448 | + } | |
21449 | + s += strcspn(s, ","); | |
21450 | + if (*s == ',') | |
21451 | + s++; | |
21452 | + } | |
21453 | + | |
21454 | + outb(0x3, early_serial_base + LCR); /* 8n1 */ | |
21455 | + outb(0, early_serial_base + IER); /* no interrupt */ | |
21456 | + outb(0, early_serial_base + FCR); /* no fifo */ | |
21457 | + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ | |
21458 | + | |
21459 | + if (*s) { | |
21460 | + baud = simple_strtoul(s, &e, 0); | |
21461 | + if (baud == 0 || s == e) | |
21462 | + baud = DEFAULT_BAUD; | |
21463 | + } | |
21464 | + | |
21465 | + divisor = 115200 / baud; | |
21466 | + c = inb(early_serial_base + LCR); | |
21467 | + outb(c | DLAB, early_serial_base + LCR); | |
21468 | + outb(divisor & 0xff, early_serial_base + DLL); | |
21469 | + outb((divisor >> 8) & 0xff, early_serial_base + DLH); | |
21470 | + outb(c & ~DLAB, early_serial_base + LCR); | |
21471 | +} | |
21472 | + | |
21473 | +#else /* CONFIG_XEN */ | |
21474 | + | |
21475 | +static void | |
21476 | +early_serial_write(struct console *con, const char *s, unsigned count) | |
21477 | +{ | |
21478 | + int n; | |
21479 | + | |
21480 | + while (count > 0) { | |
21481 | + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); | |
21482 | + if (n <= 0) | |
21483 | + break; | |
21484 | + count -= n; | |
21485 | + s += n; | |
21486 | + } | |
21487 | +} | |
21488 | + | |
21489 | +static __init void early_serial_init(char *s) | |
21490 | +{ | |
21491 | +} | |
21492 | + | |
21493 | +/* | |
21494 | + * No early VGA console on Xen, as we do not have convenient ISA-space | |
21495 | + * mappings. Someone should fix this for domain 0. For now, use fake serial. | |
21496 | + */ | |
21497 | +#define early_vga_console early_serial_console | |
21498 | + | |
21499 | +#endif | |
21500 | + | |
21501 | +static struct console early_serial_console = { | |
21502 | + .name = "earlyser", | |
21503 | + .write = early_serial_write, | |
21504 | + .flags = CON_PRINTBUFFER, | |
21505 | + .index = -1, | |
21506 | +}; | |
21507 | + | |
21508 | +/* Console interface to a host file on AMD's SimNow! */ | |
21509 | + | |
21510 | +static int simnow_fd; | |
21511 | + | |
21512 | +enum { | |
21513 | + MAGIC1 = 0xBACCD00A, | |
21514 | + MAGIC2 = 0xCA110000, | |
21515 | + XOPEN = 5, | |
21516 | + XWRITE = 4, | |
21517 | +}; | |
21518 | + | |
21519 | +static noinline long simnow(long cmd, long a, long b, long c) | |
21520 | +{ | |
21521 | + long ret; | |
21522 | + asm volatile("cpuid" : | |
21523 | + "=a" (ret) : | |
21524 | + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); | |
21525 | + return ret; | |
21526 | +} | |
21527 | + | |
21528 | +void __init simnow_init(char *str) | |
21529 | +{ | |
21530 | + char *fn = "klog"; | |
21531 | + if (*str == '=') | |
21532 | + fn = ++str; | |
21533 | + /* error ignored */ | |
21534 | + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); | |
21535 | +} | |
21536 | + | |
21537 | +static void simnow_write(struct console *con, const char *s, unsigned n) | |
21538 | +{ | |
21539 | + simnow(XWRITE, simnow_fd, (unsigned long)s, n); | |
21540 | +} | |
21541 | + | |
21542 | +static struct console simnow_console = { | |
21543 | + .name = "simnow", | |
21544 | + .write = simnow_write, | |
21545 | + .flags = CON_PRINTBUFFER, | |
21546 | + .index = -1, | |
21547 | +}; | |
21548 | + | |
21549 | +/* Direct interface for emergencies */ | |
21550 | +struct console *early_console = &early_vga_console; | |
21551 | +static int early_console_initialized = 0; | |
21552 | + | |
21553 | +void early_printk(const char *fmt, ...) | |
21554 | +{ | |
21555 | + char buf[512]; | |
21556 | + int n; | |
21557 | + va_list ap; | |
21558 | + | |
21559 | + va_start(ap,fmt); | |
21560 | + n = vscnprintf(buf,512,fmt,ap); | |
21561 | + early_console->write(early_console,buf,n); | |
21562 | + va_end(ap); | |
21563 | +} | |
21564 | + | |
21565 | +static int __initdata keep_early; | |
21566 | + | |
21567 | +int __init setup_early_printk(char *opt) | |
21568 | +{ | |
21569 | + char *space; | |
21570 | + char buf[256]; | |
21571 | + | |
21572 | + if (early_console_initialized) | |
21573 | + return 1; | |
21574 | + | |
21575 | + strlcpy(buf,opt,sizeof(buf)); | |
21576 | + space = strchr(buf, ' '); | |
21577 | + if (space) | |
21578 | + *space = 0; | |
21579 | + | |
21580 | + if (strstr(buf,"keep")) | |
21581 | + keep_early = 1; | |
21582 | + | |
21583 | + if (!strncmp(buf, "serial", 6)) { | |
21584 | + early_serial_init(buf + 6); | |
21585 | + early_console = &early_serial_console; | |
21586 | + } else if (!strncmp(buf, "ttyS", 4)) { | |
21587 | + early_serial_init(buf); | |
21588 | + early_console = &early_serial_console; | |
21589 | + } else if (!strncmp(buf, "vga", 3) | |
21590 | +#ifndef CONFIG_XEN | |
21591 | + && SCREEN_INFO.orig_video_isVGA == 1) { | |
21592 | + max_xpos = SCREEN_INFO.orig_video_cols; | |
21593 | + max_ypos = SCREEN_INFO.orig_video_lines; | |
21594 | + current_ypos = SCREEN_INFO.orig_y; | |
21595 | +#else | |
21596 | + || !strncmp(buf, "xen", 3)) { | |
21597 | +#endif | |
21598 | + early_console = &early_vga_console; | |
21599 | + } else if (!strncmp(buf, "simnow", 6)) { | |
21600 | + simnow_init(buf + 6); | |
21601 | + early_console = &simnow_console; | |
21602 | + keep_early = 1; | |
21603 | + } | |
21604 | + early_console_initialized = 1; | |
21605 | + register_console(early_console); | |
21606 | + return 0; | |
21607 | +} | |
21608 | + | |
21609 | +void __init disable_early_printk(void) | |
21610 | +{ | |
21611 | + if (!early_console_initialized || !early_console) | |
21612 | + return; | |
21613 | + if (!keep_early) { | |
21614 | + printk("disabling early console\n"); | |
21615 | + unregister_console(early_console); | |
21616 | + early_console_initialized = 0; | |
21617 | + } else { | |
21618 | + printk("keeping early console\n"); | |
21619 | + } | |
21620 | +} | |
21621 | + | |
21622 | +__setup("earlyprintk=", setup_early_printk); | |
21623 | Index: head-2008-11-25/arch/x86/kernel/entry_64-xen.S | |
21624 | =================================================================== | |
21625 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
21626 | +++ head-2008-11-25/arch/x86/kernel/entry_64-xen.S 2008-10-29 09:55:56.000000000 +0100 | |
21627 | @@ -0,0 +1,1322 @@ | |
21628 | +/* | |
21629 | + * linux/arch/x86_64/entry.S | |
21630 | + * | |
21631 | + * Copyright (C) 1991, 1992 Linus Torvalds | |
21632 | + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | |
21633 | + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | |
21634 | + * | |
21635 | + * $Id$ | |
21636 | + * | |
21637 | + * Jun Nakajima <jun.nakajima@intel.com> | |
21638 | + * Asit Mallick <asit.k.mallick@intel.com> | |
21639 | + * Modified for Xen | |
21640 | + */ | |
21641 | + | |
21642 | +/* | |
21643 | + * entry.S contains the system-call and fault low-level handling routines. | |
21644 | + * | |
21645 | + * NOTE: This code handles signal-recognition, which happens every time | |
21646 | + * after an interrupt and after each system call. | |
21647 | + * | |
21648 | + * Normal syscalls and interrupts don't save a full stack frame, this is | |
21649 | + * only done for syscall tracing, signals or fork/exec et.al. | |
21650 | + * | |
21651 | + * A note on terminology: | |
21652 | + * - top of stack: Architecture defined interrupt frame from SS to RIP | |
21653 | + * at the top of the kernel process stack. | |
21654 | + * - partial stack frame: partially saved registers upto R11. | |
21655 | + * - full stack frame: Like partial stack frame, but all register saved. | |
21656 | + * | |
21657 | + * TODO: | |
21658 | + * - schedule it carefully for the final hardware. | |
21659 | + */ | |
21660 | + | |
21661 | +#define ASSEMBLY 1 | |
21662 | +#include <linux/linkage.h> | |
21663 | +#include <asm/segment.h> | |
21664 | +#include <asm/smp.h> | |
21665 | +#include <asm/cache.h> | |
21666 | +#include <asm/errno.h> | |
21667 | +#include <asm/dwarf2.h> | |
21668 | +#include <asm/calling.h> | |
21669 | +#include <asm/asm-offsets.h> | |
21670 | +#include <asm/msr.h> | |
21671 | +#include <asm/unistd.h> | |
21672 | +#include <asm/thread_info.h> | |
21673 | +#include <asm/hw_irq.h> | |
21674 | +#include <asm/page.h> | |
21675 | +#include <asm/irqflags.h> | |
21676 | +#include <asm/errno.h> | |
21677 | +#include <xen/interface/arch-x86_64.h> | |
21678 | +#include <xen/interface/features.h> | |
21679 | + | |
21680 | +#include "xen_entry.S" | |
21681 | + | |
21682 | + .code64 | |
21683 | + | |
21684 | +#ifndef CONFIG_PREEMPT | |
21685 | +#define retint_kernel retint_restore_args | |
21686 | +#endif | |
21687 | + | |
21688 | + | |
21689 | +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET | |
21690 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
21691 | + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | |
21692 | + jnc 1f | |
21693 | + TRACE_IRQS_ON | |
21694 | +1: | |
21695 | +#endif | |
21696 | +.endm | |
21697 | + | |
21698 | +NMI_MASK = 0x80000000 | |
21699 | + | |
21700 | +/* | |
21701 | + * C code is not supposed to know about undefined top of stack. Every time | |
21702 | + * a C function with an pt_regs argument is called from the SYSCALL based | |
21703 | + * fast path FIXUP_TOP_OF_STACK is needed. | |
21704 | + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | |
21705 | + * manipulation. | |
21706 | + */ | |
21707 | + | |
21708 | + /* %rsp:at FRAMEEND */ | |
21709 | + .macro FIXUP_TOP_OF_STACK tmp | |
21710 | + movq $__USER_CS,CS(%rsp) | |
21711 | + movq $-1,RCX(%rsp) | |
21712 | + .endm | |
21713 | + | |
21714 | + .macro RESTORE_TOP_OF_STACK tmp,offset=0 | |
21715 | + .endm | |
21716 | + | |
21717 | + .macro FAKE_STACK_FRAME child_rip | |
21718 | + /* push in order ss, rsp, eflags, cs, rip */ | |
21719 | + xorl %eax, %eax | |
21720 | + pushq %rax /* ss */ | |
21721 | + CFI_ADJUST_CFA_OFFSET 8 | |
21722 | + /*CFI_REL_OFFSET ss,0*/ | |
21723 | + pushq %rax /* rsp */ | |
21724 | + CFI_ADJUST_CFA_OFFSET 8 | |
21725 | + CFI_REL_OFFSET rsp,0 | |
21726 | + pushq $(1<<9) /* eflags - interrupts on */ | |
21727 | + CFI_ADJUST_CFA_OFFSET 8 | |
21728 | + /*CFI_REL_OFFSET rflags,0*/ | |
21729 | + pushq $__KERNEL_CS /* cs */ | |
21730 | + CFI_ADJUST_CFA_OFFSET 8 | |
21731 | + /*CFI_REL_OFFSET cs,0*/ | |
21732 | + pushq \child_rip /* rip */ | |
21733 | + CFI_ADJUST_CFA_OFFSET 8 | |
21734 | + CFI_REL_OFFSET rip,0 | |
21735 | + pushq %rax /* orig rax */ | |
21736 | + CFI_ADJUST_CFA_OFFSET 8 | |
21737 | + .endm | |
21738 | + | |
21739 | + .macro UNFAKE_STACK_FRAME | |
21740 | + addq $8*6, %rsp | |
21741 | + CFI_ADJUST_CFA_OFFSET -(6*8) | |
21742 | + .endm | |
21743 | + | |
21744 | + .macro CFI_DEFAULT_STACK start=1,adj=0 | |
21745 | + .if \start | |
21746 | + CFI_STARTPROC simple | |
21747 | + CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET | |
21748 | + .else | |
21749 | + CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET | |
21750 | + .endif | |
21751 | + .if \adj == 0 | |
21752 | + CFI_REL_OFFSET r15,R15 | |
21753 | + CFI_REL_OFFSET r14,R14 | |
21754 | + CFI_REL_OFFSET r13,R13 | |
21755 | + CFI_REL_OFFSET r12,R12 | |
21756 | + CFI_REL_OFFSET rbp,RBP | |
21757 | + CFI_REL_OFFSET rbx,RBX | |
21758 | + .endif | |
21759 | + CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET | |
21760 | + CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET | |
21761 | + CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET | |
21762 | + CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET | |
21763 | + CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET | |
21764 | + CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET | |
21765 | + CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET | |
21766 | + CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET | |
21767 | + CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET | |
21768 | + CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET | |
21769 | + /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/ | |
21770 | + /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/ | |
21771 | + CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET | |
21772 | + /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/ | |
21773 | + .endm | |
21774 | + | |
21775 | + /* | |
21776 | + * Must be consistent with the definition in arch-x86/xen-x86_64.h: | |
21777 | + * struct iret_context { | |
21778 | + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; | |
21779 | + * }; | |
21780 | + * with rax, r11, and rcx being taken care of in the hypercall stub. | |
21781 | + */ | |
21782 | + .macro HYPERVISOR_IRET flag | |
21783 | + testb $3,1*8(%rsp) | |
21784 | + jnz 2f | |
21785 | + testl $NMI_MASK,2*8(%rsp) | |
21786 | + jnz 2f | |
21787 | + | |
21788 | + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) | |
21789 | + jne 1f | |
21790 | + | |
21791 | + /* Direct iret to kernel space. Correct CS and SS. */ | |
21792 | + orl $3,1*8(%rsp) | |
21793 | + orl $3,4*8(%rsp) | |
21794 | +1: iretq | |
21795 | + | |
21796 | +2: /* Slow iret via hypervisor. */ | |
21797 | + andl $~NMI_MASK, 2*8(%rsp) | |
21798 | + pushq $\flag | |
21799 | + jmp hypercall_page + (__HYPERVISOR_iret * 32) | |
21800 | + .endm | |
21801 | + | |
21802 | +/* | |
21803 | + * A newly forked process directly context switches into this. | |
21804 | + */ | |
21805 | +/* rdi: prev */ | |
21806 | +ENTRY(ret_from_fork) | |
21807 | + CFI_DEFAULT_STACK | |
21808 | + push kernel_eflags(%rip) | |
21809 | + CFI_ADJUST_CFA_OFFSET 4 | |
21810 | + popf # reset kernel eflags | |
21811 | + CFI_ADJUST_CFA_OFFSET -4 | |
21812 | + call schedule_tail | |
21813 | + GET_THREAD_INFO(%rcx) | |
21814 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | |
21815 | + jnz rff_trace | |
21816 | +rff_action: | |
21817 | + RESTORE_REST | |
21818 | + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | |
21819 | + je int_ret_from_sys_call | |
21820 | + testl $_TIF_IA32,threadinfo_flags(%rcx) | |
21821 | + jnz int_ret_from_sys_call | |
21822 | + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | |
21823 | + jmp ret_from_sys_call | |
21824 | +rff_trace: | |
21825 | + movq %rsp,%rdi | |
21826 | + call syscall_trace_leave | |
21827 | + GET_THREAD_INFO(%rcx) | |
21828 | + jmp rff_action | |
21829 | + CFI_ENDPROC | |
21830 | +END(ret_from_fork) | |
21831 | + | |
21832 | +/* | |
21833 | + * initial frame state for interrupts and exceptions | |
21834 | + */ | |
21835 | + .macro _frame ref | |
21836 | + CFI_STARTPROC simple | |
21837 | + CFI_DEF_CFA rsp,SS+8-\ref | |
21838 | + /*CFI_REL_OFFSET ss,SS-\ref*/ | |
21839 | + CFI_REL_OFFSET rsp,RSP-\ref | |
21840 | + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ | |
21841 | + /*CFI_REL_OFFSET cs,CS-\ref*/ | |
21842 | + CFI_REL_OFFSET rip,RIP-\ref | |
21843 | + .endm | |
21844 | + | |
21845 | +/* | |
21846 | + * System call entry. Upto 6 arguments in registers are supported. | |
21847 | + * | |
21848 | + * SYSCALL does not save anything on the stack and does not change the | |
21849 | + * stack pointer. | |
21850 | + */ | |
21851 | + | |
21852 | +/* | |
21853 | + * Register setup: | |
21854 | + * rax system call number | |
21855 | + * rdi arg0 | |
21856 | + * rcx return address for syscall/sysret, C arg3 | |
21857 | + * rsi arg1 | |
21858 | + * rdx arg2 | |
21859 | + * r10 arg3 (--> moved to rcx for C) | |
21860 | + * r8 arg4 | |
21861 | + * r9 arg5 | |
21862 | + * r11 eflags for syscall/sysret, temporary for C | |
21863 | + * r12-r15,rbp,rbx saved by C code, not touched. | |
21864 | + * | |
21865 | + * Interrupts are enabled on entry. | |
21866 | + * Only called from user space. | |
21867 | + * | |
21868 | + * XXX if we had a free scratch register we could save the RSP into the stack frame | |
21869 | + * and report it properly in ps. Unfortunately we haven't. | |
21870 | + * | |
21871 | + * When user can change the frames always force IRET. That is because | |
21872 | + * it deals with uncanonical addresses better. SYSRET has trouble | |
21873 | + * with them due to bugs in both AMD and Intel CPUs. | |
21874 | + */ | |
21875 | + | |
21876 | +ENTRY(system_call) | |
21877 | + _frame (RIP-0x10) | |
21878 | + SAVE_ARGS -8,0 | |
21879 | + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | |
21880 | + GET_THREAD_INFO(%rcx) | |
21881 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | |
21882 | + CFI_REMEMBER_STATE | |
21883 | + jnz tracesys | |
21884 | + cmpq $__NR_syscall_max,%rax | |
21885 | + ja badsys | |
21886 | + movq %r10,%rcx | |
21887 | + call *sys_call_table(,%rax,8) # XXX: rip relative | |
21888 | + movq %rax,RAX-ARGOFFSET(%rsp) | |
21889 | +/* | |
21890 | + * Syscall return path ending with SYSRET (fast path) | |
21891 | + * Has incomplete stack frame and undefined top of stack. | |
21892 | + */ | |
21893 | + .globl ret_from_sys_call | |
21894 | +ret_from_sys_call: | |
21895 | + movl $_TIF_ALLWORK_MASK,%edi | |
21896 | + /* edi: flagmask */ | |
21897 | +sysret_check: | |
21898 | + GET_THREAD_INFO(%rcx) | |
21899 | + XEN_BLOCK_EVENTS(%rsi) | |
21900 | + TRACE_IRQS_OFF | |
21901 | + movl threadinfo_flags(%rcx),%edx | |
21902 | + andl %edi,%edx | |
21903 | + CFI_REMEMBER_STATE | |
21904 | + jnz sysret_careful | |
21905 | + /* | |
21906 | + * sysretq will re-enable interrupts: | |
21907 | + */ | |
21908 | + TRACE_IRQS_ON | |
21909 | + XEN_UNBLOCK_EVENTS(%rsi) | |
21910 | + RESTORE_ARGS 0,8,0 | |
21911 | + HYPERVISOR_IRET VGCF_IN_SYSCALL | |
21912 | + | |
21913 | + /* Handle reschedules */ | |
21914 | + /* edx: work, edi: workmask */ | |
21915 | +sysret_careful: | |
21916 | + CFI_RESTORE_STATE | |
21917 | + bt $TIF_NEED_RESCHED,%edx | |
21918 | + jnc sysret_signal | |
21919 | + TRACE_IRQS_ON | |
21920 | + XEN_UNBLOCK_EVENTS(%rsi) | |
21921 | + pushq %rdi | |
21922 | + CFI_ADJUST_CFA_OFFSET 8 | |
21923 | + call schedule | |
21924 | + popq %rdi | |
21925 | + CFI_ADJUST_CFA_OFFSET -8 | |
21926 | + jmp sysret_check | |
21927 | + | |
21928 | + /* Handle a signal */ | |
21929 | +sysret_signal: | |
21930 | + TRACE_IRQS_ON | |
21931 | +/* sti */ | |
21932 | + XEN_UNBLOCK_EVENTS(%rsi) | |
21933 | + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | |
21934 | + jz 1f | |
21935 | + | |
21936 | + /* Really a signal */ | |
21937 | + /* edx: work flags (arg3) */ | |
21938 | + leaq do_notify_resume(%rip),%rax | |
21939 | + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | |
21940 | + xorl %esi,%esi # oldset -> arg2 | |
21941 | + call ptregscall_common | |
21942 | +1: movl $_TIF_NEED_RESCHED,%edi | |
21943 | + /* Use IRET because user could have changed frame. This | |
21944 | + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | |
21945 | + XEN_BLOCK_EVENTS(%rsi) | |
21946 | + TRACE_IRQS_OFF | |
21947 | + jmp int_with_check | |
21948 | + | |
21949 | +badsys: | |
21950 | + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | |
21951 | + jmp ret_from_sys_call | |
21952 | + | |
21953 | + /* Do syscall tracing */ | |
21954 | +tracesys: | |
21955 | + CFI_RESTORE_STATE | |
21956 | + SAVE_REST | |
21957 | + movq $-ENOSYS,RAX(%rsp) | |
21958 | + FIXUP_TOP_OF_STACK %rdi | |
21959 | + movq %rsp,%rdi | |
21960 | + call syscall_trace_enter | |
21961 | + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | |
21962 | + RESTORE_REST | |
21963 | + cmpq $__NR_syscall_max,%rax | |
21964 | + ja 1f | |
21965 | + movq %r10,%rcx /* fixup for C */ | |
21966 | + call *sys_call_table(,%rax,8) | |
21967 | +1: movq %rax,RAX-ARGOFFSET(%rsp) | |
21968 | + /* Use IRET because user could have changed frame */ | |
21969 | + jmp int_ret_from_sys_call | |
21970 | + CFI_ENDPROC | |
21971 | +END(system_call) | |
21972 | + | |
21973 | +/* | |
21974 | + * Syscall return path ending with IRET. | |
21975 | + * Has correct top of stack, but partial stack frame. | |
21976 | + */ | |
21977 | +ENTRY(int_ret_from_sys_call) | |
21978 | + CFI_STARTPROC simple | |
21979 | + CFI_DEF_CFA rsp,SS+8-ARGOFFSET | |
21980 | + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ | |
21981 | + CFI_REL_OFFSET rsp,RSP-ARGOFFSET | |
21982 | + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | |
21983 | + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/ | |
21984 | + CFI_REL_OFFSET rip,RIP-ARGOFFSET | |
21985 | + CFI_REL_OFFSET rdx,RDX-ARGOFFSET | |
21986 | + CFI_REL_OFFSET rcx,RCX-ARGOFFSET | |
21987 | + CFI_REL_OFFSET rax,RAX-ARGOFFSET | |
21988 | + CFI_REL_OFFSET rdi,RDI-ARGOFFSET | |
21989 | + CFI_REL_OFFSET rsi,RSI-ARGOFFSET | |
21990 | + CFI_REL_OFFSET r8,R8-ARGOFFSET | |
21991 | + CFI_REL_OFFSET r9,R9-ARGOFFSET | |
21992 | + CFI_REL_OFFSET r10,R10-ARGOFFSET | |
21993 | + CFI_REL_OFFSET r11,R11-ARGOFFSET | |
21994 | + XEN_BLOCK_EVENTS(%rsi) | |
21995 | + TRACE_IRQS_OFF | |
21996 | + testb $3,CS-ARGOFFSET(%rsp) | |
21997 | + jnz 1f | |
21998 | + /* Need to set the proper %ss (not NULL) for ring 3 iretq */ | |
21999 | + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) | |
22000 | + jmp retint_restore_args # retrun from ring3 kernel | |
22001 | +1: | |
22002 | + movl $_TIF_ALLWORK_MASK,%edi | |
22003 | + /* edi: mask to check */ | |
22004 | +int_with_check: | |
22005 | + GET_THREAD_INFO(%rcx) | |
22006 | + movl threadinfo_flags(%rcx),%edx | |
22007 | + andl %edi,%edx | |
22008 | + jnz int_careful | |
22009 | + andl $~TS_COMPAT,threadinfo_status(%rcx) | |
22010 | + jmp retint_restore_args | |
22011 | + | |
22012 | + /* Either reschedule or signal or syscall exit tracking needed. */ | |
22013 | + /* First do a reschedule test. */ | |
22014 | + /* edx: work, edi: workmask */ | |
22015 | +int_careful: | |
22016 | + bt $TIF_NEED_RESCHED,%edx | |
22017 | + jnc int_very_careful | |
22018 | + TRACE_IRQS_ON | |
22019 | +/* sti */ | |
22020 | + XEN_UNBLOCK_EVENTS(%rsi) | |
22021 | + pushq %rdi | |
22022 | + CFI_ADJUST_CFA_OFFSET 8 | |
22023 | + call schedule | |
22024 | + popq %rdi | |
22025 | + CFI_ADJUST_CFA_OFFSET -8 | |
22026 | + XEN_BLOCK_EVENTS(%rsi) | |
22027 | + TRACE_IRQS_OFF | |
22028 | + jmp int_with_check | |
22029 | + | |
22030 | + /* handle signals and tracing -- both require a full stack frame */ | |
22031 | +int_very_careful: | |
22032 | + TRACE_IRQS_ON | |
22033 | +/* sti */ | |
22034 | + XEN_UNBLOCK_EVENTS(%rsi) | |
22035 | + SAVE_REST | |
22036 | + /* Check for syscall exit trace */ | |
22037 | + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | |
22038 | + jz int_signal | |
22039 | + pushq %rdi | |
22040 | + CFI_ADJUST_CFA_OFFSET 8 | |
22041 | + leaq 8(%rsp),%rdi # &ptregs -> arg1 | |
22042 | + call syscall_trace_leave | |
22043 | + popq %rdi | |
22044 | + CFI_ADJUST_CFA_OFFSET -8 | |
22045 | + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi | |
22046 | + XEN_BLOCK_EVENTS(%rsi) | |
22047 | + TRACE_IRQS_OFF | |
22048 | + jmp int_restore_rest | |
22049 | + | |
22050 | +int_signal: | |
22051 | + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx | |
22052 | + jz 1f | |
22053 | + movq %rsp,%rdi # &ptregs -> arg1 | |
22054 | + xorl %esi,%esi # oldset -> arg2 | |
22055 | + call do_notify_resume | |
22056 | +1: movl $_TIF_NEED_RESCHED,%edi | |
22057 | +int_restore_rest: | |
22058 | + RESTORE_REST | |
22059 | + XEN_BLOCK_EVENTS(%rsi) | |
22060 | + TRACE_IRQS_OFF | |
22061 | + jmp int_with_check | |
22062 | + CFI_ENDPROC | |
22063 | +END(int_ret_from_sys_call) | |
22064 | + | |
22065 | +/* | |
22066 | + * Certain special system calls that need to save a complete full stack frame. | |
22067 | + */ | |
22068 | + | |
22069 | + .macro PTREGSCALL label,func,arg | |
22070 | + .globl \label | |
22071 | +\label: | |
22072 | + leaq \func(%rip),%rax | |
22073 | + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | |
22074 | + jmp ptregscall_common | |
22075 | +END(\label) | |
22076 | + .endm | |
22077 | + | |
22078 | + CFI_STARTPROC | |
22079 | + | |
22080 | + PTREGSCALL stub_clone, sys_clone, %r8 | |
22081 | + PTREGSCALL stub_fork, sys_fork, %rdi | |
22082 | + PTREGSCALL stub_vfork, sys_vfork, %rdi | |
22083 | + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | |
22084 | + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | |
22085 | + PTREGSCALL stub_iopl, sys_iopl, %rsi | |
22086 | + | |
22087 | +ENTRY(ptregscall_common) | |
22088 | + popq %r11 | |
22089 | + CFI_ADJUST_CFA_OFFSET -8 | |
22090 | + CFI_REGISTER rip, r11 | |
22091 | + SAVE_REST | |
22092 | + movq %r11, %r15 | |
22093 | + CFI_REGISTER rip, r15 | |
22094 | + FIXUP_TOP_OF_STACK %r11 | |
22095 | + call *%rax | |
22096 | + RESTORE_TOP_OF_STACK %r11 | |
22097 | + movq %r15, %r11 | |
22098 | + CFI_REGISTER rip, r11 | |
22099 | + RESTORE_REST | |
22100 | + pushq %r11 | |
22101 | + CFI_ADJUST_CFA_OFFSET 8 | |
22102 | + CFI_REL_OFFSET rip, 0 | |
22103 | + ret | |
22104 | + CFI_ENDPROC | |
22105 | +END(ptregscall_common) | |
22106 | + | |
22107 | +ENTRY(stub_execve) | |
22108 | + CFI_STARTPROC | |
22109 | + popq %r11 | |
22110 | + CFI_ADJUST_CFA_OFFSET -8 | |
22111 | + CFI_REGISTER rip, r11 | |
22112 | + SAVE_REST | |
22113 | + FIXUP_TOP_OF_STACK %r11 | |
22114 | + call sys_execve | |
22115 | + RESTORE_TOP_OF_STACK %r11 | |
22116 | + movq %rax,RAX(%rsp) | |
22117 | + RESTORE_REST | |
22118 | + jmp int_ret_from_sys_call | |
22119 | + CFI_ENDPROC | |
22120 | +END(stub_execve) | |
22121 | + | |
22122 | +/* | |
22123 | + * sigreturn is special because it needs to restore all registers on return. | |
22124 | + * This cannot be done with SYSRET, so use the IRET return path instead. | |
22125 | + */ | |
22126 | +ENTRY(stub_rt_sigreturn) | |
22127 | + CFI_STARTPROC | |
22128 | + addq $8, %rsp | |
22129 | + CFI_ADJUST_CFA_OFFSET -8 | |
22130 | + SAVE_REST | |
22131 | + movq %rsp,%rdi | |
22132 | + FIXUP_TOP_OF_STACK %r11 | |
22133 | + call sys_rt_sigreturn | |
22134 | + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | |
22135 | + RESTORE_REST | |
22136 | + jmp int_ret_from_sys_call | |
22137 | + CFI_ENDPROC | |
22138 | +END(stub_rt_sigreturn) | |
22139 | + | |
22140 | +/* initial frame state for interrupts (and exceptions without error code) */ | |
22141 | +#define INTR_FRAME _frame (RIP-0x10); \ | |
22142 | + CFI_REL_OFFSET rcx,0; \ | |
22143 | + CFI_REL_OFFSET r11,8 | |
22144 | + | |
22145 | +/* initial frame state for exceptions with error code (and interrupts with | |
22146 | + vector already pushed) */ | |
22147 | +#define XCPT_FRAME _frame (RIP-0x18); \ | |
22148 | + CFI_REL_OFFSET rcx,0; \ | |
22149 | + CFI_REL_OFFSET r11,8 | |
22150 | + | |
22151 | +/* | |
22152 | + * Interrupt exit. | |
22153 | + * | |
22154 | + */ | |
22155 | + | |
22156 | +retint_check: | |
22157 | + CFI_DEFAULT_STACK adj=1 | |
22158 | + movl threadinfo_flags(%rcx),%edx | |
22159 | + andl %edi,%edx | |
22160 | + CFI_REMEMBER_STATE | |
22161 | + jnz retint_careful | |
22162 | +retint_restore_args: | |
22163 | + movl EFLAGS-REST_SKIP(%rsp), %eax | |
22164 | + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF | |
22165 | + XEN_GET_VCPU_INFO(%rsi) | |
22166 | + andb evtchn_upcall_mask(%rsi),%al | |
22167 | + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask | |
22168 | + jnz restore_all_enable_events # != 0 => enable event delivery | |
22169 | + XEN_PUT_VCPU_INFO(%rsi) | |
22170 | + | |
22171 | + RESTORE_ARGS 0,8,0 | |
22172 | + HYPERVISOR_IRET 0 | |
22173 | + | |
22174 | + /* edi: workmask, edx: work */ | |
22175 | +retint_careful: | |
22176 | + CFI_RESTORE_STATE | |
22177 | + bt $TIF_NEED_RESCHED,%edx | |
22178 | + jnc retint_signal | |
22179 | + TRACE_IRQS_ON | |
22180 | + XEN_UNBLOCK_EVENTS(%rsi) | |
22181 | +/* sti */ | |
22182 | + pushq %rdi | |
22183 | + CFI_ADJUST_CFA_OFFSET 8 | |
22184 | + call schedule | |
22185 | + popq %rdi | |
22186 | + CFI_ADJUST_CFA_OFFSET -8 | |
22187 | + GET_THREAD_INFO(%rcx) | |
22188 | + XEN_BLOCK_EVENTS(%rsi) | |
22189 | +/* cli */ | |
22190 | + TRACE_IRQS_OFF | |
22191 | + jmp retint_check | |
22192 | + | |
22193 | +retint_signal: | |
22194 | + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | |
22195 | + jz retint_restore_args | |
22196 | + TRACE_IRQS_ON | |
22197 | + XEN_UNBLOCK_EVENTS(%rsi) | |
22198 | + SAVE_REST | |
22199 | + movq $-1,ORIG_RAX(%rsp) | |
22200 | + xorl %esi,%esi # oldset | |
22201 | + movq %rsp,%rdi # &pt_regs | |
22202 | + call do_notify_resume | |
22203 | + RESTORE_REST | |
22204 | + XEN_BLOCK_EVENTS(%rsi) | |
22205 | + TRACE_IRQS_OFF | |
22206 | + movl $_TIF_NEED_RESCHED,%edi | |
22207 | + GET_THREAD_INFO(%rcx) | |
22208 | + jmp retint_check | |
22209 | + | |
22210 | +#ifdef CONFIG_PREEMPT | |
22211 | + /* Returning to kernel space. Check if we need preemption */ | |
22212 | + /* rcx: threadinfo. interrupts off. */ | |
22213 | + .p2align | |
22214 | +retint_kernel: | |
22215 | + cmpl $0,threadinfo_preempt_count(%rcx) | |
22216 | + jnz retint_restore_args | |
22217 | + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | |
22218 | + jnc retint_restore_args | |
22219 | + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | |
22220 | + jnc retint_restore_args | |
22221 | + call preempt_schedule_irq | |
22222 | + jmp retint_kernel /* check again */ | |
22223 | +#endif | |
22224 | + | |
22225 | + CFI_ENDPROC | |
22226 | +END(retint_check) | |
22227 | + | |
22228 | +#ifndef CONFIG_XEN | |
22229 | +/* | |
22230 | + * APIC interrupts. | |
22231 | + */ | |
22232 | + .macro apicinterrupt num,func | |
22233 | + INTR_FRAME | |
22234 | + pushq $~(\num) | |
22235 | + CFI_ADJUST_CFA_OFFSET 8 | |
22236 | + interrupt \func | |
22237 | + jmp error_entry | |
22238 | + CFI_ENDPROC | |
22239 | + .endm | |
22240 | + | |
22241 | +ENTRY(thermal_interrupt) | |
22242 | + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt | |
22243 | +END(thermal_interrupt) | |
22244 | + | |
22245 | +ENTRY(threshold_interrupt) | |
22246 | + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt | |
22247 | +END(threshold_interrupt) | |
22248 | + | |
22249 | +#ifdef CONFIG_SMP | |
22250 | +ENTRY(reschedule_interrupt) | |
22251 | + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt | |
22252 | +END(reschedule_interrupt) | |
22253 | + | |
22254 | + .macro INVALIDATE_ENTRY num | |
22255 | +ENTRY(invalidate_interrupt\num) | |
22256 | + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt | |
22257 | +END(invalidate_interrupt\num) | |
22258 | + .endm | |
22259 | + | |
22260 | + INVALIDATE_ENTRY 0 | |
22261 | + INVALIDATE_ENTRY 1 | |
22262 | + INVALIDATE_ENTRY 2 | |
22263 | + INVALIDATE_ENTRY 3 | |
22264 | + INVALIDATE_ENTRY 4 | |
22265 | + INVALIDATE_ENTRY 5 | |
22266 | + INVALIDATE_ENTRY 6 | |
22267 | + INVALIDATE_ENTRY 7 | |
22268 | + | |
22269 | +ENTRY(call_function_interrupt) | |
22270 | + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | |
22271 | +END(call_function_interrupt) | |
22272 | +#endif | |
22273 | + | |
22274 | +#ifdef CONFIG_X86_LOCAL_APIC | |
22275 | +ENTRY(apic_timer_interrupt) | |
22276 | + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | |
22277 | +END(apic_timer_interrupt) | |
22278 | + | |
22279 | +ENTRY(error_interrupt) | |
22280 | + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | |
22281 | +END(error_interrupt) | |
22282 | + | |
22283 | +ENTRY(spurious_interrupt) | |
22284 | + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt | |
22285 | +END(spurious_interrupt) | |
22286 | +#endif | |
22287 | +#endif /* !CONFIG_XEN */ | |
22288 | + | |
22289 | +/* | |
22290 | + * Exception entry points. | |
22291 | + */ | |
22292 | + .macro zeroentry sym | |
22293 | + INTR_FRAME | |
22294 | + movq (%rsp),%rcx | |
22295 | + CFI_RESTORE rcx | |
22296 | + movq 8(%rsp),%r11 | |
22297 | + CFI_RESTORE r11 | |
22298 | + addq $0x10,%rsp /* skip rcx and r11 */ | |
22299 | + CFI_ADJUST_CFA_OFFSET -0x10 | |
22300 | + pushq $0 /* push error code/oldrax */ | |
22301 | + CFI_ADJUST_CFA_OFFSET 8 | |
22302 | + pushq %rax /* push real oldrax to the rdi slot */ | |
22303 | + CFI_ADJUST_CFA_OFFSET 8 | |
22304 | + CFI_REL_OFFSET rax,0 | |
22305 | + leaq \sym(%rip),%rax | |
22306 | + jmp error_entry | |
22307 | + CFI_ENDPROC | |
22308 | + .endm | |
22309 | + | |
22310 | + .macro errorentry sym | |
22311 | + XCPT_FRAME | |
22312 | + movq (%rsp),%rcx | |
22313 | + CFI_RESTORE rcx | |
22314 | + movq 8(%rsp),%r11 | |
22315 | + CFI_RESTORE r11 | |
22316 | + addq $0x10,%rsp /* rsp points to the error code */ | |
22317 | + CFI_ADJUST_CFA_OFFSET -0x10 | |
22318 | + pushq %rax | |
22319 | + CFI_ADJUST_CFA_OFFSET 8 | |
22320 | + CFI_REL_OFFSET rax,0 | |
22321 | + leaq \sym(%rip),%rax | |
22322 | + jmp error_entry | |
22323 | + CFI_ENDPROC | |
22324 | + .endm | |
22325 | + | |
22326 | +#if 0 /* not XEN */ | |
22327 | + /* error code is on the stack already */ | |
22328 | + /* handle NMI like exceptions that can happen everywhere */ | |
22329 | + .macro paranoidentry sym, ist=0, irqtrace=1 | |
22330 | + movq (%rsp),%rcx | |
22331 | + movq 8(%rsp),%r11 | |
22332 | + addq $0x10,%rsp /* skip rcx and r11 */ | |
22333 | + SAVE_ALL | |
22334 | + cld | |
22335 | +#if 0 /* not XEN */ | |
22336 | + movl $1,%ebx | |
22337 | + movl $MSR_GS_BASE,%ecx | |
22338 | + rdmsr | |
22339 | + testl %edx,%edx | |
22340 | + js 1f | |
22341 | + swapgs | |
22342 | + xorl %ebx,%ebx | |
22343 | +1: | |
22344 | +#endif | |
22345 | + .if \ist | |
22346 | + movq %gs:pda_data_offset, %rbp | |
22347 | + .endif | |
22348 | + movq %rsp,%rdi | |
22349 | + movq ORIG_RAX(%rsp),%rsi | |
22350 | + movq $-1,ORIG_RAX(%rsp) | |
22351 | + .if \ist | |
22352 | + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | |
22353 | + .endif | |
22354 | + call \sym | |
22355 | + .if \ist | |
22356 | + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | |
22357 | + .endif | |
22358 | +/* cli */ | |
22359 | + XEN_BLOCK_EVENTS(%rsi) | |
22360 | + .if \irqtrace | |
22361 | + TRACE_IRQS_OFF | |
22362 | + .endif | |
22363 | + .endm | |
22364 | + | |
22365 | + /* | |
22366 | + * "Paranoid" exit path from exception stack. | |
22367 | + * Paranoid because this is used by NMIs and cannot take | |
22368 | + * any kernel state for granted. | |
22369 | + * We don't do kernel preemption checks here, because only | |
22370 | + * NMI should be common and it does not enable IRQs and | |
22371 | + * cannot get reschedule ticks. | |
22372 | + * | |
22373 | + * "trace" is 0 for the NMI handler only, because irq-tracing | |
22374 | + * is fundamentally NMI-unsafe. (we cannot change the soft and | |
22375 | + * hard flags at once, atomically) | |
22376 | + */ | |
22377 | + .macro paranoidexit trace=1 | |
22378 | + /* ebx: no swapgs flag */ | |
22379 | +paranoid_exit\trace: | |
22380 | + testl %ebx,%ebx /* swapgs needed? */ | |
22381 | + jnz paranoid_restore\trace | |
22382 | + testl $3,CS(%rsp) | |
22383 | + jnz paranoid_userspace\trace | |
22384 | +paranoid_swapgs\trace: | |
22385 | + TRACE_IRQS_IRETQ 0 | |
22386 | + swapgs | |
22387 | +paranoid_restore\trace: | |
22388 | + RESTORE_ALL 8 | |
22389 | + iretq | |
22390 | +paranoid_userspace\trace: | |
22391 | + GET_THREAD_INFO(%rcx) | |
22392 | + movl threadinfo_flags(%rcx),%ebx | |
22393 | + andl $_TIF_WORK_MASK,%ebx | |
22394 | + jz paranoid_swapgs\trace | |
22395 | + movq %rsp,%rdi /* &pt_regs */ | |
22396 | + call sync_regs | |
22397 | + movq %rax,%rsp /* switch stack for scheduling */ | |
22398 | + testl $_TIF_NEED_RESCHED,%ebx | |
22399 | + jnz paranoid_schedule\trace | |
22400 | + movl %ebx,%edx /* arg3: thread flags */ | |
22401 | + .if \trace | |
22402 | + TRACE_IRQS_ON | |
22403 | + .endif | |
22404 | + sti | |
22405 | + xorl %esi,%esi /* arg2: oldset */ | |
22406 | + movq %rsp,%rdi /* arg1: &pt_regs */ | |
22407 | + call do_notify_resume | |
22408 | + cli | |
22409 | + .if \trace | |
22410 | + TRACE_IRQS_OFF | |
22411 | + .endif | |
22412 | + jmp paranoid_userspace\trace | |
22413 | +paranoid_schedule\trace: | |
22414 | + .if \trace | |
22415 | + TRACE_IRQS_ON | |
22416 | + .endif | |
22417 | + sti | |
22418 | + call schedule | |
22419 | + cli | |
22420 | + .if \trace | |
22421 | + TRACE_IRQS_OFF | |
22422 | + .endif | |
22423 | + jmp paranoid_userspace\trace | |
22424 | + CFI_ENDPROC | |
22425 | + .endm | |
22426 | +#endif | |
22427 | + | |
22428 | +/* | |
22429 | + * Exception entry point. This expects an error code/orig_rax on the stack | |
22430 | + * and the exception handler in %rax. | |
22431 | + */ | |
22432 | +ENTRY(error_entry) | |
22433 | + _frame RDI | |
22434 | + CFI_REL_OFFSET rax,0 | |
22435 | + /* rdi slot contains rax, oldrax contains error code */ | |
22436 | + cld | |
22437 | + subq $14*8,%rsp | |
22438 | + CFI_ADJUST_CFA_OFFSET (14*8) | |
22439 | + movq %rsi,13*8(%rsp) | |
22440 | + CFI_REL_OFFSET rsi,RSI | |
22441 | + movq 14*8(%rsp),%rsi /* load rax from rdi slot */ | |
22442 | + CFI_REGISTER rax,rsi | |
22443 | + movq %rdx,12*8(%rsp) | |
22444 | + CFI_REL_OFFSET rdx,RDX | |
22445 | + movq %rcx,11*8(%rsp) | |
22446 | + CFI_REL_OFFSET rcx,RCX | |
22447 | + movq %rsi,10*8(%rsp) /* store rax */ | |
22448 | + CFI_REL_OFFSET rax,RAX | |
22449 | + movq %r8, 9*8(%rsp) | |
22450 | + CFI_REL_OFFSET r8,R8 | |
22451 | + movq %r9, 8*8(%rsp) | |
22452 | + CFI_REL_OFFSET r9,R9 | |
22453 | + movq %r10,7*8(%rsp) | |
22454 | + CFI_REL_OFFSET r10,R10 | |
22455 | + movq %r11,6*8(%rsp) | |
22456 | + CFI_REL_OFFSET r11,R11 | |
22457 | + movq %rbx,5*8(%rsp) | |
22458 | + CFI_REL_OFFSET rbx,RBX | |
22459 | + movq %rbp,4*8(%rsp) | |
22460 | + CFI_REL_OFFSET rbp,RBP | |
22461 | + movq %r12,3*8(%rsp) | |
22462 | + CFI_REL_OFFSET r12,R12 | |
22463 | + movq %r13,2*8(%rsp) | |
22464 | + CFI_REL_OFFSET r13,R13 | |
22465 | + movq %r14,1*8(%rsp) | |
22466 | + CFI_REL_OFFSET r14,R14 | |
22467 | + movq %r15,(%rsp) | |
22468 | + CFI_REL_OFFSET r15,R15 | |
22469 | +#if 0 | |
22470 | + cmpl $__KERNEL_CS,CS(%rsp) | |
22471 | + CFI_REMEMBER_STATE | |
22472 | + je error_kernelspace | |
22473 | +#endif | |
22474 | +error_call_handler: | |
22475 | + movq %rdi, RDI(%rsp) | |
22476 | + CFI_REL_OFFSET rdi,RDI | |
22477 | + movq %rsp,%rdi | |
22478 | + movq ORIG_RAX(%rsp),%rsi # get error code | |
22479 | + movq $-1,ORIG_RAX(%rsp) | |
22480 | + call *%rax | |
22481 | +error_exit: | |
22482 | + RESTORE_REST | |
22483 | +/* cli */ | |
22484 | + XEN_BLOCK_EVENTS(%rsi) | |
22485 | + TRACE_IRQS_OFF | |
22486 | + GET_THREAD_INFO(%rcx) | |
22487 | + testb $3,CS-ARGOFFSET(%rsp) | |
22488 | + jz retint_kernel | |
22489 | + movl threadinfo_flags(%rcx),%edx | |
22490 | + movl $_TIF_WORK_MASK,%edi | |
22491 | + andl %edi,%edx | |
22492 | + jnz retint_careful | |
22493 | + /* | |
22494 | + * The iret might restore flags: | |
22495 | + */ | |
22496 | + TRACE_IRQS_IRETQ | |
22497 | + jmp retint_restore_args | |
22498 | + | |
22499 | +#if 0 | |
22500 | + /* | |
22501 | + * We need to re-write the logic here because we don't do iretq to | |
22502 | + * to return to user mode. It's still possible that we get trap/fault | |
22503 | + * in the kernel (when accessing buffers pointed to by system calls, | |
22504 | + * for example). | |
22505 | + * | |
22506 | + */ | |
22507 | + CFI_RESTORE_STATE | |
22508 | +error_kernelspace: | |
22509 | + incl %ebx | |
22510 | + /* There are two places in the kernel that can potentially fault with | |
22511 | + usergs. Handle them here. The exception handlers after | |
22512 | + iret run with kernel gs again, so don't set the user space flag. | |
22513 | + B stepping K8s sometimes report an truncated RIP for IRET | |
22514 | + exceptions returning to compat mode. Check for these here too. */ | |
22515 | + leaq iret_label(%rip),%rbp | |
22516 | + cmpq %rbp,RIP(%rsp) | |
22517 | + je error_swapgs | |
22518 | + movl %ebp,%ebp /* zero extend */ | |
22519 | + cmpq %rbp,RIP(%rsp) | |
22520 | + je error_swapgs | |
22521 | + cmpq $gs_change,RIP(%rsp) | |
22522 | + je error_swapgs | |
22523 | + jmp error_sti | |
22524 | +#endif | |
22525 | + CFI_ENDPROC | |
22526 | +END(error_entry) | |
22527 | + | |
22528 | +ENTRY(hypervisor_callback) | |
22529 | + zeroentry do_hypervisor_callback | |
22530 | +END(hypervisor_callback) | |
22531 | + | |
22532 | +/* | |
22533 | + * Copied from arch/xen/i386/kernel/entry.S | |
22534 | + */ | |
22535 | +# A note on the "critical region" in our callback handler. | |
22536 | +# We want to avoid stacking callback handlers due to events occurring | |
22537 | +# during handling of the last event. To do this, we keep events disabled | |
22538 | +# until we've done all processing. HOWEVER, we must enable events before | |
22539 | +# popping the stack frame (can't be done atomically) and so it would still | |
22540 | +# be possible to get enough handler activations to overflow the stack. | |
22541 | +# Although unlikely, bugs of that kind are hard to track down, so we'd | |
22542 | +# like to avoid the possibility. | |
22543 | +# So, on entry to the handler we detect whether we interrupted an | |
22544 | +# existing activation in its critical region -- if so, we pop the current | |
22545 | +# activation and restart the handler using the previous one. | |
22546 | +ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) | |
22547 | + CFI_STARTPROC | |
22548 | +# Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will | |
22549 | +# see the correct pointer to the pt_regs | |
22550 | + movq %rdi, %rsp # we don't return, adjust the stack frame | |
22551 | + CFI_ENDPROC | |
22552 | + CFI_DEFAULT_STACK | |
22553 | +11: incl %gs:pda_irqcount | |
22554 | + movq %rsp,%rbp | |
22555 | + CFI_DEF_CFA_REGISTER rbp | |
22556 | + cmovzq %gs:pda_irqstackptr,%rsp | |
22557 | + pushq %rbp # backlink for old unwinder | |
22558 | + call evtchn_do_upcall | |
22559 | + popq %rsp | |
22560 | + CFI_DEF_CFA_REGISTER rsp | |
22561 | + decl %gs:pda_irqcount | |
22562 | + jmp error_exit | |
22563 | + CFI_ENDPROC | |
22564 | +END(do_hypervisor_callback) | |
22565 | + | |
22566 | +#ifdef CONFIG_X86_LOCAL_APIC | |
22567 | +KPROBE_ENTRY(nmi) | |
22568 | + zeroentry do_nmi_callback | |
22569 | +ENTRY(do_nmi_callback) | |
22570 | + CFI_STARTPROC | |
22571 | + addq $8, %rsp | |
22572 | + CFI_ENDPROC | |
22573 | + CFI_DEFAULT_STACK | |
22574 | + call do_nmi | |
22575 | + orl $NMI_MASK,EFLAGS(%rsp) | |
22576 | + RESTORE_REST | |
22577 | + XEN_BLOCK_EVENTS(%rsi) | |
22578 | + TRACE_IRQS_OFF | |
22579 | + GET_THREAD_INFO(%rcx) | |
22580 | + jmp retint_restore_args | |
22581 | + CFI_ENDPROC | |
22582 | + .previous .text | |
22583 | +END(nmi) | |
22584 | +#endif | |
22585 | + | |
22586 | + ALIGN | |
22587 | +restore_all_enable_events: | |
22588 | + CFI_DEFAULT_STACK adj=1 | |
22589 | + TRACE_IRQS_ON | |
22590 | + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up... | |
22591 | + | |
22592 | +scrit: /**** START OF CRITICAL REGION ****/ | |
22593 | + XEN_TEST_PENDING(%rsi) | |
22594 | + CFI_REMEMBER_STATE | |
22595 | + jnz 14f # process more events if necessary... | |
22596 | + XEN_PUT_VCPU_INFO(%rsi) | |
22597 | + RESTORE_ARGS 0,8,0 | |
22598 | + HYPERVISOR_IRET 0 | |
22599 | + | |
22600 | + CFI_RESTORE_STATE | |
22601 | +14: XEN_LOCKED_BLOCK_EVENTS(%rsi) | |
22602 | + XEN_PUT_VCPU_INFO(%rsi) | |
22603 | + SAVE_REST | |
22604 | + movq %rsp,%rdi # set the argument again | |
22605 | + jmp 11b | |
22606 | + CFI_ENDPROC | |
22607 | +ecrit: /**** END OF CRITICAL REGION ****/ | |
22608 | +# At this point, unlike on x86-32, we don't do the fixup to simplify the | |
22609 | +# code and the stack frame is more complex on x86-64. | |
22610 | +# When the kernel is interrupted in the critical section, the kernel | |
22611 | +# will do IRET in that case, and everything will be restored at that point, | |
22612 | +# i.e. it just resumes from the next instruction interrupted with the same context. | |
22613 | + | |
22614 | +# Hypervisor uses this for application faults while it executes. | |
22615 | +# We get here for two reasons: | |
22616 | +# 1. Fault while reloading DS, ES, FS or GS | |
22617 | +# 2. Fault while executing IRET | |
22618 | +# Category 1 we do not need to fix up as Xen has already reloaded all segment | |
22619 | +# registers that could be reloaded and zeroed the others. | |
22620 | +# Category 2 we fix up by killing the current process. We cannot use the | |
22621 | +# normal Linux return path in this case because if we use the IRET hypercall | |
22622 | +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. | |
22623 | +# We distinguish between categories by comparing each saved segment register | |
22624 | +# with its current contents: any discrepancy means we in category 1. | |
22625 | +ENTRY(failsafe_callback) | |
22626 | + _frame (RIP-0x30) | |
22627 | + CFI_REL_OFFSET rcx, 0 | |
22628 | + CFI_REL_OFFSET r11, 8 | |
22629 | + movw %ds,%cx | |
22630 | + cmpw %cx,0x10(%rsp) | |
22631 | + CFI_REMEMBER_STATE | |
22632 | + jne 1f | |
22633 | + movw %es,%cx | |
22634 | + cmpw %cx,0x18(%rsp) | |
22635 | + jne 1f | |
22636 | + movw %fs,%cx | |
22637 | + cmpw %cx,0x20(%rsp) | |
22638 | + jne 1f | |
22639 | + movw %gs,%cx | |
22640 | + cmpw %cx,0x28(%rsp) | |
22641 | + jne 1f | |
22642 | + /* All segments match their saved values => Category 2 (Bad IRET). */ | |
22643 | + movq (%rsp),%rcx | |
22644 | + CFI_RESTORE rcx | |
22645 | + movq 8(%rsp),%r11 | |
22646 | + CFI_RESTORE r11 | |
22647 | + addq $0x30,%rsp | |
22648 | + CFI_ADJUST_CFA_OFFSET -0x30 | |
22649 | + movq $11,%rdi /* SIGSEGV */ | |
22650 | + jmp do_exit | |
22651 | + CFI_RESTORE_STATE | |
22652 | +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ | |
22653 | + movq (%rsp),%rcx | |
22654 | + CFI_RESTORE rcx | |
22655 | + movq 8(%rsp),%r11 | |
22656 | + CFI_RESTORE r11 | |
22657 | + addq $0x30,%rsp | |
22658 | + CFI_ADJUST_CFA_OFFSET -0x30 | |
22659 | + pushq $0 | |
22660 | + CFI_ADJUST_CFA_OFFSET 8 | |
22661 | + SAVE_ALL | |
22662 | + jmp error_exit | |
22663 | + CFI_ENDPROC | |
22664 | +#if 0 | |
22665 | + .section __ex_table,"a" | |
22666 | + .align 8 | |
22667 | + .quad gs_change,bad_gs | |
22668 | + .previous | |
22669 | + .section .fixup,"ax" | |
22670 | + /* running with kernelgs */ | |
22671 | +bad_gs: | |
22672 | +/* swapgs */ /* switch back to user gs */ | |
22673 | + xorl %eax,%eax | |
22674 | + movl %eax,%gs | |
22675 | + jmp 2b | |
22676 | + .previous | |
22677 | +#endif | |
22678 | + | |
22679 | +/* | |
22680 | + * Create a kernel thread. | |
22681 | + * | |
22682 | + * C extern interface: | |
22683 | + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | |
22684 | + * | |
22685 | + * asm input arguments: | |
22686 | + * rdi: fn, rsi: arg, rdx: flags | |
22687 | + */ | |
22688 | +ENTRY(kernel_thread) | |
22689 | + CFI_STARTPROC | |
22690 | + FAKE_STACK_FRAME $child_rip | |
22691 | + SAVE_ALL | |
22692 | + | |
22693 | + # rdi: flags, rsi: usp, rdx: will be &pt_regs | |
22694 | + movq %rdx,%rdi | |
22695 | + orq kernel_thread_flags(%rip),%rdi | |
22696 | + movq $-1, %rsi | |
22697 | + movq %rsp, %rdx | |
22698 | + | |
22699 | + xorl %r8d,%r8d | |
22700 | + xorl %r9d,%r9d | |
22701 | + | |
22702 | + # clone now | |
22703 | + call do_fork | |
22704 | + movq %rax,RAX(%rsp) | |
22705 | + xorl %edi,%edi | |
22706 | + | |
22707 | + /* | |
22708 | + * It isn't worth to check for reschedule here, | |
22709 | + * so internally to the x86_64 port you can rely on kernel_thread() | |
22710 | + * not to reschedule the child before returning, this avoids the need | |
22711 | + * of hacks for example to fork off the per-CPU idle tasks. | |
22712 | + * [Hopefully no generic code relies on the reschedule -AK] | |
22713 | + */ | |
22714 | + RESTORE_ALL | |
22715 | + UNFAKE_STACK_FRAME | |
22716 | + ret | |
22717 | + CFI_ENDPROC | |
22718 | +ENDPROC(kernel_thread) | |
22719 | + | |
22720 | +child_rip: | |
22721 | + pushq $0 # fake return address | |
22722 | + CFI_STARTPROC | |
22723 | + /* | |
22724 | + * Here we are in the child and the registers are set as they were | |
22725 | + * at kernel_thread() invocation in the parent. | |
22726 | + */ | |
22727 | + movq %rdi, %rax | |
22728 | + movq %rsi, %rdi | |
22729 | + call *%rax | |
22730 | + # exit | |
22731 | + xorl %edi, %edi | |
22732 | + call do_exit | |
22733 | + CFI_ENDPROC | |
22734 | +ENDPROC(child_rip) | |
22735 | + | |
22736 | +/* | |
22737 | + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | |
22738 | + * | |
22739 | + * C extern interface: | |
22740 | + * extern long execve(char *name, char **argv, char **envp) | |
22741 | + * | |
22742 | + * asm input arguments: | |
22743 | + * rdi: name, rsi: argv, rdx: envp | |
22744 | + * | |
22745 | + * We want to fallback into: | |
22746 | + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) | |
22747 | + * | |
22748 | + * do_sys_execve asm fallback arguments: | |
22749 | + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | |
22750 | + */ | |
22751 | +ENTRY(execve) | |
22752 | + CFI_STARTPROC | |
22753 | + FAKE_STACK_FRAME $0 | |
22754 | + SAVE_ALL | |
22755 | + call sys_execve | |
22756 | + movq %rax, RAX(%rsp) | |
22757 | + RESTORE_REST | |
22758 | + testq %rax,%rax | |
22759 | + jne 1f | |
22760 | + jmp int_ret_from_sys_call | |
22761 | +1: RESTORE_ARGS | |
22762 | + UNFAKE_STACK_FRAME | |
22763 | + ret | |
22764 | + CFI_ENDPROC | |
22765 | +ENDPROC(execve) | |
22766 | + | |
22767 | +KPROBE_ENTRY(page_fault) | |
22768 | + errorentry do_page_fault | |
22769 | +END(page_fault) | |
22770 | + .previous .text | |
22771 | + | |
22772 | +ENTRY(coprocessor_error) | |
22773 | + zeroentry do_coprocessor_error | |
22774 | +END(coprocessor_error) | |
22775 | + | |
22776 | +ENTRY(simd_coprocessor_error) | |
22777 | + zeroentry do_simd_coprocessor_error | |
22778 | +END(simd_coprocessor_error) | |
22779 | + | |
22780 | +ENTRY(device_not_available) | |
22781 | + zeroentry math_state_restore | |
22782 | +END(device_not_available) | |
22783 | + | |
22784 | + /* runs on exception stack */ | |
22785 | +KPROBE_ENTRY(debug) | |
22786 | +/* INTR_FRAME | |
22787 | + pushq $0 | |
22788 | + CFI_ADJUST_CFA_OFFSET 8 */ | |
22789 | + zeroentry do_debug | |
22790 | +/* paranoidexit | |
22791 | + CFI_ENDPROC */ | |
22792 | +END(debug) | |
22793 | + .previous .text | |
22794 | + | |
22795 | +#if 0 | |
22796 | + /* runs on exception stack */ | |
22797 | +KPROBE_ENTRY(nmi) | |
22798 | + INTR_FRAME | |
22799 | + pushq $-1 | |
22800 | + CFI_ADJUST_CFA_OFFSET 8 | |
22801 | + paranoidentry do_nmi, 0, 0 | |
22802 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
22803 | + paranoidexit 0 | |
22804 | +#else | |
22805 | + jmp paranoid_exit1 | |
22806 | + CFI_ENDPROC | |
22807 | +#endif | |
22808 | +END(nmi) | |
22809 | + .previous .text | |
22810 | +#endif | |
22811 | + | |
22812 | +KPROBE_ENTRY(int3) | |
22813 | +/* INTR_FRAME | |
22814 | + pushq $0 | |
22815 | + CFI_ADJUST_CFA_OFFSET 8 */ | |
22816 | + zeroentry do_int3 | |
22817 | +/* jmp paranoid_exit1 | |
22818 | + CFI_ENDPROC */ | |
22819 | +END(int3) | |
22820 | + .previous .text | |
22821 | + | |
22822 | +ENTRY(overflow) | |
22823 | + zeroentry do_overflow | |
22824 | +END(overflow) | |
22825 | + | |
22826 | +ENTRY(bounds) | |
22827 | + zeroentry do_bounds | |
22828 | +END(bounds) | |
22829 | + | |
22830 | +ENTRY(invalid_op) | |
22831 | + zeroentry do_invalid_op | |
22832 | +END(invalid_op) | |
22833 | + | |
22834 | +ENTRY(coprocessor_segment_overrun) | |
22835 | + zeroentry do_coprocessor_segment_overrun | |
22836 | +END(coprocessor_segment_overrun) | |
22837 | + | |
22838 | +ENTRY(reserved) | |
22839 | + zeroentry do_reserved | |
22840 | +END(reserved) | |
22841 | + | |
22842 | +#if 0 | |
22843 | + /* runs on exception stack */ | |
22844 | +ENTRY(double_fault) | |
22845 | + XCPT_FRAME | |
22846 | + paranoidentry do_double_fault | |
22847 | + jmp paranoid_exit1 | |
22848 | + CFI_ENDPROC | |
22849 | +END(double_fault) | |
22850 | +#endif | |
22851 | + | |
22852 | +ENTRY(invalid_TSS) | |
22853 | + errorentry do_invalid_TSS | |
22854 | +END(invalid_TSS) | |
22855 | + | |
22856 | +ENTRY(segment_not_present) | |
22857 | + errorentry do_segment_not_present | |
22858 | +END(segment_not_present) | |
22859 | + | |
22860 | + /* runs on exception stack */ | |
22861 | +ENTRY(stack_segment) | |
22862 | +/* XCPT_FRAME | |
22863 | + paranoidentry do_stack_segment */ | |
22864 | + errorentry do_stack_segment | |
22865 | +/* jmp paranoid_exit1 | |
22866 | + CFI_ENDPROC */ | |
22867 | +END(stack_segment) | |
22868 | + | |
22869 | +KPROBE_ENTRY(general_protection) | |
22870 | + errorentry do_general_protection | |
22871 | +END(general_protection) | |
22872 | + .previous .text | |
22873 | + | |
22874 | +ENTRY(alignment_check) | |
22875 | + errorentry do_alignment_check | |
22876 | +END(alignment_check) | |
22877 | + | |
22878 | +ENTRY(divide_error) | |
22879 | + zeroentry do_divide_error | |
22880 | +END(divide_error) | |
22881 | + | |
22882 | +ENTRY(spurious_interrupt_bug) | |
22883 | + zeroentry do_spurious_interrupt_bug | |
22884 | +END(spurious_interrupt_bug) | |
22885 | + | |
22886 | +#ifdef CONFIG_X86_MCE | |
22887 | + /* runs on exception stack */ | |
22888 | +ENTRY(machine_check) | |
22889 | + INTR_FRAME | |
22890 | + pushq $0 | |
22891 | + CFI_ADJUST_CFA_OFFSET 8 | |
22892 | + paranoidentry do_machine_check | |
22893 | + jmp paranoid_exit1 | |
22894 | + CFI_ENDPROC | |
22895 | +END(machine_check) | |
22896 | +#endif | |
22897 | + | |
22898 | +/* Call softirq on interrupt stack. Interrupts are off. */ | |
22899 | +ENTRY(call_softirq) | |
22900 | + CFI_STARTPROC | |
22901 | + push %rbp | |
22902 | + CFI_ADJUST_CFA_OFFSET 8 | |
22903 | + CFI_REL_OFFSET rbp,0 | |
22904 | + mov %rsp,%rbp | |
22905 | + CFI_DEF_CFA_REGISTER rbp | |
22906 | + incl %gs:pda_irqcount | |
22907 | + cmove %gs:pda_irqstackptr,%rsp | |
22908 | + push %rbp # backlink for old unwinder | |
22909 | + call __do_softirq | |
22910 | + leaveq | |
22911 | + CFI_DEF_CFA_REGISTER rsp | |
22912 | + CFI_ADJUST_CFA_OFFSET -8 | |
22913 | + decl %gs:pda_irqcount | |
22914 | + ret | |
22915 | + CFI_ENDPROC | |
22916 | +ENDPROC(call_softirq) | |
22917 | + | |
22918 | +#ifdef CONFIG_STACK_UNWIND | |
22919 | +ENTRY(arch_unwind_init_running) | |
22920 | + CFI_STARTPROC | |
22921 | + movq %r15, R15(%rdi) | |
22922 | + movq %r14, R14(%rdi) | |
22923 | + xchgq %rsi, %rdx | |
22924 | + movq %r13, R13(%rdi) | |
22925 | + movq %r12, R12(%rdi) | |
22926 | + xorl %eax, %eax | |
22927 | + movq %rbp, RBP(%rdi) | |
22928 | + movq %rbx, RBX(%rdi) | |
22929 | + movq (%rsp), %rcx | |
22930 | + movq %rax, R11(%rdi) | |
22931 | + movq %rax, R10(%rdi) | |
22932 | + movq %rax, R9(%rdi) | |
22933 | + movq %rax, R8(%rdi) | |
22934 | + movq %rax, RAX(%rdi) | |
22935 | + movq %rax, RCX(%rdi) | |
22936 | + movq %rax, RDX(%rdi) | |
22937 | + movq %rax, RSI(%rdi) | |
22938 | + movq %rax, RDI(%rdi) | |
22939 | + movq %rax, ORIG_RAX(%rdi) | |
22940 | + movq %rcx, RIP(%rdi) | |
22941 | + leaq 8(%rsp), %rcx | |
22942 | + movq $__KERNEL_CS, CS(%rdi) | |
22943 | + movq %rax, EFLAGS(%rdi) | |
22944 | + movq %rcx, RSP(%rdi) | |
22945 | + movq $__KERNEL_DS, SS(%rdi) | |
22946 | + jmpq *%rdx | |
22947 | + CFI_ENDPROC | |
22948 | +ENDPROC(arch_unwind_init_running) | |
22949 | +#endif | |
22950 | Index: head-2008-11-25/arch/x86/kernel/genapic_64-xen.c | |
22951 | =================================================================== | |
22952 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
22953 | +++ head-2008-11-25/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
22954 | @@ -0,0 +1,143 @@ | |
22955 | +/* | |
22956 | + * Copyright 2004 James Cleverdon, IBM. | |
22957 | + * Subject to the GNU Public License, v.2 | |
22958 | + * | |
22959 | + * Generic APIC sub-arch probe layer. | |
22960 | + * | |
22961 | + * Hacked for x86-64 by James Cleverdon from i386 architecture code by | |
22962 | + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | |
22963 | + * James Cleverdon. | |
22964 | + */ | |
22965 | +#include <linux/threads.h> | |
22966 | +#include <linux/cpumask.h> | |
22967 | +#include <linux/string.h> | |
22968 | +#include <linux/kernel.h> | |
22969 | +#include <linux/ctype.h> | |
22970 | +#include <linux/init.h> | |
22971 | +#include <linux/module.h> | |
22972 | + | |
22973 | +#include <asm/smp.h> | |
22974 | +#include <asm/ipi.h> | |
22975 | + | |
22976 | +#if defined(CONFIG_ACPI) | |
22977 | +#include <acpi/acpi_bus.h> | |
22978 | +#endif | |
22979 | + | |
22980 | +/* which logical CPU number maps to which CPU (physical APIC ID) */ | |
22981 | +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
22982 | +EXPORT_SYMBOL(x86_cpu_to_apicid); | |
22983 | +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
22984 | + | |
22985 | +extern struct genapic apic_cluster; | |
22986 | +extern struct genapic apic_flat; | |
22987 | +extern struct genapic apic_physflat; | |
22988 | + | |
22989 | +#ifndef CONFIG_XEN | |
22990 | +struct genapic *genapic = &apic_flat; | |
22991 | +#else | |
22992 | +extern struct genapic apic_xen; | |
22993 | +struct genapic *genapic = &apic_xen; | |
22994 | +#endif | |
22995 | + | |
22996 | + | |
22997 | +/* | |
22998 | + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | |
22999 | + */ | |
23000 | +void __init clustered_apic_check(void) | |
23001 | +{ | |
23002 | +#ifndef CONFIG_XEN | |
23003 | + long i; | |
23004 | + u8 clusters, max_cluster; | |
23005 | + u8 id; | |
23006 | + u8 cluster_cnt[NUM_APIC_CLUSTERS]; | |
23007 | + int max_apic = 0; | |
23008 | + | |
23009 | +#if defined(CONFIG_ACPI) | |
23010 | + /* | |
23011 | + * Some x86_64 machines use physical APIC mode regardless of how many | |
23012 | + * procs/clusters are present (x86_64 ES7000 is an example). | |
23013 | + */ | |
23014 | + if (acpi_fadt.revision > FADT2_REVISION_ID) | |
23015 | + if (acpi_fadt.force_apic_physical_destination_mode) { | |
23016 | + genapic = &apic_cluster; | |
23017 | + goto print; | |
23018 | + } | |
23019 | +#endif | |
23020 | + | |
23021 | + memset(cluster_cnt, 0, sizeof(cluster_cnt)); | |
23022 | + for (i = 0; i < NR_CPUS; i++) { | |
23023 | + id = bios_cpu_apicid[i]; | |
23024 | + if (id == BAD_APICID) | |
23025 | + continue; | |
23026 | + if (id > max_apic) | |
23027 | + max_apic = id; | |
23028 | + cluster_cnt[APIC_CLUSTERID(id)]++; | |
23029 | + } | |
23030 | + | |
23031 | + /* Don't use clustered mode on AMD platforms. */ | |
23032 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | |
23033 | + genapic = &apic_physflat; | |
23034 | +#ifndef CONFIG_HOTPLUG_CPU | |
23035 | + /* In the CPU hotplug case we cannot use broadcast mode | |
23036 | + because that opens a race when a CPU is removed. | |
23037 | + Stay at physflat mode in this case. | |
23038 | + It is bad to do this unconditionally though. Once | |
23039 | + we have ACPI platform support for CPU hotplug | |
23040 | + we should detect hotplug capablity from ACPI tables and | |
23041 | + only do this when really needed. -AK */ | |
23042 | + if (max_apic <= 8) | |
23043 | + genapic = &apic_flat; | |
23044 | +#endif | |
23045 | + goto print; | |
23046 | + } | |
23047 | + | |
23048 | + clusters = 0; | |
23049 | + max_cluster = 0; | |
23050 | + | |
23051 | + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | |
23052 | + if (cluster_cnt[i] > 0) { | |
23053 | + ++clusters; | |
23054 | + if (cluster_cnt[i] > max_cluster) | |
23055 | + max_cluster = cluster_cnt[i]; | |
23056 | + } | |
23057 | + } | |
23058 | + | |
23059 | + /* | |
23060 | + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, | |
23061 | + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical | |
23062 | + * else physical mode. | |
23063 | + * (We don't use lowest priority delivery + HW APIC IRQ steering, so | |
23064 | + * can ignore the clustered logical case and go straight to physical.) | |
23065 | + */ | |
23066 | + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { | |
23067 | +#ifdef CONFIG_HOTPLUG_CPU | |
23068 | + /* Don't use APIC shortcuts in CPU hotplug to avoid races */ | |
23069 | + genapic = &apic_physflat; | |
23070 | +#else | |
23071 | + genapic = &apic_flat; | |
23072 | +#endif | |
23073 | + } else | |
23074 | + genapic = &apic_cluster; | |
23075 | + | |
23076 | +print: | |
23077 | +#else | |
23078 | + /* hardcode to xen apic functions */ | |
23079 | + genapic = &apic_xen; | |
23080 | +#endif | |
23081 | + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | |
23082 | +} | |
23083 | + | |
23084 | +/* Same for both flat and clustered. */ | |
23085 | + | |
23086 | +#ifdef CONFIG_XEN | |
23087 | +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); | |
23088 | +#endif | |
23089 | + | |
23090 | +void send_IPI_self(int vector) | |
23091 | +{ | |
23092 | +#ifndef CONFIG_XEN | |
23093 | + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | |
23094 | +#else | |
23095 | + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | |
23096 | +#endif | |
23097 | +} | |
23098 | Index: head-2008-11-25/arch/x86/kernel/genapic_xen_64.c | |
23099 | =================================================================== | |
23100 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
23101 | +++ head-2008-11-25/arch/x86/kernel/genapic_xen_64.c 2007-06-12 13:13:01.000000000 +0200 | |
23102 | @@ -0,0 +1,161 @@ | |
23103 | +/* | |
23104 | + * Copyright 2004 James Cleverdon, IBM. | |
23105 | + * Subject to the GNU Public License, v.2 | |
23106 | + * | |
23107 | + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery. | |
23108 | + * | |
23109 | + * Hacked for x86-64 by James Cleverdon from i386 architecture code by | |
23110 | + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | |
23111 | + * James Cleverdon. | |
23112 | + * | |
23113 | + * Hacked to pieces for Xen by Chris Wright. | |
23114 | + */ | |
23115 | +#include <linux/threads.h> | |
23116 | +#include <linux/cpumask.h> | |
23117 | +#include <linux/string.h> | |
23118 | +#include <linux/kernel.h> | |
23119 | +#include <linux/ctype.h> | |
23120 | +#include <linux/init.h> | |
23121 | +#ifdef CONFIG_XEN_PRIVILEGED_GUEST | |
23122 | +#include <asm/smp.h> | |
23123 | +#include <asm/ipi.h> | |
23124 | +#else | |
23125 | +#include <asm/apic.h> | |
23126 | +#include <asm/apicdef.h> | |
23127 | +#include <asm/genapic.h> | |
23128 | +#endif | |
23129 | +#include <xen/evtchn.h> | |
23130 | + | |
23131 | +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); | |
23132 | + | |
23133 | +static inline void __send_IPI_one(unsigned int cpu, int vector) | |
23134 | +{ | |
23135 | + int irq = per_cpu(ipi_to_irq, cpu)[vector]; | |
23136 | + BUG_ON(irq < 0); | |
23137 | + notify_remote_via_irq(irq); | |
23138 | +} | |
23139 | + | |
23140 | +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) | |
23141 | +{ | |
23142 | + int cpu; | |
23143 | + | |
23144 | + switch (shortcut) { | |
23145 | + case APIC_DEST_SELF: | |
23146 | + __send_IPI_one(smp_processor_id(), vector); | |
23147 | + break; | |
23148 | + case APIC_DEST_ALLBUT: | |
23149 | + for (cpu = 0; cpu < NR_CPUS; ++cpu) { | |
23150 | + if (cpu == smp_processor_id()) | |
23151 | + continue; | |
23152 | + if (cpu_isset(cpu, cpu_online_map)) { | |
23153 | + __send_IPI_one(cpu, vector); | |
23154 | + } | |
23155 | + } | |
23156 | + break; | |
23157 | + case APIC_DEST_ALLINC: | |
23158 | + for (cpu = 0; cpu < NR_CPUS; ++cpu) { | |
23159 | + if (cpu_isset(cpu, cpu_online_map)) { | |
23160 | + __send_IPI_one(cpu, vector); | |
23161 | + } | |
23162 | + } | |
23163 | + break; | |
23164 | + default: | |
23165 | + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, | |
23166 | + vector); | |
23167 | + break; | |
23168 | + } | |
23169 | +} | |
23170 | + | |
23171 | +static cpumask_t xen_target_cpus(void) | |
23172 | +{ | |
23173 | + return cpu_online_map; | |
23174 | +} | |
23175 | + | |
23176 | +/* | |
23177 | + * Set up the logical destination ID. | |
23178 | + * Do nothing, not called now. | |
23179 | + */ | |
23180 | +static void xen_init_apic_ldr(void) | |
23181 | +{ | |
23182 | + Dprintk("%s\n", __FUNCTION__); | |
23183 | + return; | |
23184 | +} | |
23185 | + | |
23186 | +static void xen_send_IPI_allbutself(int vector) | |
23187 | +{ | |
23188 | + /* | |
23189 | + * if there are no other CPUs in the system then | |
23190 | + * we get an APIC send error if we try to broadcast. | |
23191 | + * thus we have to avoid sending IPIs in this case. | |
23192 | + */ | |
23193 | + Dprintk("%s\n", __FUNCTION__); | |
23194 | + if (num_online_cpus() > 1) | |
23195 | + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); | |
23196 | +} | |
23197 | + | |
23198 | +static void xen_send_IPI_all(int vector) | |
23199 | +{ | |
23200 | + Dprintk("%s\n", __FUNCTION__); | |
23201 | + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | |
23202 | +} | |
23203 | + | |
23204 | +static void xen_send_IPI_mask(cpumask_t cpumask, int vector) | |
23205 | +{ | |
23206 | + unsigned long mask = cpus_addr(cpumask)[0]; | |
23207 | + unsigned int cpu; | |
23208 | + unsigned long flags; | |
23209 | + | |
23210 | + Dprintk("%s\n", __FUNCTION__); | |
23211 | + local_irq_save(flags); | |
23212 | + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); | |
23213 | + | |
23214 | + for (cpu = 0; cpu < NR_CPUS; ++cpu) { | |
23215 | + if (cpu_isset(cpu, cpumask)) { | |
23216 | + __send_IPI_one(cpu, vector); | |
23217 | + } | |
23218 | + } | |
23219 | + local_irq_restore(flags); | |
23220 | +} | |
23221 | + | |
23222 | +#ifdef CONFIG_XEN_PRIVILEGED_GUEST | |
23223 | +static int xen_apic_id_registered(void) | |
23224 | +{ | |
23225 | + /* better be set */ | |
23226 | + Dprintk("%s\n", __FUNCTION__); | |
23227 | + return physid_isset(smp_processor_id(), phys_cpu_present_map); | |
23228 | +} | |
23229 | +#endif | |
23230 | + | |
23231 | +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask) | |
23232 | +{ | |
23233 | + Dprintk("%s\n", __FUNCTION__); | |
23234 | + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; | |
23235 | +} | |
23236 | + | |
23237 | +static unsigned int phys_pkg_id(int index_msb) | |
23238 | +{ | |
23239 | + u32 ebx; | |
23240 | + | |
23241 | + Dprintk("%s\n", __FUNCTION__); | |
23242 | + ebx = cpuid_ebx(1); | |
23243 | + return ((ebx >> 24) & 0xFF) >> index_msb; | |
23244 | +} | |
23245 | + | |
23246 | +struct genapic apic_xen = { | |
23247 | + .name = "xen", | |
23248 | +#ifdef CONFIG_XEN_PRIVILEGED_GUEST | |
23249 | + .int_delivery_mode = dest_LowestPrio, | |
23250 | +#endif | |
23251 | + .int_dest_mode = (APIC_DEST_LOGICAL != 0), | |
23252 | + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, | |
23253 | + .target_cpus = xen_target_cpus, | |
23254 | +#ifdef CONFIG_XEN_PRIVILEGED_GUEST | |
23255 | + .apic_id_registered = xen_apic_id_registered, | |
23256 | +#endif | |
23257 | + .init_apic_ldr = xen_init_apic_ldr, | |
23258 | + .send_IPI_all = xen_send_IPI_all, | |
23259 | + .send_IPI_allbutself = xen_send_IPI_allbutself, | |
23260 | + .send_IPI_mask = xen_send_IPI_mask, | |
23261 | + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid, | |
23262 | + .phys_pkg_id = phys_pkg_id, | |
23263 | +}; | |
23264 | Index: head-2008-11-25/arch/x86/kernel/head_64-xen.S | |
23265 | =================================================================== | |
23266 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
23267 | +++ head-2008-11-25/arch/x86/kernel/head_64-xen.S 2007-08-06 15:10:49.000000000 +0200 | |
23268 | @@ -0,0 +1,214 @@ | |
23269 | +/* | |
23270 | + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | |
23271 | + * | |
23272 | + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | |
23273 | + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | |
23274 | + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | |
23275 | + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | |
23276 | + * | |
23277 | + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ | |
23278 | + * | |
23279 | + * Jun Nakajima <jun.nakajima@intel.com> | |
23280 | + * Modified for Xen | |
23281 | + */ | |
23282 | + | |
23283 | + | |
23284 | +#include <linux/linkage.h> | |
23285 | +#include <linux/threads.h> | |
23286 | +#include <linux/init.h> | |
23287 | +#include <linux/elfnote.h> | |
23288 | +#include <asm/desc.h> | |
23289 | +#include <asm/segment.h> | |
23290 | +#include <asm/page.h> | |
23291 | +#include <asm/msr.h> | |
23292 | +#include <asm/cache.h> | |
23293 | +#include <asm/dwarf2.h> | |
23294 | +#include <xen/interface/elfnote.h> | |
23295 | + | |
23296 | + .section .bootstrap.text, "ax", @progbits | |
23297 | + .code64 | |
23298 | + .globl startup_64 | |
23299 | +startup_64: | |
23300 | + movq $(init_thread_union+THREAD_SIZE-8),%rsp | |
23301 | + | |
23302 | + /* rsi is pointer to startup info structure. | |
23303 | + pass it to C */ | |
23304 | + movq %rsi,%rdi | |
23305 | + pushq $0 # fake return address | |
23306 | + jmp x86_64_start_kernel | |
23307 | + | |
23308 | +#ifdef CONFIG_ACPI_SLEEP | |
23309 | +.org 0xf00 | |
23310 | + .globl pGDT32 | |
23311 | +pGDT32: | |
23312 | + .word gdt_end-cpu_gdt_table-1 | |
23313 | + .long cpu_gdt_table-__START_KERNEL_map | |
23314 | +#endif | |
23315 | +ENTRY(stext) | |
23316 | +ENTRY(_stext) | |
23317 | + | |
23318 | + $page = 0 | |
23319 | +#define NEXT_PAGE(name) \ | |
23320 | + $page = $page + 1; \ | |
23321 | + .org $page * 0x1000; \ | |
23322 | + phys_##name = $page * 0x1000 + __PHYSICAL_START; \ | |
23323 | +ENTRY(name) | |
23324 | + | |
23325 | +NEXT_PAGE(init_level4_pgt) | |
23326 | + /* This gets initialized in x86_64_start_kernel */ | |
23327 | + .fill 512,8,0 | |
23328 | +NEXT_PAGE(init_level4_user_pgt) | |
23329 | + /* | |
23330 | + * We update two pgd entries to make kernel and user pgd consistent | |
23331 | + * at pgd_populate(). It can be used for kernel modules. So we place | |
23332 | + * this page here for those cases to avoid memory corruption. | |
23333 | + * We also use this page to establish the initial mapping for the | |
23334 | + * vsyscall area. | |
23335 | + */ | |
23336 | + .fill 512,8,0 | |
23337 | + | |
23338 | +NEXT_PAGE(level3_kernel_pgt) | |
23339 | + .fill 512,8,0 | |
23340 | + | |
23341 | + /* | |
23342 | + * This is used for vsyscall area mapping as we have a different | |
23343 | + * level4 page table for user. | |
23344 | + */ | |
23345 | +NEXT_PAGE(level3_user_pgt) | |
23346 | + .fill 512,8,0 | |
23347 | + | |
23348 | +NEXT_PAGE(level2_kernel_pgt) | |
23349 | + .fill 512,8,0 | |
23350 | + | |
23351 | +NEXT_PAGE(hypercall_page) | |
23352 | + CFI_STARTPROC | |
23353 | + .rept 0x1000 / 0x20 | |
23354 | + .skip 1 /* push %rcx */ | |
23355 | + CFI_ADJUST_CFA_OFFSET 8 | |
23356 | + CFI_REL_OFFSET rcx,0 | |
23357 | + .skip 2 /* push %r11 */ | |
23358 | + CFI_ADJUST_CFA_OFFSET 8 | |
23359 | + CFI_REL_OFFSET rcx,0 | |
23360 | + .skip 5 /* mov $#,%eax */ | |
23361 | + .skip 2 /* syscall */ | |
23362 | + .skip 2 /* pop %r11 */ | |
23363 | + CFI_ADJUST_CFA_OFFSET -8 | |
23364 | + CFI_RESTORE r11 | |
23365 | + .skip 1 /* pop %rcx */ | |
23366 | + CFI_ADJUST_CFA_OFFSET -8 | |
23367 | + CFI_RESTORE rcx | |
23368 | + .align 0x20,0 /* ret */ | |
23369 | + .endr | |
23370 | + CFI_ENDPROC | |
23371 | + | |
23372 | +#undef NEXT_PAGE | |
23373 | + | |
23374 | + .data | |
23375 | +/* Just dummy symbol to allow compilation. Not used in sleep path */ | |
23376 | +#ifdef CONFIG_ACPI_SLEEP | |
23377 | + .align PAGE_SIZE | |
23378 | +ENTRY(wakeup_level4_pgt) | |
23379 | + .fill 512,8,0 | |
23380 | +#endif | |
23381 | + | |
23382 | + .data | |
23383 | + | |
23384 | + .align 16 | |
23385 | + .globl cpu_gdt_descr | |
23386 | +cpu_gdt_descr: | |
23387 | + .word gdt_end-cpu_gdt_table-1 | |
23388 | +gdt: | |
23389 | + .quad cpu_gdt_table | |
23390 | +#ifdef CONFIG_SMP | |
23391 | + .rept NR_CPUS-1 | |
23392 | + .word 0 | |
23393 | + .quad 0 | |
23394 | + .endr | |
23395 | +#endif | |
23396 | + | |
23397 | +/* We need valid kernel segments for data and code in long mode too | |
23398 | + * IRET will check the segment types kkeil 2000/10/28 | |
23399 | + * Also sysret mandates a special GDT layout | |
23400 | + */ | |
23401 | + | |
23402 | + .section .data.page_aligned, "aw" | |
23403 | + .align PAGE_SIZE | |
23404 | + | |
23405 | +/* The TLS descriptors are currently at a different place compared to i386. | |
23406 | + Hopefully nobody expects them at a fixed place (Wine?) */ | |
23407 | + | |
23408 | +ENTRY(cpu_gdt_table) | |
23409 | + .quad 0x0000000000000000 /* NULL descriptor */ | |
23410 | + .quad 0x0 /* unused */ | |
23411 | + .quad 0x00af9a000000ffff /* __KERNEL_CS */ | |
23412 | + .quad 0x00cf92000000ffff /* __KERNEL_DS */ | |
23413 | + .quad 0x00cffa000000ffff /* __USER32_CS */ | |
23414 | + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ | |
23415 | + .quad 0x00affa000000ffff /* __USER_CS */ | |
23416 | + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ | |
23417 | + .quad 0,0 /* TSS */ | |
23418 | + .quad 0,0 /* LDT */ | |
23419 | + .quad 0,0,0 /* three TLS descriptors */ | |
23420 | + .quad 0 /* unused */ | |
23421 | +gdt_end: | |
23422 | + /* asm/segment.h:GDT_ENTRIES must match this */ | |
23423 | + /* This should be a multiple of the cache line size */ | |
23424 | + /* GDTs of other CPUs are now dynamically allocated */ | |
23425 | + | |
23426 | + /* zero the remaining page */ | |
23427 | + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 | |
23428 | + | |
23429 | + .section .bss.page_aligned, "aw", @nobits | |
23430 | + .align PAGE_SIZE | |
23431 | +ENTRY(empty_zero_page) | |
23432 | + .skip PAGE_SIZE | |
23433 | + | |
23434 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
23435 | +/* | |
23436 | + * __xen_guest information | |
23437 | + */ | |
23438 | +.macro utoh value | |
23439 | + .if (\value) < 0 || (\value) >= 0x10 | |
23440 | + utoh (((\value)>>4)&0x0fffffffffffffff) | |
23441 | + .endif | |
23442 | + .if ((\value) & 0xf) < 10 | |
23443 | + .byte '0' + ((\value) & 0xf) | |
23444 | + .else | |
23445 | + .byte 'A' + ((\value) & 0xf) - 10 | |
23446 | + .endif | |
23447 | +.endm | |
23448 | + | |
23449 | +.section __xen_guest | |
23450 | + .ascii "GUEST_OS=linux,GUEST_VER=2.6" | |
23451 | + .ascii ",XEN_VER=xen-3.0" | |
23452 | + .ascii ",VIRT_BASE=0x" | |
23453 | + utoh __START_KERNEL_map | |
23454 | + .ascii ",ELF_PADDR_OFFSET=0x" | |
23455 | + utoh __START_KERNEL_map | |
23456 | + .ascii ",VIRT_ENTRY=0x" | |
23457 | + utoh (__START_KERNEL_map + __PHYSICAL_START) | |
23458 | + .ascii ",HYPERCALL_PAGE=0x" | |
23459 | + utoh (phys_hypercall_page >> PAGE_SHIFT) | |
23460 | + .ascii ",FEATURES=writable_page_tables" | |
23461 | + .ascii "|writable_descriptor_tables" | |
23462 | + .ascii "|auto_translated_physmap" | |
23463 | + .ascii "|supervisor_mode_kernel" | |
23464 | + .ascii ",LOADER=generic" | |
23465 | + .byte 0 | |
23466 | +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ | |
23467 | + | |
23468 | + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") | |
23469 | + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") | |
23470 | + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") | |
23471 | + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map) | |
23472 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
23473 | + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map) | |
23474 | +#else | |
23475 | + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0) | |
23476 | +#endif | |
23477 | + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64) | |
23478 | + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page) | |
23479 | + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) | |
23480 | + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") | |
23481 | + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") | |
23482 | + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) | |
23483 | Index: head-2008-11-25/arch/x86/kernel/head64-xen.c | |
23484 | =================================================================== | |
23485 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
23486 | +++ head-2008-11-25/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
23487 | @@ -0,0 +1,162 @@ | |
23488 | +/* | |
23489 | + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code | |
23490 | + * | |
23491 | + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | |
23492 | + * | |
23493 | + * Jun Nakajima <jun.nakajima@intel.com> | |
23494 | + * Modified for Xen. | |
23495 | + */ | |
23496 | + | |
23497 | +#include <linux/init.h> | |
23498 | +#include <linux/linkage.h> | |
23499 | +#include <linux/types.h> | |
23500 | +#include <linux/kernel.h> | |
23501 | +#include <linux/string.h> | |
23502 | +#include <linux/percpu.h> | |
23503 | +#include <linux/module.h> | |
23504 | + | |
23505 | +#include <asm/processor.h> | |
23506 | +#include <asm/proto.h> | |
23507 | +#include <asm/smp.h> | |
23508 | +#include <asm/bootsetup.h> | |
23509 | +#include <asm/setup.h> | |
23510 | +#include <asm/desc.h> | |
23511 | +#include <asm/pgtable.h> | |
23512 | +#include <asm/sections.h> | |
23513 | + | |
23514 | +unsigned long start_pfn; | |
23515 | + | |
23516 | +/* Don't add a printk in there. printk relies on the PDA which is not initialized | |
23517 | + yet. */ | |
23518 | +#if 0 | |
23519 | +static void __init clear_bss(void) | |
23520 | +{ | |
23521 | + memset(__bss_start, 0, | |
23522 | + (unsigned long) __bss_stop - (unsigned long) __bss_start); | |
23523 | +} | |
23524 | +#endif | |
23525 | + | |
23526 | +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ | |
23527 | +#define OLD_CL_MAGIC_ADDR 0x90020 | |
23528 | +#define OLD_CL_MAGIC 0xA33F | |
23529 | +#define OLD_CL_BASE_ADDR 0x90000 | |
23530 | +#define OLD_CL_OFFSET 0x90022 | |
23531 | + | |
23532 | +extern char saved_command_line[]; | |
23533 | + | |
23534 | +static void __init copy_bootdata(char *real_mode_data) | |
23535 | +{ | |
23536 | +#ifndef CONFIG_XEN | |
23537 | + int new_data; | |
23538 | + char * command_line; | |
23539 | + | |
23540 | + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); | |
23541 | + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); | |
23542 | + if (!new_data) { | |
23543 | + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { | |
23544 | + printk("so old bootloader that it does not support commandline?!\n"); | |
23545 | + return; | |
23546 | + } | |
23547 | + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; | |
23548 | + printk("old bootloader convention, maybe loadlin?\n"); | |
23549 | + } | |
23550 | + command_line = (char *) ((u64)(new_data)); | |
23551 | + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); | |
23552 | +#else | |
23553 | + int max_cmdline; | |
23554 | + | |
23555 | + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) | |
23556 | + max_cmdline = COMMAND_LINE_SIZE; | |
23557 | + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline); | |
23558 | + saved_command_line[max_cmdline-1] = '\0'; | |
23559 | +#endif | |
23560 | + printk("Bootdata ok (command line is %s)\n", saved_command_line); | |
23561 | +} | |
23562 | + | |
23563 | +static void __init setup_boot_cpu_data(void) | |
23564 | +{ | |
23565 | + unsigned int dummy, eax; | |
23566 | + | |
23567 | + /* get vendor info */ | |
23568 | + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, | |
23569 | + (unsigned int *)&boot_cpu_data.x86_vendor_id[0], | |
23570 | + (unsigned int *)&boot_cpu_data.x86_vendor_id[8], | |
23571 | + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); | |
23572 | + | |
23573 | + /* get cpu type */ | |
23574 | + cpuid(1, &eax, &dummy, &dummy, | |
23575 | + (unsigned int *) &boot_cpu_data.x86_capability); | |
23576 | + boot_cpu_data.x86 = (eax >> 8) & 0xf; | |
23577 | + boot_cpu_data.x86_model = (eax >> 4) & 0xf; | |
23578 | + boot_cpu_data.x86_mask = eax & 0xf; | |
23579 | +} | |
23580 | + | |
23581 | +#include <xen/interface/memory.h> | |
23582 | +unsigned long *machine_to_phys_mapping; | |
23583 | +EXPORT_SYMBOL(machine_to_phys_mapping); | |
23584 | +unsigned int machine_to_phys_order; | |
23585 | +EXPORT_SYMBOL(machine_to_phys_order); | |
23586 | + | |
23587 | +void __init x86_64_start_kernel(char * real_mode_data) | |
23588 | +{ | |
23589 | + struct xen_machphys_mapping mapping; | |
23590 | + unsigned long machine_to_phys_nr_ents; | |
23591 | + char *s; | |
23592 | + int i; | |
23593 | + | |
23594 | + setup_xen_features(); | |
23595 | + | |
23596 | + xen_start_info = (struct start_info *)real_mode_data; | |
23597 | + if (!xen_feature(XENFEAT_auto_translated_physmap)) | |
23598 | + phys_to_machine_mapping = | |
23599 | + (unsigned long *)xen_start_info->mfn_list; | |
23600 | + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + | |
23601 | + xen_start_info->nr_pt_frames; | |
23602 | + | |
23603 | + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START; | |
23604 | + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; | |
23605 | + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { | |
23606 | + machine_to_phys_mapping = (unsigned long *)mapping.v_start; | |
23607 | + machine_to_phys_nr_ents = mapping.max_mfn + 1; | |
23608 | + } | |
23609 | + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) | |
23610 | + machine_to_phys_order++; | |
23611 | + | |
23612 | +#if 0 | |
23613 | + for (i = 0; i < 256; i++) | |
23614 | + set_intr_gate(i, early_idt_handler); | |
23615 | + asm volatile("lidt %0" :: "m" (idt_descr)); | |
23616 | +#endif | |
23617 | + | |
23618 | + /* | |
23619 | + * This must be called really, really early: | |
23620 | + */ | |
23621 | + lockdep_init(); | |
23622 | + | |
23623 | + for (i = 0; i < NR_CPUS; i++) | |
23624 | + cpu_pda(i) = &boot_cpu_pda[i]; | |
23625 | + | |
23626 | + pda_init(0); | |
23627 | + copy_bootdata(real_mode_data); | |
23628 | +#ifdef CONFIG_SMP | |
23629 | + cpu_set(0, cpu_online_map); | |
23630 | +#endif | |
23631 | + s = strstr(saved_command_line, "earlyprintk="); | |
23632 | + if (s != NULL) | |
23633 | + setup_early_printk(strchr(s, '=') + 1); | |
23634 | +#ifdef CONFIG_NUMA | |
23635 | + s = strstr(saved_command_line, "numa="); | |
23636 | + if (s != NULL) | |
23637 | + numa_setup(s+5); | |
23638 | +#endif | |
23639 | +#ifdef CONFIG_X86_IO_APIC | |
23640 | + if (strstr(saved_command_line, "disableapic")) | |
23641 | + disable_apic = 1; | |
23642 | +#endif | |
23643 | + /* You need early console to see that */ | |
23644 | + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) | |
23645 | + panic("Kernel too big for kernel mapping\n"); | |
23646 | + | |
23647 | + setup_boot_cpu_data(); | |
23648 | + start_kernel(); | |
23649 | +} | |
23650 | Index: head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c | |
23651 | =================================================================== | |
23652 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
23653 | +++ head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c 2008-11-25 12:22:34.000000000 +0100 | |
23654 | @@ -0,0 +1,2268 @@ | |
23655 | +/* | |
23656 | + * Intel IO-APIC support for multi-Pentium hosts. | |
23657 | + * | |
23658 | + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | |
23659 | + * | |
23660 | + * Many thanks to Stig Venaas for trying out countless experimental | |
23661 | + * patches and reporting/debugging problems patiently! | |
23662 | + * | |
23663 | + * (c) 1999, Multiple IO-APIC support, developed by | |
23664 | + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | |
23665 | + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | |
23666 | + * further tested and cleaned up by Zach Brown <zab@redhat.com> | |
23667 | + * and Ingo Molnar <mingo@redhat.com> | |
23668 | + * | |
23669 | + * Fixes | |
23670 | + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | |
23671 | + * thanks to Eric Gilmore | |
23672 | + * and Rolf G. Tews | |
23673 | + * for testing these extensively | |
23674 | + * Paul Diefenbaugh : Added full ACPI support | |
23675 | + */ | |
23676 | + | |
23677 | +#include <linux/mm.h> | |
23678 | +#include <linux/interrupt.h> | |
23679 | +#include <linux/init.h> | |
23680 | +#include <linux/delay.h> | |
23681 | +#include <linux/sched.h> | |
23682 | +#include <linux/smp_lock.h> | |
23683 | +#include <linux/mc146818rtc.h> | |
23684 | +#include <linux/acpi.h> | |
23685 | +#include <linux/sysdev.h> | |
23686 | +#ifdef CONFIG_ACPI | |
23687 | +#include <acpi/acpi_bus.h> | |
23688 | +#endif | |
23689 | + | |
23690 | +#include <asm/io.h> | |
23691 | +#include <asm/smp.h> | |
23692 | +#include <asm/desc.h> | |
23693 | +#include <asm/proto.h> | |
23694 | +#include <asm/mach_apic.h> | |
23695 | +#include <asm/acpi.h> | |
23696 | +#include <asm/dma.h> | |
23697 | +#include <asm/nmi.h> | |
23698 | + | |
23699 | +#define __apicdebuginit __init | |
23700 | + | |
23701 | +int sis_apic_bug; /* not actually supported, dummy for compile */ | |
23702 | + | |
23703 | +static int no_timer_check; | |
23704 | + | |
23705 | +int disable_timer_pin_1 __initdata; | |
23706 | + | |
23707 | +#ifndef CONFIG_XEN | |
23708 | +int timer_over_8254 __initdata = 0; | |
23709 | + | |
23710 | +/* Where if anywhere is the i8259 connect in external int mode */ | |
23711 | +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | |
23712 | +#endif | |
23713 | + | |
23714 | +static DEFINE_SPINLOCK(ioapic_lock); | |
23715 | +static DEFINE_SPINLOCK(vector_lock); | |
23716 | + | |
23717 | +/* | |
23718 | + * # of IRQ routing registers | |
23719 | + */ | |
23720 | +int nr_ioapic_registers[MAX_IO_APICS]; | |
23721 | + | |
23722 | +/* | |
23723 | + * Rough estimation of how many shared IRQs there are, can | |
23724 | + * be changed anytime. | |
23725 | + */ | |
23726 | +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS | |
23727 | +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | |
23728 | + | |
23729 | +/* | |
23730 | + * This is performance-critical, we want to do it O(1) | |
23731 | + * | |
23732 | + * the indexing order of this array favors 1:1 mappings | |
23733 | + * between pins and IRQs. | |
23734 | + */ | |
23735 | + | |
23736 | +static struct irq_pin_list { | |
23737 | + short apic, pin, next; | |
23738 | +} irq_2_pin[PIN_MAP_SIZE]; | |
23739 | + | |
23740 | +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; | |
23741 | +#ifdef CONFIG_PCI_MSI | |
23742 | +#define vector_to_irq(vector) \ | |
23743 | + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) | |
23744 | +#else | |
23745 | +#define vector_to_irq(vector) (vector) | |
23746 | +#endif | |
23747 | + | |
23748 | +#ifdef CONFIG_XEN | |
23749 | + | |
23750 | +#include <xen/interface/xen.h> | |
23751 | +#include <xen/interface/physdev.h> | |
23752 | +#include <xen/evtchn.h> | |
23753 | + | |
23754 | +/* Fake i8259 */ | |
23755 | +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) | |
23756 | +#define disable_8259A_irq(_irq) ((void)0) | |
23757 | +#define i8259A_irq_pending(_irq) (0) | |
23758 | + | |
23759 | +unsigned long io_apic_irqs; | |
23760 | + | |
23761 | +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) | |
23762 | +{ | |
23763 | + struct physdev_apic apic_op; | |
23764 | + int ret; | |
23765 | + | |
23766 | + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; | |
23767 | + apic_op.reg = reg; | |
23768 | + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); | |
23769 | + if (ret) | |
23770 | + return ret; | |
23771 | + return apic_op.value; | |
23772 | +} | |
23773 | + | |
23774 | +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | |
23775 | +{ | |
23776 | + struct physdev_apic apic_op; | |
23777 | + | |
23778 | + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; | |
23779 | + apic_op.reg = reg; | |
23780 | + apic_op.value = value; | |
23781 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); | |
23782 | +} | |
23783 | + | |
23784 | +#define io_apic_read(a,r) xen_io_apic_read(a,r) | |
23785 | +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) | |
23786 | + | |
23787 | +#define clear_IO_APIC() ((void)0) | |
23788 | + | |
23789 | +#else | |
23790 | + | |
23791 | +#ifdef CONFIG_SMP | |
23792 | +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | |
23793 | +{ | |
23794 | + unsigned long flags; | |
23795 | + unsigned int dest; | |
23796 | + cpumask_t tmp; | |
23797 | + | |
23798 | + cpus_and(tmp, mask, cpu_online_map); | |
23799 | + if (cpus_empty(tmp)) | |
23800 | + tmp = TARGET_CPUS; | |
23801 | + | |
23802 | + cpus_and(mask, tmp, CPU_MASK_ALL); | |
23803 | + | |
23804 | + dest = cpu_mask_to_apicid(mask); | |
23805 | + | |
23806 | + /* | |
23807 | + * Only the high 8 bits are valid. | |
23808 | + */ | |
23809 | + dest = SET_APIC_LOGICAL_ID(dest); | |
23810 | + | |
23811 | + spin_lock_irqsave(&ioapic_lock, flags); | |
23812 | + __DO_ACTION(1, = dest, ) | |
23813 | + set_irq_info(irq, mask); | |
23814 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
23815 | +} | |
23816 | +#endif | |
23817 | + | |
23818 | +#endif /* !CONFIG_XEN */ | |
23819 | + | |
23820 | +/* | |
23821 | + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | |
23822 | + * shared ISA-space IRQs, so we have to support them. We are super | |
23823 | + * fast in the common case, and fast for shared ISA-space IRQs. | |
23824 | + */ | |
23825 | +static void add_pin_to_irq(unsigned int irq, int apic, int pin) | |
23826 | +{ | |
23827 | + static int first_free_entry = NR_IRQS; | |
23828 | + struct irq_pin_list *entry = irq_2_pin + irq; | |
23829 | + | |
23830 | + BUG_ON(irq >= NR_IRQS); | |
23831 | + while (entry->next) | |
23832 | + entry = irq_2_pin + entry->next; | |
23833 | + | |
23834 | + if (entry->pin != -1) { | |
23835 | + entry->next = first_free_entry; | |
23836 | + entry = irq_2_pin + entry->next; | |
23837 | + if (++first_free_entry >= PIN_MAP_SIZE) | |
23838 | + panic("io_apic.c: ran out of irq_2_pin entries!"); | |
23839 | + } | |
23840 | + entry->apic = apic; | |
23841 | + entry->pin = pin; | |
23842 | +} | |
23843 | + | |
23844 | +#ifndef CONFIG_XEN | |
23845 | +#define __DO_ACTION(R, ACTION, FINAL) \ | |
23846 | + \ | |
23847 | +{ \ | |
23848 | + int pin; \ | |
23849 | + struct irq_pin_list *entry = irq_2_pin + irq; \ | |
23850 | + \ | |
23851 | + BUG_ON(irq >= NR_IRQS); \ | |
23852 | + for (;;) { \ | |
23853 | + unsigned int reg; \ | |
23854 | + pin = entry->pin; \ | |
23855 | + if (pin == -1) \ | |
23856 | + break; \ | |
23857 | + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | |
23858 | + reg ACTION; \ | |
23859 | + io_apic_modify(entry->apic, reg); \ | |
23860 | + if (!entry->next) \ | |
23861 | + break; \ | |
23862 | + entry = irq_2_pin + entry->next; \ | |
23863 | + } \ | |
23864 | + FINAL; \ | |
23865 | +} | |
23866 | + | |
23867 | +#define DO_ACTION(name,R,ACTION, FINAL) \ | |
23868 | + \ | |
23869 | + static void name##_IO_APIC_irq (unsigned int irq) \ | |
23870 | + __DO_ACTION(R, ACTION, FINAL) | |
23871 | + | |
23872 | +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | |
23873 | + /* mask = 1 */ | |
23874 | +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | |
23875 | + /* mask = 0 */ | |
23876 | + | |
23877 | +static void mask_IO_APIC_irq (unsigned int irq) | |
23878 | +{ | |
23879 | + unsigned long flags; | |
23880 | + | |
23881 | + spin_lock_irqsave(&ioapic_lock, flags); | |
23882 | + __mask_IO_APIC_irq(irq); | |
23883 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
23884 | +} | |
23885 | + | |
23886 | +static void unmask_IO_APIC_irq (unsigned int irq) | |
23887 | +{ | |
23888 | + unsigned long flags; | |
23889 | + | |
23890 | + spin_lock_irqsave(&ioapic_lock, flags); | |
23891 | + __unmask_IO_APIC_irq(irq); | |
23892 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
23893 | +} | |
23894 | + | |
23895 | +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | |
23896 | +{ | |
23897 | + struct IO_APIC_route_entry entry; | |
23898 | + unsigned long flags; | |
23899 | + | |
23900 | + /* Check delivery_mode to be sure we're not clearing an SMI pin */ | |
23901 | + spin_lock_irqsave(&ioapic_lock, flags); | |
23902 | + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
23903 | + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
23904 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
23905 | + if (entry.delivery_mode == dest_SMI) | |
23906 | + return; | |
23907 | + /* | |
23908 | + * Disable it in the IO-APIC irq-routing table: | |
23909 | + */ | |
23910 | + memset(&entry, 0, sizeof(entry)); | |
23911 | + entry.mask = 1; | |
23912 | + spin_lock_irqsave(&ioapic_lock, flags); | |
23913 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); | |
23914 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); | |
23915 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
23916 | +} | |
23917 | + | |
23918 | +static void clear_IO_APIC (void) | |
23919 | +{ | |
23920 | + int apic, pin; | |
23921 | + | |
23922 | + for (apic = 0; apic < nr_ioapics; apic++) | |
23923 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | |
23924 | + clear_IO_APIC_pin(apic, pin); | |
23925 | +} | |
23926 | + | |
23927 | +#endif /* !CONFIG_XEN */ | |
23928 | + | |
23929 | +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; | |
23930 | + | |
23931 | +/* | |
23932 | + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | |
23933 | + * specific CPU-side IRQs. | |
23934 | + */ | |
23935 | + | |
23936 | +#define MAX_PIRQS 8 | |
23937 | +static int pirq_entries [MAX_PIRQS]; | |
23938 | +static int pirqs_enabled; | |
23939 | +int skip_ioapic_setup; | |
23940 | +int ioapic_force; | |
23941 | + | |
23942 | +/* dummy parsing: see setup.c */ | |
23943 | + | |
23944 | +static int __init disable_ioapic_setup(char *str) | |
23945 | +{ | |
23946 | + skip_ioapic_setup = 1; | |
23947 | + return 1; | |
23948 | +} | |
23949 | + | |
23950 | +static int __init enable_ioapic_setup(char *str) | |
23951 | +{ | |
23952 | + ioapic_force = 1; | |
23953 | + skip_ioapic_setup = 0; | |
23954 | + return 1; | |
23955 | +} | |
23956 | + | |
23957 | +__setup("noapic", disable_ioapic_setup); | |
23958 | +__setup("apic", enable_ioapic_setup); | |
23959 | + | |
23960 | +#ifndef CONFIG_XEN | |
23961 | +static int __init setup_disable_8254_timer(char *s) | |
23962 | +{ | |
23963 | + timer_over_8254 = -1; | |
23964 | + return 1; | |
23965 | +} | |
23966 | +static int __init setup_enable_8254_timer(char *s) | |
23967 | +{ | |
23968 | + timer_over_8254 = 2; | |
23969 | + return 1; | |
23970 | +} | |
23971 | + | |
23972 | +__setup("disable_8254_timer", setup_disable_8254_timer); | |
23973 | +__setup("enable_8254_timer", setup_enable_8254_timer); | |
23974 | +#endif /* !CONFIG_XEN */ | |
23975 | + | |
23976 | +#include <asm/pci-direct.h> | |
23977 | +#include <linux/pci_ids.h> | |
23978 | +#include <linux/pci.h> | |
23979 | + | |
23980 | + | |
23981 | +#ifdef CONFIG_ACPI | |
23982 | + | |
23983 | +static int nvidia_hpet_detected __initdata; | |
23984 | + | |
23985 | +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) | |
23986 | +{ | |
23987 | + nvidia_hpet_detected = 1; | |
23988 | + return 0; | |
23989 | +} | |
23990 | +#endif | |
23991 | + | |
23992 | +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC | |
23993 | + off. Check for an Nvidia or VIA PCI bridge and turn it off. | |
23994 | + Use pci direct infrastructure because this runs before the PCI subsystem. | |
23995 | + | |
23996 | + Can be overwritten with "apic" | |
23997 | + | |
23998 | + And another hack to disable the IOMMU on VIA chipsets. | |
23999 | + | |
24000 | + ... and others. Really should move this somewhere else. | |
24001 | + | |
24002 | + Kludge-O-Rama. */ | |
24003 | +void __init check_ioapic(void) | |
24004 | +{ | |
24005 | + int num,slot,func; | |
24006 | + /* Poor man's PCI discovery */ | |
24007 | + for (num = 0; num < 32; num++) { | |
24008 | + for (slot = 0; slot < 32; slot++) { | |
24009 | + for (func = 0; func < 8; func++) { | |
24010 | + u32 class; | |
24011 | + u32 vendor; | |
24012 | + u8 type; | |
24013 | + class = read_pci_config(num,slot,func, | |
24014 | + PCI_CLASS_REVISION); | |
24015 | + if (class == 0xffffffff) | |
24016 | + break; | |
24017 | + | |
24018 | + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) | |
24019 | + continue; | |
24020 | + | |
24021 | + vendor = read_pci_config(num, slot, func, | |
24022 | + PCI_VENDOR_ID); | |
24023 | + vendor &= 0xffff; | |
24024 | + switch (vendor) { | |
24025 | + case PCI_VENDOR_ID_VIA: | |
24026 | +#ifdef CONFIG_IOMMU | |
24027 | + if ((end_pfn > MAX_DMA32_PFN || | |
24028 | + force_iommu) && | |
24029 | + !iommu_aperture_allowed) { | |
24030 | + printk(KERN_INFO | |
24031 | + "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); | |
24032 | + iommu_aperture_disabled = 1; | |
24033 | + } | |
24034 | +#endif | |
24035 | + return; | |
24036 | + case PCI_VENDOR_ID_NVIDIA: | |
24037 | +#ifdef CONFIG_ACPI | |
24038 | + /* | |
24039 | + * All timer overrides on Nvidia are | |
24040 | + * wrong unless HPET is enabled. | |
24041 | + */ | |
24042 | + nvidia_hpet_detected = 0; | |
24043 | + acpi_table_parse(ACPI_HPET, | |
24044 | + nvidia_hpet_check); | |
24045 | + if (nvidia_hpet_detected == 0) { | |
24046 | + acpi_skip_timer_override = 1; | |
24047 | + printk(KERN_INFO "Nvidia board " | |
24048 | + "detected. Ignoring ACPI " | |
24049 | + "timer override.\n"); | |
24050 | + } | |
24051 | +#endif | |
24052 | + /* RED-PEN skip them on mptables too? */ | |
24053 | + return; | |
24054 | + case PCI_VENDOR_ID_ATI: | |
24055 | + | |
24056 | + /* This should be actually default, but | |
24057 | + for 2.6.16 let's do it for ATI only where | |
24058 | + it's really needed. */ | |
24059 | +#ifndef CONFIG_XEN | |
24060 | + if (timer_over_8254 == 1) { | |
24061 | + timer_over_8254 = 0; | |
24062 | + printk(KERN_INFO | |
24063 | + "ATI board detected. Disabling timer routing over 8254.\n"); | |
24064 | + } | |
24065 | +#endif | |
24066 | + return; | |
24067 | + } | |
24068 | + | |
24069 | + | |
24070 | + /* No multi-function device? */ | |
24071 | + type = read_pci_config_byte(num,slot,func, | |
24072 | + PCI_HEADER_TYPE); | |
24073 | + if (!(type & 0x80)) | |
24074 | + break; | |
24075 | + } | |
24076 | + } | |
24077 | + } | |
24078 | +} | |
24079 | + | |
24080 | +static int __init ioapic_pirq_setup(char *str) | |
24081 | +{ | |
24082 | + int i, max; | |
24083 | + int ints[MAX_PIRQS+1]; | |
24084 | + | |
24085 | + get_options(str, ARRAY_SIZE(ints), ints); | |
24086 | + | |
24087 | + for (i = 0; i < MAX_PIRQS; i++) | |
24088 | + pirq_entries[i] = -1; | |
24089 | + | |
24090 | + pirqs_enabled = 1; | |
24091 | + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); | |
24092 | + max = MAX_PIRQS; | |
24093 | + if (ints[0] < MAX_PIRQS) | |
24094 | + max = ints[0]; | |
24095 | + | |
24096 | + for (i = 0; i < max; i++) { | |
24097 | + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | |
24098 | + /* | |
24099 | + * PIRQs are mapped upside down, usually. | |
24100 | + */ | |
24101 | + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | |
24102 | + } | |
24103 | + return 1; | |
24104 | +} | |
24105 | + | |
24106 | +__setup("pirq=", ioapic_pirq_setup); | |
24107 | + | |
24108 | +/* | |
24109 | + * Find the IRQ entry number of a certain pin. | |
24110 | + */ | |
24111 | +static int find_irq_entry(int apic, int pin, int type) | |
24112 | +{ | |
24113 | + int i; | |
24114 | + | |
24115 | + for (i = 0; i < mp_irq_entries; i++) | |
24116 | + if (mp_irqs[i].mpc_irqtype == type && | |
24117 | + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | |
24118 | + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | |
24119 | + mp_irqs[i].mpc_dstirq == pin) | |
24120 | + return i; | |
24121 | + | |
24122 | + return -1; | |
24123 | +} | |
24124 | + | |
24125 | +#ifndef CONFIG_XEN | |
24126 | +/* | |
24127 | + * Find the pin to which IRQ[irq] (ISA) is connected | |
24128 | + */ | |
24129 | +static int __init find_isa_irq_pin(int irq, int type) | |
24130 | +{ | |
24131 | + int i; | |
24132 | + | |
24133 | + for (i = 0; i < mp_irq_entries; i++) { | |
24134 | + int lbus = mp_irqs[i].mpc_srcbus; | |
24135 | + | |
24136 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | |
24137 | + mp_bus_id_to_type[lbus] == MP_BUS_EISA || | |
24138 | + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && | |
24139 | + (mp_irqs[i].mpc_irqtype == type) && | |
24140 | + (mp_irqs[i].mpc_srcbusirq == irq)) | |
24141 | + | |
24142 | + return mp_irqs[i].mpc_dstirq; | |
24143 | + } | |
24144 | + return -1; | |
24145 | +} | |
24146 | + | |
24147 | +static int __init find_isa_irq_apic(int irq, int type) | |
24148 | +{ | |
24149 | + int i; | |
24150 | + | |
24151 | + for (i = 0; i < mp_irq_entries; i++) { | |
24152 | + int lbus = mp_irqs[i].mpc_srcbus; | |
24153 | + | |
24154 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | |
24155 | + mp_bus_id_to_type[lbus] == MP_BUS_EISA || | |
24156 | + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && | |
24157 | + (mp_irqs[i].mpc_irqtype == type) && | |
24158 | + (mp_irqs[i].mpc_srcbusirq == irq)) | |
24159 | + break; | |
24160 | + } | |
24161 | + if (i < mp_irq_entries) { | |
24162 | + int apic; | |
24163 | + for(apic = 0; apic < nr_ioapics; apic++) { | |
24164 | + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | |
24165 | + return apic; | |
24166 | + } | |
24167 | + } | |
24168 | + | |
24169 | + return -1; | |
24170 | +} | |
24171 | +#endif | |
24172 | + | |
24173 | +/* | |
24174 | + * Find a specific PCI IRQ entry. | |
24175 | + * Not an __init, possibly needed by modules | |
24176 | + */ | |
24177 | +static int pin_2_irq(int idx, int apic, int pin); | |
24178 | + | |
24179 | +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |
24180 | +{ | |
24181 | + int apic, i, best_guess = -1; | |
24182 | + | |
24183 | + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | |
24184 | + bus, slot, pin); | |
24185 | + if (mp_bus_id_to_pci_bus[bus] == -1) { | |
24186 | + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | |
24187 | + return -1; | |
24188 | + } | |
24189 | + for (i = 0; i < mp_irq_entries; i++) { | |
24190 | + int lbus = mp_irqs[i].mpc_srcbus; | |
24191 | + | |
24192 | + for (apic = 0; apic < nr_ioapics; apic++) | |
24193 | + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | |
24194 | + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | |
24195 | + break; | |
24196 | + | |
24197 | + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && | |
24198 | + !mp_irqs[i].mpc_irqtype && | |
24199 | + (bus == lbus) && | |
24200 | + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | |
24201 | + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | |
24202 | + | |
24203 | + if (!(apic || IO_APIC_IRQ(irq))) | |
24204 | + continue; | |
24205 | + | |
24206 | + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | |
24207 | + return irq; | |
24208 | + /* | |
24209 | + * Use the first all-but-pin matching entry as a | |
24210 | + * best-guess fuzzy result for broken mptables. | |
24211 | + */ | |
24212 | + if (best_guess < 0) | |
24213 | + best_guess = irq; | |
24214 | + } | |
24215 | + } | |
24216 | + BUG_ON(best_guess >= NR_IRQS); | |
24217 | + return best_guess; | |
24218 | +} | |
24219 | + | |
24220 | +/* | |
24221 | + * EISA Edge/Level control register, ELCR | |
24222 | + */ | |
24223 | +static int EISA_ELCR(unsigned int irq) | |
24224 | +{ | |
24225 | + if (irq < 16) { | |
24226 | + unsigned int port = 0x4d0 + (irq >> 3); | |
24227 | + return (inb(port) >> (irq & 7)) & 1; | |
24228 | + } | |
24229 | + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); | |
24230 | + return 0; | |
24231 | +} | |
24232 | + | |
24233 | +/* EISA interrupts are always polarity zero and can be edge or level | |
24234 | + * trigger depending on the ELCR value. If an interrupt is listed as | |
24235 | + * EISA conforming in the MP table, that means its trigger type must | |
24236 | + * be read in from the ELCR */ | |
24237 | + | |
24238 | +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) | |
24239 | +#define default_EISA_polarity(idx) (0) | |
24240 | + | |
24241 | +/* ISA interrupts are always polarity zero edge triggered, | |
24242 | + * when listed as conforming in the MP table. */ | |
24243 | + | |
24244 | +#define default_ISA_trigger(idx) (0) | |
24245 | +#define default_ISA_polarity(idx) (0) | |
24246 | + | |
24247 | +/* PCI interrupts are always polarity one level triggered, | |
24248 | + * when listed as conforming in the MP table. */ | |
24249 | + | |
24250 | +#define default_PCI_trigger(idx) (1) | |
24251 | +#define default_PCI_polarity(idx) (1) | |
24252 | + | |
24253 | +/* MCA interrupts are always polarity zero level triggered, | |
24254 | + * when listed as conforming in the MP table. */ | |
24255 | + | |
24256 | +#define default_MCA_trigger(idx) (1) | |
24257 | +#define default_MCA_polarity(idx) (0) | |
24258 | + | |
24259 | +static int __init MPBIOS_polarity(int idx) | |
24260 | +{ | |
24261 | + int bus = mp_irqs[idx].mpc_srcbus; | |
24262 | + int polarity; | |
24263 | + | |
24264 | + /* | |
24265 | + * Determine IRQ line polarity (high active or low active): | |
24266 | + */ | |
24267 | + switch (mp_irqs[idx].mpc_irqflag & 3) | |
24268 | + { | |
24269 | + case 0: /* conforms, ie. bus-type dependent polarity */ | |
24270 | + { | |
24271 | + switch (mp_bus_id_to_type[bus]) | |
24272 | + { | |
24273 | + case MP_BUS_ISA: /* ISA pin */ | |
24274 | + { | |
24275 | + polarity = default_ISA_polarity(idx); | |
24276 | + break; | |
24277 | + } | |
24278 | + case MP_BUS_EISA: /* EISA pin */ | |
24279 | + { | |
24280 | + polarity = default_EISA_polarity(idx); | |
24281 | + break; | |
24282 | + } | |
24283 | + case MP_BUS_PCI: /* PCI pin */ | |
24284 | + { | |
24285 | + polarity = default_PCI_polarity(idx); | |
24286 | + break; | |
24287 | + } | |
24288 | + case MP_BUS_MCA: /* MCA pin */ | |
24289 | + { | |
24290 | + polarity = default_MCA_polarity(idx); | |
24291 | + break; | |
24292 | + } | |
24293 | + default: | |
24294 | + { | |
24295 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24296 | + polarity = 1; | |
24297 | + break; | |
24298 | + } | |
24299 | + } | |
24300 | + break; | |
24301 | + } | |
24302 | + case 1: /* high active */ | |
24303 | + { | |
24304 | + polarity = 0; | |
24305 | + break; | |
24306 | + } | |
24307 | + case 2: /* reserved */ | |
24308 | + { | |
24309 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24310 | + polarity = 1; | |
24311 | + break; | |
24312 | + } | |
24313 | + case 3: /* low active */ | |
24314 | + { | |
24315 | + polarity = 1; | |
24316 | + break; | |
24317 | + } | |
24318 | + default: /* invalid */ | |
24319 | + { | |
24320 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24321 | + polarity = 1; | |
24322 | + break; | |
24323 | + } | |
24324 | + } | |
24325 | + return polarity; | |
24326 | +} | |
24327 | + | |
24328 | +static int MPBIOS_trigger(int idx) | |
24329 | +{ | |
24330 | + int bus = mp_irqs[idx].mpc_srcbus; | |
24331 | + int trigger; | |
24332 | + | |
24333 | + /* | |
24334 | + * Determine IRQ trigger mode (edge or level sensitive): | |
24335 | + */ | |
24336 | + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | |
24337 | + { | |
24338 | + case 0: /* conforms, ie. bus-type dependent */ | |
24339 | + { | |
24340 | + switch (mp_bus_id_to_type[bus]) | |
24341 | + { | |
24342 | + case MP_BUS_ISA: /* ISA pin */ | |
24343 | + { | |
24344 | + trigger = default_ISA_trigger(idx); | |
24345 | + break; | |
24346 | + } | |
24347 | + case MP_BUS_EISA: /* EISA pin */ | |
24348 | + { | |
24349 | + trigger = default_EISA_trigger(idx); | |
24350 | + break; | |
24351 | + } | |
24352 | + case MP_BUS_PCI: /* PCI pin */ | |
24353 | + { | |
24354 | + trigger = default_PCI_trigger(idx); | |
24355 | + break; | |
24356 | + } | |
24357 | + case MP_BUS_MCA: /* MCA pin */ | |
24358 | + { | |
24359 | + trigger = default_MCA_trigger(idx); | |
24360 | + break; | |
24361 | + } | |
24362 | + default: | |
24363 | + { | |
24364 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24365 | + trigger = 1; | |
24366 | + break; | |
24367 | + } | |
24368 | + } | |
24369 | + break; | |
24370 | + } | |
24371 | + case 1: /* edge */ | |
24372 | + { | |
24373 | + trigger = 0; | |
24374 | + break; | |
24375 | + } | |
24376 | + case 2: /* reserved */ | |
24377 | + { | |
24378 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24379 | + trigger = 1; | |
24380 | + break; | |
24381 | + } | |
24382 | + case 3: /* level */ | |
24383 | + { | |
24384 | + trigger = 1; | |
24385 | + break; | |
24386 | + } | |
24387 | + default: /* invalid */ | |
24388 | + { | |
24389 | + printk(KERN_WARNING "broken BIOS!!\n"); | |
24390 | + trigger = 0; | |
24391 | + break; | |
24392 | + } | |
24393 | + } | |
24394 | + return trigger; | |
24395 | +} | |
24396 | + | |
24397 | +static inline int irq_polarity(int idx) | |
24398 | +{ | |
24399 | + return MPBIOS_polarity(idx); | |
24400 | +} | |
24401 | + | |
24402 | +static inline int irq_trigger(int idx) | |
24403 | +{ | |
24404 | + return MPBIOS_trigger(idx); | |
24405 | +} | |
24406 | + | |
24407 | +static int next_irq = 16; | |
24408 | + | |
24409 | +/* | |
24410 | + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ | |
24411 | + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number | |
24412 | + * from ACPI, which can reach 800 in large boxen. | |
24413 | + * | |
24414 | + * Compact the sparse GSI space into a sequential IRQ series and reuse | |
24415 | + * vectors if possible. | |
24416 | + */ | |
24417 | +int gsi_irq_sharing(int gsi) | |
24418 | +{ | |
24419 | + int i, tries, vector; | |
24420 | + | |
24421 | + BUG_ON(gsi >= NR_IRQ_VECTORS); | |
24422 | + | |
24423 | + if (platform_legacy_irq(gsi)) | |
24424 | + return gsi; | |
24425 | + | |
24426 | + if (gsi_2_irq[gsi] != 0xFF) | |
24427 | + return (int)gsi_2_irq[gsi]; | |
24428 | + | |
24429 | + tries = NR_IRQS; | |
24430 | + try_again: | |
24431 | + vector = assign_irq_vector(gsi); | |
24432 | + | |
24433 | + /* | |
24434 | + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous | |
24435 | + * use of vector and if found, return that IRQ. However, we never want | |
24436 | + * to share legacy IRQs, which usually have a different trigger mode | |
24437 | + * than PCI. | |
24438 | + */ | |
24439 | + for (i = 0; i < NR_IRQS; i++) | |
24440 | + if (IO_APIC_VECTOR(i) == vector) | |
24441 | + break; | |
24442 | + if (platform_legacy_irq(i)) { | |
24443 | + if (--tries >= 0) { | |
24444 | + IO_APIC_VECTOR(i) = 0; | |
24445 | + goto try_again; | |
24446 | + } | |
24447 | + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); | |
24448 | + } | |
24449 | + if (i < NR_IRQS) { | |
24450 | + gsi_2_irq[gsi] = i; | |
24451 | + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", | |
24452 | + gsi, vector, i); | |
24453 | + return i; | |
24454 | + } | |
24455 | + | |
24456 | + i = next_irq++; | |
24457 | + BUG_ON(i >= NR_IRQS); | |
24458 | + gsi_2_irq[gsi] = i; | |
24459 | + IO_APIC_VECTOR(i) = vector; | |
24460 | + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", | |
24461 | + gsi, vector, i); | |
24462 | + return i; | |
24463 | +} | |
24464 | + | |
24465 | +static int pin_2_irq(int idx, int apic, int pin) | |
24466 | +{ | |
24467 | + int irq, i; | |
24468 | + int bus = mp_irqs[idx].mpc_srcbus; | |
24469 | + | |
24470 | + /* | |
24471 | + * Debugging check, we are in big trouble if this message pops up! | |
24472 | + */ | |
24473 | + if (mp_irqs[idx].mpc_dstirq != pin) | |
24474 | + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | |
24475 | + | |
24476 | + switch (mp_bus_id_to_type[bus]) | |
24477 | + { | |
24478 | + case MP_BUS_ISA: /* ISA pin */ | |
24479 | + case MP_BUS_EISA: | |
24480 | + case MP_BUS_MCA: | |
24481 | + { | |
24482 | + irq = mp_irqs[idx].mpc_srcbusirq; | |
24483 | + break; | |
24484 | + } | |
24485 | + case MP_BUS_PCI: /* PCI pin */ | |
24486 | + { | |
24487 | + /* | |
24488 | + * PCI IRQs are mapped in order | |
24489 | + */ | |
24490 | + i = irq = 0; | |
24491 | + while (i < apic) | |
24492 | + irq += nr_ioapic_registers[i++]; | |
24493 | + irq += pin; | |
24494 | + irq = gsi_irq_sharing(irq); | |
24495 | + break; | |
24496 | + } | |
24497 | + default: | |
24498 | + { | |
24499 | + printk(KERN_ERR "unknown bus type %d.\n",bus); | |
24500 | + irq = 0; | |
24501 | + break; | |
24502 | + } | |
24503 | + } | |
24504 | + BUG_ON(irq >= NR_IRQS); | |
24505 | + | |
24506 | + /* | |
24507 | + * PCI IRQ command line redirection. Yes, limits are hardcoded. | |
24508 | + */ | |
24509 | + if ((pin >= 16) && (pin <= 23)) { | |
24510 | + if (pirq_entries[pin-16] != -1) { | |
24511 | + if (!pirq_entries[pin-16]) { | |
24512 | + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); | |
24513 | + } else { | |
24514 | + irq = pirq_entries[pin-16]; | |
24515 | + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", | |
24516 | + pin-16, irq); | |
24517 | + } | |
24518 | + } | |
24519 | + } | |
24520 | + BUG_ON(irq >= NR_IRQS); | |
24521 | + return irq; | |
24522 | +} | |
24523 | + | |
24524 | +static inline int IO_APIC_irq_trigger(int irq) | |
24525 | +{ | |
24526 | + int apic, idx, pin; | |
24527 | + | |
24528 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
24529 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
24530 | + idx = find_irq_entry(apic,pin,mp_INT); | |
24531 | + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | |
24532 | + return irq_trigger(idx); | |
24533 | + } | |
24534 | + } | |
24535 | + /* | |
24536 | + * nonexistent IRQs are edge default | |
24537 | + */ | |
24538 | + return 0; | |
24539 | +} | |
24540 | + | |
24541 | +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ | |
24542 | +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; | |
24543 | + | |
24544 | +int assign_irq_vector(int irq) | |
24545 | +{ | |
24546 | + unsigned long flags; | |
24547 | + int vector; | |
24548 | + struct physdev_irq irq_op; | |
24549 | + | |
24550 | + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); | |
24551 | + | |
24552 | + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS) | |
24553 | + return -EINVAL; | |
24554 | + | |
24555 | + spin_lock_irqsave(&vector_lock, flags); | |
24556 | + | |
24557 | + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { | |
24558 | + spin_unlock_irqrestore(&vector_lock, flags); | |
24559 | + return IO_APIC_VECTOR(irq); | |
24560 | + } | |
24561 | + | |
24562 | + irq_op.irq = irq; | |
24563 | + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { | |
24564 | + spin_unlock_irqrestore(&vector_lock, flags); | |
24565 | + return -ENOSPC; | |
24566 | + } | |
24567 | + | |
24568 | + vector = irq_op.vector; | |
24569 | + vector_irq[vector] = irq; | |
24570 | + if (irq != AUTO_ASSIGN) | |
24571 | + IO_APIC_VECTOR(irq) = vector; | |
24572 | + | |
24573 | + spin_unlock_irqrestore(&vector_lock, flags); | |
24574 | + | |
24575 | + return vector; | |
24576 | +} | |
24577 | + | |
24578 | +extern void (*interrupt[NR_IRQS])(void); | |
24579 | +#ifndef CONFIG_XEN | |
24580 | +static struct hw_interrupt_type ioapic_level_type; | |
24581 | +static struct hw_interrupt_type ioapic_edge_type; | |
24582 | + | |
24583 | +#define IOAPIC_AUTO -1 | |
24584 | +#define IOAPIC_EDGE 0 | |
24585 | +#define IOAPIC_LEVEL 1 | |
24586 | + | |
24587 | +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) | |
24588 | +{ | |
24589 | + unsigned idx; | |
24590 | + | |
24591 | + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; | |
24592 | + | |
24593 | + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | |
24594 | + trigger == IOAPIC_LEVEL) | |
24595 | + irq_desc[idx].chip = &ioapic_level_type; | |
24596 | + else | |
24597 | + irq_desc[idx].chip = &ioapic_edge_type; | |
24598 | + set_intr_gate(vector, interrupt[idx]); | |
24599 | +} | |
24600 | +#else | |
24601 | +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) | |
24602 | +#endif /* !CONFIG_XEN */ | |
24603 | + | |
24604 | +static void __init setup_IO_APIC_irqs(void) | |
24605 | +{ | |
24606 | + struct IO_APIC_route_entry entry; | |
24607 | + int apic, pin, idx, irq, first_notcon = 1, vector; | |
24608 | + unsigned long flags; | |
24609 | + | |
24610 | + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | |
24611 | + | |
24612 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
24613 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
24614 | + | |
24615 | + /* | |
24616 | + * add it to the IO-APIC irq-routing table: | |
24617 | + */ | |
24618 | + memset(&entry,0,sizeof(entry)); | |
24619 | + | |
24620 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
24621 | + entry.dest_mode = INT_DEST_MODE; | |
24622 | + entry.mask = 0; /* enable IRQ */ | |
24623 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
24624 | + | |
24625 | + idx = find_irq_entry(apic,pin,mp_INT); | |
24626 | + if (idx == -1) { | |
24627 | + if (first_notcon) { | |
24628 | + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | |
24629 | + first_notcon = 0; | |
24630 | + } else | |
24631 | + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | |
24632 | + continue; | |
24633 | + } | |
24634 | + | |
24635 | + entry.trigger = irq_trigger(idx); | |
24636 | + entry.polarity = irq_polarity(idx); | |
24637 | + | |
24638 | + if (irq_trigger(idx)) { | |
24639 | + entry.trigger = 1; | |
24640 | + entry.mask = 1; | |
24641 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
24642 | + } | |
24643 | + | |
24644 | + irq = pin_2_irq(idx, apic, pin); | |
24645 | + add_pin_to_irq(irq, apic, pin); | |
24646 | + | |
24647 | + if (/* !apic && */ !IO_APIC_IRQ(irq)) | |
24648 | + continue; | |
24649 | + | |
24650 | + if (IO_APIC_IRQ(irq)) { | |
24651 | + vector = assign_irq_vector(irq); | |
24652 | + entry.vector = vector; | |
24653 | + | |
24654 | + ioapic_register_intr(irq, vector, IOAPIC_AUTO); | |
24655 | + if (!apic && (irq < 16)) | |
24656 | + disable_8259A_irq(irq); | |
24657 | + } | |
24658 | + spin_lock_irqsave(&ioapic_lock, flags); | |
24659 | + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | |
24660 | + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | |
24661 | + set_native_irq_info(irq, TARGET_CPUS); | |
24662 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
24663 | + } | |
24664 | + } | |
24665 | + | |
24666 | + if (!first_notcon) | |
24667 | + apic_printk(APIC_VERBOSE," not connected.\n"); | |
24668 | +} | |
24669 | + | |
24670 | +#ifndef CONFIG_XEN | |
24671 | +/* | |
24672 | + * Set up the 8259A-master output pin as broadcast to all | |
24673 | + * CPUs. | |
24674 | + */ | |
24675 | +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | |
24676 | +{ | |
24677 | + struct IO_APIC_route_entry entry; | |
24678 | + unsigned long flags; | |
24679 | + | |
24680 | + memset(&entry,0,sizeof(entry)); | |
24681 | + | |
24682 | + disable_8259A_irq(0); | |
24683 | + | |
24684 | + /* mask LVT0 */ | |
24685 | + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | |
24686 | + | |
24687 | + /* | |
24688 | + * We use logical delivery to get the timer IRQ | |
24689 | + * to the first CPU. | |
24690 | + */ | |
24691 | + entry.dest_mode = INT_DEST_MODE; | |
24692 | + entry.mask = 0; /* unmask IRQ now */ | |
24693 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
24694 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
24695 | + entry.polarity = 0; | |
24696 | + entry.trigger = 0; | |
24697 | + entry.vector = vector; | |
24698 | + | |
24699 | + /* | |
24700 | + * The timer IRQ doesn't have to know that behind the | |
24701 | + * scene we have a 8259A-master in AEOI mode ... | |
24702 | + */ | |
24703 | + irq_desc[0].chip = &ioapic_edge_type; | |
24704 | + | |
24705 | + /* | |
24706 | + * Add it to the IO-APIC irq-routing table: | |
24707 | + */ | |
24708 | + spin_lock_irqsave(&ioapic_lock, flags); | |
24709 | + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | |
24710 | + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | |
24711 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
24712 | + | |
24713 | + enable_8259A_irq(0); | |
24714 | +} | |
24715 | + | |
24716 | +void __init UNEXPECTED_IO_APIC(void) | |
24717 | +{ | |
24718 | +} | |
24719 | + | |
24720 | +void __apicdebuginit print_IO_APIC(void) | |
24721 | +{ | |
24722 | + int apic, i; | |
24723 | + union IO_APIC_reg_00 reg_00; | |
24724 | + union IO_APIC_reg_01 reg_01; | |
24725 | + union IO_APIC_reg_02 reg_02; | |
24726 | + unsigned long flags; | |
24727 | + | |
24728 | + if (apic_verbosity == APIC_QUIET) | |
24729 | + return; | |
24730 | + | |
24731 | + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | |
24732 | + for (i = 0; i < nr_ioapics; i++) | |
24733 | + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | |
24734 | + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | |
24735 | + | |
24736 | + /* | |
24737 | + * We are a bit conservative about what we expect. We have to | |
24738 | + * know about every hardware change ASAP. | |
24739 | + */ | |
24740 | + printk(KERN_INFO "testing the IO APIC.......................\n"); | |
24741 | + | |
24742 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
24743 | + | |
24744 | + spin_lock_irqsave(&ioapic_lock, flags); | |
24745 | + reg_00.raw = io_apic_read(apic, 0); | |
24746 | + reg_01.raw = io_apic_read(apic, 1); | |
24747 | + if (reg_01.bits.version >= 0x10) | |
24748 | + reg_02.raw = io_apic_read(apic, 2); | |
24749 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
24750 | + | |
24751 | + printk("\n"); | |
24752 | + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | |
24753 | + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | |
24754 | + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | |
24755 | + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) | |
24756 | + UNEXPECTED_IO_APIC(); | |
24757 | + | |
24758 | + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); | |
24759 | + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | |
24760 | + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ | |
24761 | + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ | |
24762 | + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ | |
24763 | + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ | |
24764 | + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ | |
24765 | + (reg_01.bits.entries != 0x2E) && | |
24766 | + (reg_01.bits.entries != 0x3F) && | |
24767 | + (reg_01.bits.entries != 0x03) | |
24768 | + ) | |
24769 | + UNEXPECTED_IO_APIC(); | |
24770 | + | |
24771 | + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | |
24772 | + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | |
24773 | + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ | |
24774 | + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ | |
24775 | + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ | |
24776 | + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ | |
24777 | + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ | |
24778 | + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ | |
24779 | + ) | |
24780 | + UNEXPECTED_IO_APIC(); | |
24781 | + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) | |
24782 | + UNEXPECTED_IO_APIC(); | |
24783 | + | |
24784 | + if (reg_01.bits.version >= 0x10) { | |
24785 | + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | |
24786 | + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | |
24787 | + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) | |
24788 | + UNEXPECTED_IO_APIC(); | |
24789 | + } | |
24790 | + | |
24791 | + printk(KERN_DEBUG ".... IRQ redirection table:\n"); | |
24792 | + | |
24793 | + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | |
24794 | + " Stat Dest Deli Vect: \n"); | |
24795 | + | |
24796 | + for (i = 0; i <= reg_01.bits.entries; i++) { | |
24797 | + struct IO_APIC_route_entry entry; | |
24798 | + | |
24799 | + spin_lock_irqsave(&ioapic_lock, flags); | |
24800 | + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); | |
24801 | + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); | |
24802 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
24803 | + | |
24804 | + printk(KERN_DEBUG " %02x %03X %02X ", | |
24805 | + i, | |
24806 | + entry.dest.logical.logical_dest, | |
24807 | + entry.dest.physical.physical_dest | |
24808 | + ); | |
24809 | + | |
24810 | + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | |
24811 | + entry.mask, | |
24812 | + entry.trigger, | |
24813 | + entry.irr, | |
24814 | + entry.polarity, | |
24815 | + entry.delivery_status, | |
24816 | + entry.dest_mode, | |
24817 | + entry.delivery_mode, | |
24818 | + entry.vector | |
24819 | + ); | |
24820 | + } | |
24821 | + } | |
24822 | + if (use_pci_vector()) | |
24823 | + printk(KERN_INFO "Using vector-based indexing\n"); | |
24824 | + printk(KERN_DEBUG "IRQ to pin mappings:\n"); | |
24825 | + for (i = 0; i < NR_IRQS; i++) { | |
24826 | + struct irq_pin_list *entry = irq_2_pin + i; | |
24827 | + if (entry->pin < 0) | |
24828 | + continue; | |
24829 | + if (use_pci_vector() && !platform_legacy_irq(i)) | |
24830 | + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); | |
24831 | + else | |
24832 | + printk(KERN_DEBUG "IRQ%d ", i); | |
24833 | + for (;;) { | |
24834 | + printk("-> %d:%d", entry->apic, entry->pin); | |
24835 | + if (!entry->next) | |
24836 | + break; | |
24837 | + entry = irq_2_pin + entry->next; | |
24838 | + } | |
24839 | + printk("\n"); | |
24840 | + } | |
24841 | + | |
24842 | + printk(KERN_INFO ".................................... done.\n"); | |
24843 | + | |
24844 | + return; | |
24845 | +} | |
24846 | + | |
24847 | +static __apicdebuginit void print_APIC_bitfield (int base) | |
24848 | +{ | |
24849 | + unsigned int v; | |
24850 | + int i, j; | |
24851 | + | |
24852 | + if (apic_verbosity == APIC_QUIET) | |
24853 | + return; | |
24854 | + | |
24855 | + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | |
24856 | + for (i = 0; i < 8; i++) { | |
24857 | + v = apic_read(base + i*0x10); | |
24858 | + for (j = 0; j < 32; j++) { | |
24859 | + if (v & (1<<j)) | |
24860 | + printk("1"); | |
24861 | + else | |
24862 | + printk("0"); | |
24863 | + } | |
24864 | + printk("\n"); | |
24865 | + } | |
24866 | +} | |
24867 | + | |
24868 | +void __apicdebuginit print_local_APIC(void * dummy) | |
24869 | +{ | |
24870 | + unsigned int v, ver, maxlvt; | |
24871 | + | |
24872 | + if (apic_verbosity == APIC_QUIET) | |
24873 | + return; | |
24874 | + | |
24875 | + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | |
24876 | + smp_processor_id(), hard_smp_processor_id()); | |
24877 | + v = apic_read(APIC_ID); | |
24878 | + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | |
24879 | + v = apic_read(APIC_LVR); | |
24880 | + printk(KERN_INFO "... APIC VERSION: %08x\n", v); | |
24881 | + ver = GET_APIC_VERSION(v); | |
24882 | + maxlvt = get_maxlvt(); | |
24883 | + | |
24884 | + v = apic_read(APIC_TASKPRI); | |
24885 | + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | |
24886 | + | |
24887 | + v = apic_read(APIC_ARBPRI); | |
24888 | + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | |
24889 | + v & APIC_ARBPRI_MASK); | |
24890 | + v = apic_read(APIC_PROCPRI); | |
24891 | + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | |
24892 | + | |
24893 | + v = apic_read(APIC_EOI); | |
24894 | + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | |
24895 | + v = apic_read(APIC_RRR); | |
24896 | + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | |
24897 | + v = apic_read(APIC_LDR); | |
24898 | + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | |
24899 | + v = apic_read(APIC_DFR); | |
24900 | + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | |
24901 | + v = apic_read(APIC_SPIV); | |
24902 | + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | |
24903 | + | |
24904 | + printk(KERN_DEBUG "... APIC ISR field:\n"); | |
24905 | + print_APIC_bitfield(APIC_ISR); | |
24906 | + printk(KERN_DEBUG "... APIC TMR field:\n"); | |
24907 | + print_APIC_bitfield(APIC_TMR); | |
24908 | + printk(KERN_DEBUG "... APIC IRR field:\n"); | |
24909 | + print_APIC_bitfield(APIC_IRR); | |
24910 | + | |
24911 | + v = apic_read(APIC_ESR); | |
24912 | + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | |
24913 | + | |
24914 | + v = apic_read(APIC_ICR); | |
24915 | + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | |
24916 | + v = apic_read(APIC_ICR2); | |
24917 | + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | |
24918 | + | |
24919 | + v = apic_read(APIC_LVTT); | |
24920 | + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | |
24921 | + | |
24922 | + if (maxlvt > 3) { /* PC is LVT#4. */ | |
24923 | + v = apic_read(APIC_LVTPC); | |
24924 | + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | |
24925 | + } | |
24926 | + v = apic_read(APIC_LVT0); | |
24927 | + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | |
24928 | + v = apic_read(APIC_LVT1); | |
24929 | + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | |
24930 | + | |
24931 | + if (maxlvt > 2) { /* ERR is LVT#3. */ | |
24932 | + v = apic_read(APIC_LVTERR); | |
24933 | + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | |
24934 | + } | |
24935 | + | |
24936 | + v = apic_read(APIC_TMICT); | |
24937 | + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | |
24938 | + v = apic_read(APIC_TMCCT); | |
24939 | + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | |
24940 | + v = apic_read(APIC_TDCR); | |
24941 | + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | |
24942 | + printk("\n"); | |
24943 | +} | |
24944 | + | |
24945 | +void print_all_local_APICs (void) | |
24946 | +{ | |
24947 | + on_each_cpu(print_local_APIC, NULL, 1, 1); | |
24948 | +} | |
24949 | + | |
24950 | +void __apicdebuginit print_PIC(void) | |
24951 | +{ | |
24952 | + unsigned int v; | |
24953 | + unsigned long flags; | |
24954 | + | |
24955 | + if (apic_verbosity == APIC_QUIET) | |
24956 | + return; | |
24957 | + | |
24958 | + printk(KERN_DEBUG "\nprinting PIC contents\n"); | |
24959 | + | |
24960 | + spin_lock_irqsave(&i8259A_lock, flags); | |
24961 | + | |
24962 | + v = inb(0xa1) << 8 | inb(0x21); | |
24963 | + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | |
24964 | + | |
24965 | + v = inb(0xa0) << 8 | inb(0x20); | |
24966 | + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | |
24967 | + | |
24968 | + outb(0x0b,0xa0); | |
24969 | + outb(0x0b,0x20); | |
24970 | + v = inb(0xa0) << 8 | inb(0x20); | |
24971 | + outb(0x0a,0xa0); | |
24972 | + outb(0x0a,0x20); | |
24973 | + | |
24974 | + spin_unlock_irqrestore(&i8259A_lock, flags); | |
24975 | + | |
24976 | + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | |
24977 | + | |
24978 | + v = inb(0x4d1) << 8 | inb(0x4d0); | |
24979 | + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | |
24980 | +} | |
24981 | +#endif /* !CONFIG_XEN */ | |
24982 | + | |
24983 | +static void __init enable_IO_APIC(void) | |
24984 | +{ | |
24985 | + union IO_APIC_reg_01 reg_01; | |
24986 | +#ifndef CONFIG_XEN | |
24987 | + int i8259_apic, i8259_pin; | |
24988 | +#endif | |
24989 | + int i, apic; | |
24990 | + unsigned long flags; | |
24991 | + | |
24992 | + for (i = 0; i < PIN_MAP_SIZE; i++) { | |
24993 | + irq_2_pin[i].pin = -1; | |
24994 | + irq_2_pin[i].next = 0; | |
24995 | + } | |
24996 | + if (!pirqs_enabled) | |
24997 | + for (i = 0; i < MAX_PIRQS; i++) | |
24998 | + pirq_entries[i] = -1; | |
24999 | + | |
25000 | + /* | |
25001 | + * The number of IO-APIC IRQ registers (== #pins): | |
25002 | + */ | |
25003 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
25004 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25005 | + reg_01.raw = io_apic_read(apic, 1); | |
25006 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25007 | + nr_ioapic_registers[apic] = reg_01.bits.entries+1; | |
25008 | + } | |
25009 | +#ifndef CONFIG_XEN | |
25010 | + for(apic = 0; apic < nr_ioapics; apic++) { | |
25011 | + int pin; | |
25012 | + /* See if any of the pins is in ExtINT mode */ | |
25013 | + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | |
25014 | + struct IO_APIC_route_entry entry; | |
25015 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25016 | + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
25017 | + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
25018 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25019 | + | |
25020 | + | |
25021 | + /* If the interrupt line is enabled and in ExtInt mode | |
25022 | + * I have found the pin where the i8259 is connected. | |
25023 | + */ | |
25024 | + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { | |
25025 | + ioapic_i8259.apic = apic; | |
25026 | + ioapic_i8259.pin = pin; | |
25027 | + goto found_i8259; | |
25028 | + } | |
25029 | + } | |
25030 | + } | |
25031 | + found_i8259: | |
25032 | + /* Look to see what if the MP table has reported the ExtINT */ | |
25033 | + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | |
25034 | + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | |
25035 | + /* Trust the MP table if nothing is setup in the hardware */ | |
25036 | + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { | |
25037 | + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); | |
25038 | + ioapic_i8259.pin = i8259_pin; | |
25039 | + ioapic_i8259.apic = i8259_apic; | |
25040 | + } | |
25041 | + /* Complain if the MP table and the hardware disagree */ | |
25042 | + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && | |
25043 | + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) | |
25044 | + { | |
25045 | + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); | |
25046 | + } | |
25047 | +#endif | |
25048 | + | |
25049 | + /* | |
25050 | + * Do not trust the IO-APIC being empty at bootup | |
25051 | + */ | |
25052 | + clear_IO_APIC(); | |
25053 | +} | |
25054 | + | |
25055 | +/* | |
25056 | + * Not an __init, needed by the reboot code | |
25057 | + */ | |
25058 | +void disable_IO_APIC(void) | |
25059 | +{ | |
25060 | + /* | |
25061 | + * Clear the IO-APIC before rebooting: | |
25062 | + */ | |
25063 | + clear_IO_APIC(); | |
25064 | + | |
25065 | +#ifndef CONFIG_XEN | |
25066 | + /* | |
25067 | + * If the i8259 is routed through an IOAPIC | |
25068 | + * Put that IOAPIC in virtual wire mode | |
25069 | + * so legacy interrupts can be delivered. | |
25070 | + */ | |
25071 | + if (ioapic_i8259.pin != -1) { | |
25072 | + struct IO_APIC_route_entry entry; | |
25073 | + unsigned long flags; | |
25074 | + | |
25075 | + memset(&entry, 0, sizeof(entry)); | |
25076 | + entry.mask = 0; /* Enabled */ | |
25077 | + entry.trigger = 0; /* Edge */ | |
25078 | + entry.irr = 0; | |
25079 | + entry.polarity = 0; /* High */ | |
25080 | + entry.delivery_status = 0; | |
25081 | + entry.dest_mode = 0; /* Physical */ | |
25082 | + entry.delivery_mode = dest_ExtINT; /* ExtInt */ | |
25083 | + entry.vector = 0; | |
25084 | + entry.dest.physical.physical_dest = | |
25085 | + GET_APIC_ID(apic_read(APIC_ID)); | |
25086 | + | |
25087 | + /* | |
25088 | + * Add it to the IO-APIC irq-routing table: | |
25089 | + */ | |
25090 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25091 | + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, | |
25092 | + *(((int *)&entry)+1)); | |
25093 | + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, | |
25094 | + *(((int *)&entry)+0)); | |
25095 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25096 | + } | |
25097 | + | |
25098 | + disconnect_bsp_APIC(ioapic_i8259.pin != -1); | |
25099 | +#endif | |
25100 | +} | |
25101 | + | |
25102 | +/* | |
25103 | + * function to set the IO-APIC physical IDs based on the | |
25104 | + * values stored in the MPC table. | |
25105 | + * | |
25106 | + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | |
25107 | + */ | |
25108 | + | |
25109 | +#ifndef CONFIG_XEN | |
25110 | +static void __init setup_ioapic_ids_from_mpc (void) | |
25111 | +{ | |
25112 | + union IO_APIC_reg_00 reg_00; | |
25113 | + int apic; | |
25114 | + int i; | |
25115 | + unsigned char old_id; | |
25116 | + unsigned long flags; | |
25117 | + | |
25118 | + /* | |
25119 | + * Set the IOAPIC ID to the value stored in the MPC table. | |
25120 | + */ | |
25121 | + for (apic = 0; apic < nr_ioapics; apic++) { | |
25122 | + | |
25123 | + /* Read the register 0 value */ | |
25124 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25125 | + reg_00.raw = io_apic_read(apic, 0); | |
25126 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25127 | + | |
25128 | + old_id = mp_ioapics[apic].mpc_apicid; | |
25129 | + | |
25130 | + | |
25131 | + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); | |
25132 | + | |
25133 | + | |
25134 | + /* | |
25135 | + * We need to adjust the IRQ routing table | |
25136 | + * if the ID changed. | |
25137 | + */ | |
25138 | + if (old_id != mp_ioapics[apic].mpc_apicid) | |
25139 | + for (i = 0; i < mp_irq_entries; i++) | |
25140 | + if (mp_irqs[i].mpc_dstapic == old_id) | |
25141 | + mp_irqs[i].mpc_dstapic | |
25142 | + = mp_ioapics[apic].mpc_apicid; | |
25143 | + | |
25144 | + /* | |
25145 | + * Read the right value from the MPC table and | |
25146 | + * write it into the ID register. | |
25147 | + */ | |
25148 | + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", | |
25149 | + mp_ioapics[apic].mpc_apicid); | |
25150 | + | |
25151 | + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | |
25152 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25153 | + io_apic_write(apic, 0, reg_00.raw); | |
25154 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25155 | + | |
25156 | + /* | |
25157 | + * Sanity check | |
25158 | + */ | |
25159 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25160 | + reg_00.raw = io_apic_read(apic, 0); | |
25161 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25162 | + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | |
25163 | + printk("could not set ID!\n"); | |
25164 | + else | |
25165 | + apic_printk(APIC_VERBOSE," ok.\n"); | |
25166 | + } | |
25167 | +} | |
25168 | +#else | |
25169 | +static void __init setup_ioapic_ids_from_mpc(void) { } | |
25170 | +#endif | |
25171 | + | |
25172 | +/* | |
25173 | + * There is a nasty bug in some older SMP boards, their mptable lies | |
25174 | + * about the timer IRQ. We do the following to work around the situation: | |
25175 | + * | |
25176 | + * - timer IRQ defaults to IO-APIC IRQ | |
25177 | + * - if this function detects that timer IRQs are defunct, then we fall | |
25178 | + * back to ISA timer IRQs | |
25179 | + */ | |
25180 | +#ifndef CONFIG_XEN | |
25181 | +static int __init timer_irq_works(void) | |
25182 | +{ | |
25183 | + unsigned long t1 = jiffies; | |
25184 | + | |
25185 | + local_irq_enable(); | |
25186 | + /* Let ten ticks pass... */ | |
25187 | + mdelay((10 * 1000) / HZ); | |
25188 | + | |
25189 | + /* | |
25190 | + * Expect a few ticks at least, to be sure some possible | |
25191 | + * glue logic does not lock up after one or two first | |
25192 | + * ticks in a non-ExtINT mode. Also the local APIC | |
25193 | + * might have cached one ExtINT interrupt. Finally, at | |
25194 | + * least one tick may be lost due to delays. | |
25195 | + */ | |
25196 | + | |
25197 | + /* jiffies wrap? */ | |
25198 | + if (jiffies - t1 > 4) | |
25199 | + return 1; | |
25200 | + return 0; | |
25201 | +} | |
25202 | + | |
25203 | +/* | |
25204 | + * In the SMP+IOAPIC case it might happen that there are an unspecified | |
25205 | + * number of pending IRQ events unhandled. These cases are very rare, | |
25206 | + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | |
25207 | + * better to do it this way as thus we do not have to be aware of | |
25208 | + * 'pending' interrupts in the IRQ path, except at this point. | |
25209 | + */ | |
25210 | +/* | |
25211 | + * Edge triggered needs to resend any interrupt | |
25212 | + * that was delayed but this is now handled in the device | |
25213 | + * independent code. | |
25214 | + */ | |
25215 | + | |
25216 | +/* | |
25217 | + * Starting up a edge-triggered IO-APIC interrupt is | |
25218 | + * nasty - we need to make sure that we get the edge. | |
25219 | + * If it is already asserted for some reason, we need | |
25220 | + * return 1 to indicate that is was pending. | |
25221 | + * | |
25222 | + * This is not complete - we should be able to fake | |
25223 | + * an edge even if it isn't on the 8259A... | |
25224 | + */ | |
25225 | + | |
25226 | +static unsigned int startup_edge_ioapic_irq(unsigned int irq) | |
25227 | +{ | |
25228 | + int was_pending = 0; | |
25229 | + unsigned long flags; | |
25230 | + | |
25231 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25232 | + if (irq < 16) { | |
25233 | + disable_8259A_irq(irq); | |
25234 | + if (i8259A_irq_pending(irq)) | |
25235 | + was_pending = 1; | |
25236 | + } | |
25237 | + __unmask_IO_APIC_irq(irq); | |
25238 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25239 | + | |
25240 | + return was_pending; | |
25241 | +} | |
25242 | + | |
25243 | +/* | |
25244 | + * Once we have recorded IRQ_PENDING already, we can mask the | |
25245 | + * interrupt for real. This prevents IRQ storms from unhandled | |
25246 | + * devices. | |
25247 | + */ | |
25248 | +static void ack_edge_ioapic_irq(unsigned int irq) | |
25249 | +{ | |
25250 | + move_irq(irq); | |
25251 | + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) | |
25252 | + == (IRQ_PENDING | IRQ_DISABLED)) | |
25253 | + mask_IO_APIC_irq(irq); | |
25254 | + ack_APIC_irq(); | |
25255 | +} | |
25256 | + | |
25257 | +/* | |
25258 | + * Level triggered interrupts can just be masked, | |
25259 | + * and shutting down and starting up the interrupt | |
25260 | + * is the same as enabling and disabling them -- except | |
25261 | + * with a startup need to return a "was pending" value. | |
25262 | + * | |
25263 | + * Level triggered interrupts are special because we | |
25264 | + * do not touch any IO-APIC register while handling | |
25265 | + * them. We ack the APIC in the end-IRQ handler, not | |
25266 | + * in the start-IRQ-handler. Protection against reentrance | |
25267 | + * from the same interrupt is still provided, both by the | |
25268 | + * generic IRQ layer and by the fact that an unacked local | |
25269 | + * APIC does not accept IRQs. | |
25270 | + */ | |
25271 | +static unsigned int startup_level_ioapic_irq (unsigned int irq) | |
25272 | +{ | |
25273 | + unmask_IO_APIC_irq(irq); | |
25274 | + | |
25275 | + return 0; /* don't check for pending */ | |
25276 | +} | |
25277 | + | |
25278 | +static void end_level_ioapic_irq (unsigned int irq) | |
25279 | +{ | |
25280 | + move_irq(irq); | |
25281 | + ack_APIC_irq(); | |
25282 | +} | |
25283 | + | |
25284 | +#ifdef CONFIG_PCI_MSI | |
25285 | +static unsigned int startup_edge_ioapic_vector(unsigned int vector) | |
25286 | +{ | |
25287 | + int irq = vector_to_irq(vector); | |
25288 | + | |
25289 | + return startup_edge_ioapic_irq(irq); | |
25290 | +} | |
25291 | + | |
25292 | +static void ack_edge_ioapic_vector(unsigned int vector) | |
25293 | +{ | |
25294 | + int irq = vector_to_irq(vector); | |
25295 | + | |
25296 | + move_native_irq(vector); | |
25297 | + ack_edge_ioapic_irq(irq); | |
25298 | +} | |
25299 | + | |
25300 | +static unsigned int startup_level_ioapic_vector (unsigned int vector) | |
25301 | +{ | |
25302 | + int irq = vector_to_irq(vector); | |
25303 | + | |
25304 | + return startup_level_ioapic_irq (irq); | |
25305 | +} | |
25306 | + | |
25307 | +static void end_level_ioapic_vector (unsigned int vector) | |
25308 | +{ | |
25309 | + int irq = vector_to_irq(vector); | |
25310 | + | |
25311 | + move_native_irq(vector); | |
25312 | + end_level_ioapic_irq(irq); | |
25313 | +} | |
25314 | + | |
25315 | +static void mask_IO_APIC_vector (unsigned int vector) | |
25316 | +{ | |
25317 | + int irq = vector_to_irq(vector); | |
25318 | + | |
25319 | + mask_IO_APIC_irq(irq); | |
25320 | +} | |
25321 | + | |
25322 | +static void unmask_IO_APIC_vector (unsigned int vector) | |
25323 | +{ | |
25324 | + int irq = vector_to_irq(vector); | |
25325 | + | |
25326 | + unmask_IO_APIC_irq(irq); | |
25327 | +} | |
25328 | + | |
25329 | +#ifdef CONFIG_SMP | |
25330 | +static void set_ioapic_affinity_vector (unsigned int vector, | |
25331 | + cpumask_t cpu_mask) | |
25332 | +{ | |
25333 | + int irq = vector_to_irq(vector); | |
25334 | + | |
25335 | + set_native_irq_info(vector, cpu_mask); | |
25336 | + set_ioapic_affinity_irq(irq, cpu_mask); | |
25337 | +} | |
25338 | +#endif // CONFIG_SMP | |
25339 | +#endif // CONFIG_PCI_MSI | |
25340 | + | |
25341 | +static int ioapic_retrigger(unsigned int irq) | |
25342 | +{ | |
25343 | + send_IPI_self(IO_APIC_VECTOR(irq)); | |
25344 | + | |
25345 | + return 1; | |
25346 | +} | |
25347 | + | |
25348 | +/* | |
25349 | + * Level and edge triggered IO-APIC interrupts need different handling, | |
25350 | + * so we use two separate IRQ descriptors. Edge triggered IRQs can be | |
25351 | + * handled with the level-triggered descriptor, but that one has slightly | |
25352 | + * more overhead. Level-triggered interrupts cannot be handled with the | |
25353 | + * edge-triggered handler, without risking IRQ storms and other ugly | |
25354 | + * races. | |
25355 | + */ | |
25356 | + | |
25357 | +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { | |
25358 | + .typename = "IO-APIC-edge", | |
25359 | + .startup = startup_edge_ioapic, | |
25360 | + .shutdown = shutdown_edge_ioapic, | |
25361 | + .enable = enable_edge_ioapic, | |
25362 | + .disable = disable_edge_ioapic, | |
25363 | + .ack = ack_edge_ioapic, | |
25364 | + .end = end_edge_ioapic, | |
25365 | +#ifdef CONFIG_SMP | |
25366 | + .set_affinity = set_ioapic_affinity, | |
25367 | +#endif | |
25368 | + .retrigger = ioapic_retrigger, | |
25369 | +}; | |
25370 | + | |
25371 | +static struct hw_interrupt_type ioapic_level_type __read_mostly = { | |
25372 | + .typename = "IO-APIC-level", | |
25373 | + .startup = startup_level_ioapic, | |
25374 | + .shutdown = shutdown_level_ioapic, | |
25375 | + .enable = enable_level_ioapic, | |
25376 | + .disable = disable_level_ioapic, | |
25377 | + .ack = mask_and_ack_level_ioapic, | |
25378 | + .end = end_level_ioapic, | |
25379 | +#ifdef CONFIG_SMP | |
25380 | + .set_affinity = set_ioapic_affinity, | |
25381 | +#endif | |
25382 | + .retrigger = ioapic_retrigger, | |
25383 | +}; | |
25384 | +#endif /* !CONFIG_XEN */ | |
25385 | + | |
25386 | +static inline void init_IO_APIC_traps(void) | |
25387 | +{ | |
25388 | + int irq; | |
25389 | + | |
25390 | + /* | |
25391 | + * NOTE! The local APIC isn't very good at handling | |
25392 | + * multiple interrupts at the same interrupt level. | |
25393 | + * As the interrupt level is determined by taking the | |
25394 | + * vector number and shifting that right by 4, we | |
25395 | + * want to spread these out a bit so that they don't | |
25396 | + * all fall in the same interrupt level. | |
25397 | + * | |
25398 | + * Also, we've got to be careful not to trash gate | |
25399 | + * 0x80, because int 0x80 is hm, kind of importantish. ;) | |
25400 | + */ | |
25401 | + for (irq = 0; irq < NR_IRQS ; irq++) { | |
25402 | + int tmp = irq; | |
25403 | + if (use_pci_vector()) { | |
25404 | + if (!platform_legacy_irq(tmp)) | |
25405 | + if ((tmp = vector_to_irq(tmp)) == -1) | |
25406 | + continue; | |
25407 | + } | |
25408 | + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { | |
25409 | + /* | |
25410 | + * Hmm.. We don't have an entry for this, | |
25411 | + * so default to an old-fashioned 8259 | |
25412 | + * interrupt if we can.. | |
25413 | + */ | |
25414 | + if (irq < 16) | |
25415 | + make_8259A_irq(irq); | |
25416 | +#ifndef CONFIG_XEN | |
25417 | + else | |
25418 | + /* Strange. Oh, well.. */ | |
25419 | + irq_desc[irq].chip = &no_irq_type; | |
25420 | +#endif | |
25421 | + } | |
25422 | + } | |
25423 | +} | |
25424 | + | |
25425 | +#ifndef CONFIG_XEN | |
25426 | +static void enable_lapic_irq (unsigned int irq) | |
25427 | +{ | |
25428 | + unsigned long v; | |
25429 | + | |
25430 | + v = apic_read(APIC_LVT0); | |
25431 | + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | |
25432 | +} | |
25433 | + | |
25434 | +static void disable_lapic_irq (unsigned int irq) | |
25435 | +{ | |
25436 | + unsigned long v; | |
25437 | + | |
25438 | + v = apic_read(APIC_LVT0); | |
25439 | + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | |
25440 | +} | |
25441 | + | |
25442 | +static void ack_lapic_irq (unsigned int irq) | |
25443 | +{ | |
25444 | + ack_APIC_irq(); | |
25445 | +} | |
25446 | + | |
25447 | +static void end_lapic_irq (unsigned int i) { /* nothing */ } | |
25448 | + | |
25449 | +static struct hw_interrupt_type lapic_irq_type __read_mostly = { | |
25450 | + .typename = "local-APIC-edge", | |
25451 | + .startup = NULL, /* startup_irq() not used for IRQ0 */ | |
25452 | + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | |
25453 | + .enable = enable_lapic_irq, | |
25454 | + .disable = disable_lapic_irq, | |
25455 | + .ack = ack_lapic_irq, | |
25456 | + .end = end_lapic_irq, | |
25457 | +}; | |
25458 | + | |
25459 | +static void setup_nmi (void) | |
25460 | +{ | |
25461 | + /* | |
25462 | + * Dirty trick to enable the NMI watchdog ... | |
25463 | + * We put the 8259A master into AEOI mode and | |
25464 | + * unmask on all local APICs LVT0 as NMI. | |
25465 | + * | |
25466 | + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | |
25467 | + * is from Maciej W. Rozycki - so we do not have to EOI from | |
25468 | + * the NMI handler or the timer interrupt. | |
25469 | + */ | |
25470 | + printk(KERN_INFO "activating NMI Watchdog ..."); | |
25471 | + | |
25472 | + enable_NMI_through_LVT0(NULL); | |
25473 | + | |
25474 | + printk(" done.\n"); | |
25475 | +} | |
25476 | + | |
25477 | +/* | |
25478 | + * This looks a bit hackish but it's about the only one way of sending | |
25479 | + * a few INTA cycles to 8259As and any associated glue logic. ICR does | |
25480 | + * not support the ExtINT mode, unfortunately. We need to send these | |
25481 | + * cycles as some i82489DX-based boards have glue logic that keeps the | |
25482 | + * 8259A interrupt line asserted until INTA. --macro | |
25483 | + */ | |
25484 | +static inline void unlock_ExtINT_logic(void) | |
25485 | +{ | |
25486 | + int apic, pin, i; | |
25487 | + struct IO_APIC_route_entry entry0, entry1; | |
25488 | + unsigned char save_control, save_freq_select; | |
25489 | + unsigned long flags; | |
25490 | + | |
25491 | + pin = find_isa_irq_pin(8, mp_INT); | |
25492 | + apic = find_isa_irq_apic(8, mp_INT); | |
25493 | + if (pin == -1) | |
25494 | + return; | |
25495 | + | |
25496 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25497 | + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | |
25498 | + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | |
25499 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25500 | + clear_IO_APIC_pin(apic, pin); | |
25501 | + | |
25502 | + memset(&entry1, 0, sizeof(entry1)); | |
25503 | + | |
25504 | + entry1.dest_mode = 0; /* physical delivery */ | |
25505 | + entry1.mask = 0; /* unmask IRQ now */ | |
25506 | + entry1.dest.physical.physical_dest = hard_smp_processor_id(); | |
25507 | + entry1.delivery_mode = dest_ExtINT; | |
25508 | + entry1.polarity = entry0.polarity; | |
25509 | + entry1.trigger = 0; | |
25510 | + entry1.vector = 0; | |
25511 | + | |
25512 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25513 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); | |
25514 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); | |
25515 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25516 | + | |
25517 | + save_control = CMOS_READ(RTC_CONTROL); | |
25518 | + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | |
25519 | + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | |
25520 | + RTC_FREQ_SELECT); | |
25521 | + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | |
25522 | + | |
25523 | + i = 100; | |
25524 | + while (i-- > 0) { | |
25525 | + mdelay(10); | |
25526 | + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | |
25527 | + i -= 10; | |
25528 | + } | |
25529 | + | |
25530 | + CMOS_WRITE(save_control, RTC_CONTROL); | |
25531 | + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | |
25532 | + clear_IO_APIC_pin(apic, pin); | |
25533 | + | |
25534 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25535 | + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); | |
25536 | + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); | |
25537 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25538 | +} | |
25539 | + | |
25540 | +int timer_uses_ioapic_pin_0; | |
25541 | + | |
25542 | +/* | |
25543 | + * This code may look a bit paranoid, but it's supposed to cooperate with | |
25544 | + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | |
25545 | + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | |
25546 | + * fanatically on his truly buggy board. | |
25547 | + * | |
25548 | + * FIXME: really need to revamp this for modern platforms only. | |
25549 | + */ | |
25550 | +static inline void check_timer(void) | |
25551 | +{ | |
25552 | + int apic1, pin1, apic2, pin2; | |
25553 | + int vector; | |
25554 | + | |
25555 | + /* | |
25556 | + * get/set the timer IRQ vector: | |
25557 | + */ | |
25558 | + disable_8259A_irq(0); | |
25559 | + vector = assign_irq_vector(0); | |
25560 | + set_intr_gate(vector, interrupt[0]); | |
25561 | + | |
25562 | + /* | |
25563 | + * Subtle, code in do_timer_interrupt() expects an AEOI | |
25564 | + * mode for the 8259A whenever interrupts are routed | |
25565 | + * through I/O APICs. Also IRQ0 has to be enabled in | |
25566 | + * the 8259A which implies the virtual wire has to be | |
25567 | + * disabled in the local APIC. | |
25568 | + */ | |
25569 | + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | |
25570 | + init_8259A(1); | |
25571 | + if (timer_over_8254 > 0) | |
25572 | + enable_8259A_irq(0); | |
25573 | + | |
25574 | + pin1 = find_isa_irq_pin(0, mp_INT); | |
25575 | + apic1 = find_isa_irq_apic(0, mp_INT); | |
25576 | + pin2 = ioapic_i8259.pin; | |
25577 | + apic2 = ioapic_i8259.apic; | |
25578 | + | |
25579 | + if (pin1 == 0) | |
25580 | + timer_uses_ioapic_pin_0 = 1; | |
25581 | + | |
25582 | + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | |
25583 | + vector, apic1, pin1, apic2, pin2); | |
25584 | + | |
25585 | + if (pin1 != -1) { | |
25586 | + /* | |
25587 | + * Ok, does IRQ0 through the IOAPIC work? | |
25588 | + */ | |
25589 | + unmask_IO_APIC_irq(0); | |
25590 | + if (!no_timer_check && timer_irq_works()) { | |
25591 | + nmi_watchdog_default(); | |
25592 | + if (nmi_watchdog == NMI_IO_APIC) { | |
25593 | + disable_8259A_irq(0); | |
25594 | + setup_nmi(); | |
25595 | + enable_8259A_irq(0); | |
25596 | + } | |
25597 | + if (disable_timer_pin_1 > 0) | |
25598 | + clear_IO_APIC_pin(0, pin1); | |
25599 | + return; | |
25600 | + } | |
25601 | + clear_IO_APIC_pin(apic1, pin1); | |
25602 | + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " | |
25603 | + "connected to IO-APIC\n"); | |
25604 | + } | |
25605 | + | |
25606 | + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " | |
25607 | + "through the 8259A ... "); | |
25608 | + if (pin2 != -1) { | |
25609 | + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", | |
25610 | + apic2, pin2); | |
25611 | + /* | |
25612 | + * legacy devices should be connected to IO APIC #0 | |
25613 | + */ | |
25614 | + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); | |
25615 | + if (timer_irq_works()) { | |
25616 | + apic_printk(APIC_VERBOSE," works.\n"); | |
25617 | + nmi_watchdog_default(); | |
25618 | + if (nmi_watchdog == NMI_IO_APIC) { | |
25619 | + setup_nmi(); | |
25620 | + } | |
25621 | + return; | |
25622 | + } | |
25623 | + /* | |
25624 | + * Cleanup, just in case ... | |
25625 | + */ | |
25626 | + clear_IO_APIC_pin(apic2, pin2); | |
25627 | + } | |
25628 | + apic_printk(APIC_VERBOSE," failed.\n"); | |
25629 | + | |
25630 | + if (nmi_watchdog == NMI_IO_APIC) { | |
25631 | + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | |
25632 | + nmi_watchdog = 0; | |
25633 | + } | |
25634 | + | |
25635 | + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | |
25636 | + | |
25637 | + disable_8259A_irq(0); | |
25638 | + irq_desc[0].chip = &lapic_irq_type; | |
25639 | + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | |
25640 | + enable_8259A_irq(0); | |
25641 | + | |
25642 | + if (timer_irq_works()) { | |
25643 | + apic_printk(APIC_VERBOSE," works.\n"); | |
25644 | + return; | |
25645 | + } | |
25646 | + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | |
25647 | + apic_printk(APIC_VERBOSE," failed.\n"); | |
25648 | + | |
25649 | + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | |
25650 | + | |
25651 | + init_8259A(0); | |
25652 | + make_8259A_irq(0); | |
25653 | + apic_write(APIC_LVT0, APIC_DM_EXTINT); | |
25654 | + | |
25655 | + unlock_ExtINT_logic(); | |
25656 | + | |
25657 | + if (timer_irq_works()) { | |
25658 | + apic_printk(APIC_VERBOSE," works.\n"); | |
25659 | + return; | |
25660 | + } | |
25661 | + apic_printk(APIC_VERBOSE," failed :(.\n"); | |
25662 | + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | |
25663 | +} | |
25664 | +#else | |
25665 | +#define check_timer() ((void)0) | |
25666 | +int timer_uses_ioapic_pin_0 = 0; | |
25667 | +#endif /* !CONFIG_XEN */ | |
25668 | + | |
25669 | +static int __init notimercheck(char *s) | |
25670 | +{ | |
25671 | + no_timer_check = 1; | |
25672 | + return 1; | |
25673 | +} | |
25674 | +__setup("no_timer_check", notimercheck); | |
25675 | + | |
25676 | +/* | |
25677 | + * | |
25678 | + * IRQ's that are handled by the PIC in the MPS IOAPIC case. | |
25679 | + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. | |
25680 | + * Linux doesn't really care, as it's not actually used | |
25681 | + * for any interrupt handling anyway. | |
25682 | + */ | |
25683 | +#define PIC_IRQS (1<<2) | |
25684 | + | |
25685 | +void __init setup_IO_APIC(void) | |
25686 | +{ | |
25687 | + enable_IO_APIC(); | |
25688 | + | |
25689 | + if (acpi_ioapic) | |
25690 | + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | |
25691 | + else | |
25692 | + io_apic_irqs = ~PIC_IRQS; | |
25693 | + | |
25694 | + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | |
25695 | + | |
25696 | + /* | |
25697 | + * Set up the IO-APIC IRQ routing table. | |
25698 | + */ | |
25699 | + if (!acpi_ioapic) | |
25700 | + setup_ioapic_ids_from_mpc(); | |
25701 | +#ifndef CONFIG_XEN | |
25702 | + sync_Arb_IDs(); | |
25703 | +#endif /* !CONFIG_XEN */ | |
25704 | + setup_IO_APIC_irqs(); | |
25705 | + init_IO_APIC_traps(); | |
25706 | + check_timer(); | |
25707 | + if (!acpi_ioapic) | |
25708 | + print_IO_APIC(); | |
25709 | +} | |
25710 | + | |
25711 | +struct sysfs_ioapic_data { | |
25712 | + struct sys_device dev; | |
25713 | + struct IO_APIC_route_entry entry[0]; | |
25714 | +}; | |
25715 | +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | |
25716 | + | |
25717 | +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | |
25718 | +{ | |
25719 | + struct IO_APIC_route_entry *entry; | |
25720 | + struct sysfs_ioapic_data *data; | |
25721 | + unsigned long flags; | |
25722 | + int i; | |
25723 | + | |
25724 | + data = container_of(dev, struct sysfs_ioapic_data, dev); | |
25725 | + entry = data->entry; | |
25726 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25727 | + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | |
25728 | + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); | |
25729 | + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); | |
25730 | + } | |
25731 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25732 | + | |
25733 | + return 0; | |
25734 | +} | |
25735 | + | |
25736 | +static int ioapic_resume(struct sys_device *dev) | |
25737 | +{ | |
25738 | + struct IO_APIC_route_entry *entry; | |
25739 | + struct sysfs_ioapic_data *data; | |
25740 | + unsigned long flags; | |
25741 | + union IO_APIC_reg_00 reg_00; | |
25742 | + int i; | |
25743 | + | |
25744 | + data = container_of(dev, struct sysfs_ioapic_data, dev); | |
25745 | + entry = data->entry; | |
25746 | + | |
25747 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25748 | + reg_00.raw = io_apic_read(dev->id, 0); | |
25749 | + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | |
25750 | + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | |
25751 | + io_apic_write(dev->id, 0, reg_00.raw); | |
25752 | + } | |
25753 | + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | |
25754 | + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); | |
25755 | + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); | |
25756 | + } | |
25757 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25758 | + | |
25759 | + return 0; | |
25760 | +} | |
25761 | + | |
25762 | +static struct sysdev_class ioapic_sysdev_class = { | |
25763 | + set_kset_name("ioapic"), | |
25764 | +#ifndef CONFIG_XEN | |
25765 | + .suspend = ioapic_suspend, | |
25766 | + .resume = ioapic_resume, | |
25767 | +#endif | |
25768 | +}; | |
25769 | + | |
25770 | +static int __init ioapic_init_sysfs(void) | |
25771 | +{ | |
25772 | + struct sys_device * dev; | |
25773 | + int i, size, error = 0; | |
25774 | + | |
25775 | + error = sysdev_class_register(&ioapic_sysdev_class); | |
25776 | + if (error) | |
25777 | + return error; | |
25778 | + | |
25779 | + for (i = 0; i < nr_ioapics; i++ ) { | |
25780 | + size = sizeof(struct sys_device) + nr_ioapic_registers[i] | |
25781 | + * sizeof(struct IO_APIC_route_entry); | |
25782 | + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | |
25783 | + if (!mp_ioapic_data[i]) { | |
25784 | + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | |
25785 | + continue; | |
25786 | + } | |
25787 | + memset(mp_ioapic_data[i], 0, size); | |
25788 | + dev = &mp_ioapic_data[i]->dev; | |
25789 | + dev->id = i; | |
25790 | + dev->cls = &ioapic_sysdev_class; | |
25791 | + error = sysdev_register(dev); | |
25792 | + if (error) { | |
25793 | + kfree(mp_ioapic_data[i]); | |
25794 | + mp_ioapic_data[i] = NULL; | |
25795 | + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | |
25796 | + continue; | |
25797 | + } | |
25798 | + } | |
25799 | + | |
25800 | + return 0; | |
25801 | +} | |
25802 | + | |
25803 | +device_initcall(ioapic_init_sysfs); | |
25804 | + | |
25805 | +/* -------------------------------------------------------------------------- | |
25806 | + ACPI-based IOAPIC Configuration | |
25807 | + -------------------------------------------------------------------------- */ | |
25808 | + | |
25809 | +#ifdef CONFIG_ACPI | |
25810 | + | |
25811 | +#define IO_APIC_MAX_ID 0xFE | |
25812 | + | |
25813 | +int __init io_apic_get_version (int ioapic) | |
25814 | +{ | |
25815 | + union IO_APIC_reg_01 reg_01; | |
25816 | + unsigned long flags; | |
25817 | + | |
25818 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25819 | + reg_01.raw = io_apic_read(ioapic, 1); | |
25820 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25821 | + | |
25822 | + return reg_01.bits.version; | |
25823 | +} | |
25824 | + | |
25825 | + | |
25826 | +int __init io_apic_get_redir_entries (int ioapic) | |
25827 | +{ | |
25828 | + union IO_APIC_reg_01 reg_01; | |
25829 | + unsigned long flags; | |
25830 | + | |
25831 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25832 | + reg_01.raw = io_apic_read(ioapic, 1); | |
25833 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25834 | + | |
25835 | + return reg_01.bits.entries; | |
25836 | +} | |
25837 | + | |
25838 | + | |
25839 | +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | |
25840 | +{ | |
25841 | + struct IO_APIC_route_entry entry; | |
25842 | + unsigned long flags; | |
25843 | + | |
25844 | + if (!IO_APIC_IRQ(irq)) { | |
25845 | + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | |
25846 | + ioapic); | |
25847 | + return -EINVAL; | |
25848 | + } | |
25849 | + | |
25850 | + /* | |
25851 | + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | |
25852 | + * Note that we mask (disable) IRQs now -- these get enabled when the | |
25853 | + * corresponding device driver registers for this IRQ. | |
25854 | + */ | |
25855 | + | |
25856 | + memset(&entry,0,sizeof(entry)); | |
25857 | + | |
25858 | + entry.delivery_mode = INT_DELIVERY_MODE; | |
25859 | + entry.dest_mode = INT_DEST_MODE; | |
25860 | + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | |
25861 | + entry.trigger = edge_level; | |
25862 | + entry.polarity = active_high_low; | |
25863 | + entry.mask = 1; /* Disabled (masked) */ | |
25864 | + | |
25865 | + irq = gsi_irq_sharing(irq); | |
25866 | + /* | |
25867 | + * IRQs < 16 are already in the irq_2_pin[] map | |
25868 | + */ | |
25869 | + if (irq >= 16) | |
25870 | + add_pin_to_irq(irq, ioapic, pin); | |
25871 | + | |
25872 | + entry.vector = assign_irq_vector(irq); | |
25873 | + | |
25874 | + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " | |
25875 | + "IRQ %d Mode:%i Active:%i)\n", ioapic, | |
25876 | + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | |
25877 | + edge_level, active_high_low); | |
25878 | + | |
25879 | + ioapic_register_intr(irq, entry.vector, edge_level); | |
25880 | + | |
25881 | + if (!ioapic && (irq < 16)) | |
25882 | + disable_8259A_irq(irq); | |
25883 | + | |
25884 | + spin_lock_irqsave(&ioapic_lock, flags); | |
25885 | + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); | |
25886 | + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); | |
25887 | + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); | |
25888 | + spin_unlock_irqrestore(&ioapic_lock, flags); | |
25889 | + | |
25890 | + return 0; | |
25891 | +} | |
25892 | + | |
25893 | +#endif /* CONFIG_ACPI */ | |
25894 | + | |
25895 | + | |
25896 | +#ifndef CONFIG_XEN | |
25897 | +/* | |
25898 | + * This function currently is only a helper for the i386 smp boot process where | |
25899 | + * we need to reprogram the ioredtbls to cater for the cpus which have come online | |
25900 | + * so mask in all cases should simply be TARGET_CPUS | |
25901 | + */ | |
25902 | +#ifdef CONFIG_SMP | |
25903 | +void __init setup_ioapic_dest(void) | |
25904 | +{ | |
25905 | + int pin, ioapic, irq, irq_entry; | |
25906 | + | |
25907 | + if (skip_ioapic_setup == 1) | |
25908 | + return; | |
25909 | + | |
25910 | + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | |
25911 | + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | |
25912 | + irq_entry = find_irq_entry(ioapic, pin, mp_INT); | |
25913 | + if (irq_entry == -1) | |
25914 | + continue; | |
25915 | + irq = pin_2_irq(irq_entry, ioapic, pin); | |
25916 | + set_ioapic_affinity_irq(irq, TARGET_CPUS); | |
25917 | + } | |
25918 | + | |
25919 | + } | |
25920 | +} | |
25921 | +#endif | |
25922 | +#endif /* !CONFIG_XEN */ | |
25923 | Index: head-2008-11-25/arch/x86/kernel/ioport_64-xen.c | |
25924 | =================================================================== | |
25925 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
25926 | +++ head-2008-11-25/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100 | |
25927 | @@ -0,0 +1,100 @@ | |
25928 | +/* | |
25929 | + * linux/arch/x86_64/kernel/ioport.c | |
25930 | + * | |
25931 | + * This contains the io-permission bitmap code - written by obz, with changes | |
25932 | + * by Linus. | |
25933 | + */ | |
25934 | + | |
25935 | +#include <linux/sched.h> | |
25936 | +#include <linux/kernel.h> | |
25937 | +#include <linux/capability.h> | |
25938 | +#include <linux/errno.h> | |
25939 | +#include <linux/types.h> | |
25940 | +#include <linux/ioport.h> | |
25941 | +#include <linux/mm.h> | |
25942 | +#include <linux/smp.h> | |
25943 | +#include <linux/smp_lock.h> | |
25944 | +#include <linux/stddef.h> | |
25945 | +#include <linux/slab.h> | |
25946 | +#include <linux/thread_info.h> | |
25947 | +#include <xen/interface/physdev.h> | |
25948 | + | |
25949 | +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | |
25950 | +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | |
25951 | +{ | |
25952 | + int i; | |
25953 | + | |
25954 | + if (new_value) | |
25955 | + for (i = base; i < base + extent; i++) | |
25956 | + __set_bit(i, bitmap); | |
25957 | + else | |
25958 | + for (i = base; i < base + extent; i++) | |
25959 | + clear_bit(i, bitmap); | |
25960 | +} | |
25961 | + | |
25962 | +/* | |
25963 | + * this changes the io permissions bitmap in the current task. | |
25964 | + */ | |
25965 | +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |
25966 | +{ | |
25967 | + struct thread_struct * t = ¤t->thread; | |
25968 | + unsigned long *bitmap; | |
25969 | + struct physdev_set_iobitmap set_iobitmap; | |
25970 | + | |
25971 | + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | |
25972 | + return -EINVAL; | |
25973 | + if (turn_on && !capable(CAP_SYS_RAWIO)) | |
25974 | + return -EPERM; | |
25975 | + | |
25976 | + /* | |
25977 | + * If it's the first ioperm() call in this thread's lifetime, set the | |
25978 | + * IO bitmap up. ioperm() is much less timing critical than clone(), | |
25979 | + * this is why we delay this operation until now: | |
25980 | + */ | |
25981 | + if (!t->io_bitmap_ptr) { | |
25982 | + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
25983 | + if (!bitmap) | |
25984 | + return -ENOMEM; | |
25985 | + | |
25986 | + memset(bitmap, 0xff, IO_BITMAP_BYTES); | |
25987 | + t->io_bitmap_ptr = bitmap; | |
25988 | + | |
25989 | + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); | |
25990 | + set_iobitmap.nr_ports = IO_BITMAP_BITS; | |
25991 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
25992 | + &set_iobitmap)); | |
25993 | + } | |
25994 | + | |
25995 | + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | |
25996 | + | |
25997 | + return 0; | |
25998 | +} | |
25999 | + | |
26000 | +/* | |
26001 | + * sys_iopl has to be used when you want to access the IO ports | |
26002 | + * beyond the 0x3ff range: to get the full 65536 ports bitmapped | |
26003 | + * you'd need 8kB of bitmaps/process, which is a bit excessive. | |
26004 | + * | |
26005 | + */ | |
26006 | + | |
26007 | +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs) | |
26008 | +{ | |
26009 | + unsigned int old_iopl = current->thread.iopl; | |
26010 | + struct physdev_set_iopl set_iopl; | |
26011 | + | |
26012 | + if (new_iopl > 3) | |
26013 | + return -EINVAL; | |
26014 | + | |
26015 | + /* Need "raw I/O" privileges for direct port access. */ | |
26016 | + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO)) | |
26017 | + return -EPERM; | |
26018 | + | |
26019 | + /* Change our version of the privilege levels. */ | |
26020 | + current->thread.iopl = new_iopl; | |
26021 | + | |
26022 | + /* Force the change at ring 0. */ | |
26023 | + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl; | |
26024 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
26025 | + | |
26026 | + return 0; | |
26027 | +} | |
26028 | Index: head-2008-11-25/arch/x86/kernel/irq_64-xen.c | |
26029 | =================================================================== | |
26030 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
26031 | +++ head-2008-11-25/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
26032 | @@ -0,0 +1,197 @@ | |
26033 | +/* | |
26034 | + * linux/arch/x86_64/kernel/irq.c | |
26035 | + * | |
26036 | + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | |
26037 | + * | |
26038 | + * This file contains the lowest level x86_64-specific interrupt | |
26039 | + * entry and irq statistics code. All the remaining irq logic is | |
26040 | + * done by the generic kernel/irq/ code and in the | |
26041 | + * x86_64-specific irq controller code. (e.g. i8259.c and | |
26042 | + * io_apic.c.) | |
26043 | + */ | |
26044 | + | |
26045 | +#include <linux/kernel_stat.h> | |
26046 | +#include <linux/interrupt.h> | |
26047 | +#include <linux/seq_file.h> | |
26048 | +#include <linux/module.h> | |
26049 | +#include <linux/delay.h> | |
26050 | +#include <asm/uaccess.h> | |
26051 | +#include <asm/io_apic.h> | |
26052 | +#include <asm/idle.h> | |
26053 | + | |
26054 | +atomic_t irq_err_count; | |
26055 | +#ifdef CONFIG_X86_IO_APIC | |
26056 | +#ifdef APIC_MISMATCH_DEBUG | |
26057 | +atomic_t irq_mis_count; | |
26058 | +#endif | |
26059 | +#endif | |
26060 | + | |
26061 | +#ifdef CONFIG_DEBUG_STACKOVERFLOW | |
26062 | +/* | |
26063 | + * Probabilistic stack overflow check: | |
26064 | + * | |
26065 | + * Only check the stack in process context, because everything else | |
26066 | + * runs on the big interrupt stacks. Checking reliably is too expensive, | |
26067 | + * so we just check from interrupts. | |
26068 | + */ | |
26069 | +static inline void stack_overflow_check(struct pt_regs *regs) | |
26070 | +{ | |
26071 | + u64 curbase = (u64) current->thread_info; | |
26072 | + static unsigned long warned = -60*HZ; | |
26073 | + | |
26074 | + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && | |
26075 | + regs->rsp < curbase + sizeof(struct thread_info) + 128 && | |
26076 | + time_after(jiffies, warned + 60*HZ)) { | |
26077 | + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", | |
26078 | + current->comm, curbase, regs->rsp); | |
26079 | + show_stack(NULL,NULL); | |
26080 | + warned = jiffies; | |
26081 | + } | |
26082 | +} | |
26083 | +#endif | |
26084 | + | |
26085 | +/* | |
26086 | + * Generic, controller-independent functions: | |
26087 | + */ | |
26088 | + | |
26089 | +int show_interrupts(struct seq_file *p, void *v) | |
26090 | +{ | |
26091 | + int i = *(loff_t *) v, j; | |
26092 | + struct irqaction * action; | |
26093 | + unsigned long flags; | |
26094 | + | |
26095 | + if (i == 0) { | |
26096 | + seq_printf(p, " "); | |
26097 | + for_each_online_cpu(j) | |
26098 | + seq_printf(p, "CPU%-8d",j); | |
26099 | + seq_putc(p, '\n'); | |
26100 | + } | |
26101 | + | |
26102 | + if (i < NR_IRQS) { | |
26103 | + spin_lock_irqsave(&irq_desc[i].lock, flags); | |
26104 | + action = irq_desc[i].action; | |
26105 | + if (!action) | |
26106 | + goto skip; | |
26107 | + seq_printf(p, "%3d: ",i); | |
26108 | +#ifndef CONFIG_SMP | |
26109 | + seq_printf(p, "%10u ", kstat_irqs(i)); | |
26110 | +#else | |
26111 | + for_each_online_cpu(j) | |
26112 | + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | |
26113 | +#endif | |
26114 | + seq_printf(p, " %14s", irq_desc[i].chip->typename); | |
26115 | + | |
26116 | + seq_printf(p, " %s", action->name); | |
26117 | + for (action=action->next; action; action = action->next) | |
26118 | + seq_printf(p, ", %s", action->name); | |
26119 | + seq_putc(p, '\n'); | |
26120 | +skip: | |
26121 | + spin_unlock_irqrestore(&irq_desc[i].lock, flags); | |
26122 | + } else if (i == NR_IRQS) { | |
26123 | + seq_printf(p, "NMI: "); | |
26124 | + for_each_online_cpu(j) | |
26125 | + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); | |
26126 | + seq_putc(p, '\n'); | |
26127 | +#ifdef CONFIG_X86_LOCAL_APIC | |
26128 | + seq_printf(p, "LOC: "); | |
26129 | + for_each_online_cpu(j) | |
26130 | + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); | |
26131 | + seq_putc(p, '\n'); | |
26132 | +#endif | |
26133 | + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | |
26134 | +#ifdef CONFIG_X86_IO_APIC | |
26135 | +#ifdef APIC_MISMATCH_DEBUG | |
26136 | + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | |
26137 | +#endif | |
26138 | +#endif | |
26139 | + } | |
26140 | + return 0; | |
26141 | +} | |
26142 | + | |
26143 | +/* | |
26144 | + * do_IRQ handles all normal device IRQ's (the special | |
26145 | + * SMP cross-CPU interrupts have their own specific | |
26146 | + * handlers). | |
26147 | + */ | |
26148 | +asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | |
26149 | +{ | |
26150 | + /* high bit used in ret_from_ code */ | |
26151 | + unsigned irq = ~regs->orig_rax; | |
26152 | + | |
26153 | + if (unlikely(irq >= NR_IRQS)) { | |
26154 | + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | |
26155 | + __FUNCTION__, irq); | |
26156 | + BUG(); | |
26157 | + } | |
26158 | + | |
26159 | + /*exit_idle();*/ | |
26160 | + /*irq_enter();*/ | |
26161 | +#ifdef CONFIG_DEBUG_STACKOVERFLOW | |
26162 | + stack_overflow_check(regs); | |
26163 | +#endif | |
26164 | + __do_IRQ(irq, regs); | |
26165 | + /*irq_exit();*/ | |
26166 | + | |
26167 | + return 1; | |
26168 | +} | |
26169 | + | |
26170 | +#ifdef CONFIG_HOTPLUG_CPU | |
26171 | +void fixup_irqs(cpumask_t map) | |
26172 | +{ | |
26173 | + unsigned int irq; | |
26174 | + static int warned; | |
26175 | + | |
26176 | + for (irq = 0; irq < NR_IRQS; irq++) { | |
26177 | + cpumask_t mask; | |
26178 | + if (irq == 2) | |
26179 | + continue; | |
26180 | + | |
26181 | + cpus_and(mask, irq_desc[irq].affinity, map); | |
26182 | + if (any_online_cpu(mask) == NR_CPUS) { | |
26183 | + /*printk("Breaking affinity for irq %i\n", irq);*/ | |
26184 | + mask = map; | |
26185 | + } | |
26186 | + if (irq_desc[irq].chip->set_affinity) | |
26187 | + irq_desc[irq].chip->set_affinity(irq, mask); | |
26188 | + else if (irq_desc[irq].action && !(warned++)) | |
26189 | + printk("Cannot set affinity for irq %i\n", irq); | |
26190 | + } | |
26191 | + | |
26192 | + /* That doesn't seem sufficient. Give it 1ms. */ | |
26193 | + local_irq_enable(); | |
26194 | + mdelay(1); | |
26195 | + local_irq_disable(); | |
26196 | +} | |
26197 | +#endif | |
26198 | + | |
26199 | +extern void call_softirq(void); | |
26200 | + | |
26201 | +asmlinkage void do_softirq(void) | |
26202 | +{ | |
26203 | + __u32 pending; | |
26204 | + unsigned long flags; | |
26205 | + | |
26206 | + if (in_interrupt()) | |
26207 | + return; | |
26208 | + | |
26209 | + local_irq_save(flags); | |
26210 | + pending = local_softirq_pending(); | |
26211 | + /* Switch to interrupt stack */ | |
26212 | + if (pending) { | |
26213 | + call_softirq(); | |
26214 | + WARN_ON_ONCE(softirq_count()); | |
26215 | + } | |
26216 | + local_irq_restore(flags); | |
26217 | +} | |
26218 | +EXPORT_SYMBOL(do_softirq); | |
26219 | + | |
26220 | +#ifndef CONFIG_X86_LOCAL_APIC | |
26221 | +/* | |
26222 | + * 'what should we do if we get a hw irq event on an illegal vector'. | |
26223 | + * each architecture has to answer this themselves. | |
26224 | + */ | |
26225 | +void ack_bad_irq(unsigned int irq) | |
26226 | +{ | |
26227 | + printk("unexpected IRQ trap at vector %02x\n", irq); | |
26228 | +} | |
26229 | +#endif | |
26230 | Index: head-2008-11-25/arch/x86/kernel/ldt_64-xen.c | |
26231 | =================================================================== | |
26232 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
26233 | +++ head-2008-11-25/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
26234 | @@ -0,0 +1,282 @@ | |
26235 | +/* | |
26236 | + * linux/arch/x86_64/kernel/ldt.c | |
26237 | + * | |
26238 | + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | |
26239 | + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
26240 | + * Copyright (C) 2002 Andi Kleen | |
26241 | + * | |
26242 | + * This handles calls from both 32bit and 64bit mode. | |
26243 | + */ | |
26244 | + | |
26245 | +#include <linux/errno.h> | |
26246 | +#include <linux/sched.h> | |
26247 | +#include <linux/string.h> | |
26248 | +#include <linux/mm.h> | |
26249 | +#include <linux/smp.h> | |
26250 | +#include <linux/smp_lock.h> | |
26251 | +#include <linux/vmalloc.h> | |
26252 | +#include <linux/slab.h> | |
26253 | + | |
26254 | +#include <asm/uaccess.h> | |
26255 | +#include <asm/system.h> | |
26256 | +#include <asm/ldt.h> | |
26257 | +#include <asm/desc.h> | |
26258 | +#include <asm/proto.h> | |
26259 | +#include <asm/pgalloc.h> | |
26260 | + | |
26261 | +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ | |
26262 | +static void flush_ldt(void *null) | |
26263 | +{ | |
26264 | + if (current->active_mm) | |
26265 | + load_LDT(¤t->active_mm->context); | |
26266 | +} | |
26267 | +#endif | |
26268 | + | |
26269 | +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | |
26270 | +{ | |
26271 | + void *oldldt; | |
26272 | + void *newldt; | |
26273 | + unsigned oldsize; | |
26274 | + | |
26275 | + if (mincount <= (unsigned)pc->size) | |
26276 | + return 0; | |
26277 | + oldsize = pc->size; | |
26278 | + mincount = (mincount+511)&(~511); | |
26279 | + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | |
26280 | + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | |
26281 | + else | |
26282 | + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | |
26283 | + | |
26284 | + if (!newldt) | |
26285 | + return -ENOMEM; | |
26286 | + | |
26287 | + if (oldsize) | |
26288 | + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | |
26289 | + oldldt = pc->ldt; | |
26290 | + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | |
26291 | + wmb(); | |
26292 | + pc->ldt = newldt; | |
26293 | + wmb(); | |
26294 | + pc->size = mincount; | |
26295 | + wmb(); | |
26296 | + if (reload) { | |
26297 | +#ifdef CONFIG_SMP | |
26298 | + cpumask_t mask; | |
26299 | + | |
26300 | + preempt_disable(); | |
26301 | +#endif | |
26302 | + make_pages_readonly( | |
26303 | + pc->ldt, | |
26304 | + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
26305 | + XENFEAT_writable_descriptor_tables); | |
26306 | + load_LDT(pc); | |
26307 | +#ifdef CONFIG_SMP | |
26308 | + mask = cpumask_of_cpu(smp_processor_id()); | |
26309 | + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | |
26310 | + smp_call_function(flush_ldt, NULL, 1, 1); | |
26311 | + preempt_enable(); | |
26312 | +#endif | |
26313 | + } | |
26314 | + if (oldsize) { | |
26315 | + make_pages_writable( | |
26316 | + oldldt, | |
26317 | + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
26318 | + XENFEAT_writable_descriptor_tables); | |
26319 | + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | |
26320 | + vfree(oldldt); | |
26321 | + else | |
26322 | + kfree(oldldt); | |
26323 | + } | |
26324 | + return 0; | |
26325 | +} | |
26326 | + | |
26327 | +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | |
26328 | +{ | |
26329 | + int err = alloc_ldt(new, old->size, 0); | |
26330 | + if (err < 0) | |
26331 | + return err; | |
26332 | + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | |
26333 | + make_pages_readonly( | |
26334 | + new->ldt, | |
26335 | + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
26336 | + XENFEAT_writable_descriptor_tables); | |
26337 | + return 0; | |
26338 | +} | |
26339 | + | |
26340 | +/* | |
26341 | + * we do not have to muck with descriptors here, that is | |
26342 | + * done in switch_mm() as needed. | |
26343 | + */ | |
26344 | +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |
26345 | +{ | |
26346 | + struct mm_struct * old_mm; | |
26347 | + int retval = 0; | |
26348 | + | |
26349 | + memset(&mm->context, 0, sizeof(mm->context)); | |
26350 | + init_MUTEX(&mm->context.sem); | |
26351 | + old_mm = current->mm; | |
26352 | + if (old_mm && old_mm->context.size > 0) { | |
26353 | + down(&old_mm->context.sem); | |
26354 | + retval = copy_ldt(&mm->context, &old_mm->context); | |
26355 | + up(&old_mm->context.sem); | |
26356 | + } | |
26357 | + if (retval == 0) { | |
26358 | + spin_lock(&mm_unpinned_lock); | |
26359 | + list_add(&mm->context.unpinned, &mm_unpinned); | |
26360 | + spin_unlock(&mm_unpinned_lock); | |
26361 | + } | |
26362 | + return retval; | |
26363 | +} | |
26364 | + | |
26365 | +/* | |
26366 | + * | |
26367 | + * Don't touch the LDT register - we're already in the next thread. | |
26368 | + */ | |
26369 | +void destroy_context(struct mm_struct *mm) | |
26370 | +{ | |
26371 | + if (mm->context.size) { | |
26372 | + if (mm == current->active_mm) | |
26373 | + clear_LDT(); | |
26374 | + make_pages_writable( | |
26375 | + mm->context.ldt, | |
26376 | + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, | |
26377 | + XENFEAT_writable_descriptor_tables); | |
26378 | + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | |
26379 | + vfree(mm->context.ldt); | |
26380 | + else | |
26381 | + kfree(mm->context.ldt); | |
26382 | + mm->context.size = 0; | |
26383 | + } | |
26384 | + if (!mm->context.pinned) { | |
26385 | + spin_lock(&mm_unpinned_lock); | |
26386 | + list_del(&mm->context.unpinned); | |
26387 | + spin_unlock(&mm_unpinned_lock); | |
26388 | + } | |
26389 | +} | |
26390 | + | |
26391 | +static int read_ldt(void __user * ptr, unsigned long bytecount) | |
26392 | +{ | |
26393 | + int err; | |
26394 | + unsigned long size; | |
26395 | + struct mm_struct * mm = current->mm; | |
26396 | + | |
26397 | + if (!mm->context.size) | |
26398 | + return 0; | |
26399 | + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | |
26400 | + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | |
26401 | + | |
26402 | + down(&mm->context.sem); | |
26403 | + size = mm->context.size*LDT_ENTRY_SIZE; | |
26404 | + if (size > bytecount) | |
26405 | + size = bytecount; | |
26406 | + | |
26407 | + err = 0; | |
26408 | + if (copy_to_user(ptr, mm->context.ldt, size)) | |
26409 | + err = -EFAULT; | |
26410 | + up(&mm->context.sem); | |
26411 | + if (err < 0) | |
26412 | + goto error_return; | |
26413 | + if (size != bytecount) { | |
26414 | + /* zero-fill the rest */ | |
26415 | + if (clear_user(ptr+size, bytecount-size) != 0) { | |
26416 | + err = -EFAULT; | |
26417 | + goto error_return; | |
26418 | + } | |
26419 | + } | |
26420 | + return bytecount; | |
26421 | +error_return: | |
26422 | + return err; | |
26423 | +} | |
26424 | + | |
26425 | +static int read_default_ldt(void __user * ptr, unsigned long bytecount) | |
26426 | +{ | |
26427 | + /* Arbitrary number */ | |
26428 | + /* x86-64 default LDT is all zeros */ | |
26429 | + if (bytecount > 128) | |
26430 | + bytecount = 128; | |
26431 | + if (clear_user(ptr, bytecount)) | |
26432 | + return -EFAULT; | |
26433 | + return bytecount; | |
26434 | +} | |
26435 | + | |
26436 | +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | |
26437 | +{ | |
26438 | + struct task_struct *me = current; | |
26439 | + struct mm_struct * mm = me->mm; | |
26440 | + __u32 entry_1, entry_2, *lp; | |
26441 | + unsigned long mach_lp; | |
26442 | + int error; | |
26443 | + struct user_desc ldt_info; | |
26444 | + | |
26445 | + error = -EINVAL; | |
26446 | + | |
26447 | + if (bytecount != sizeof(ldt_info)) | |
26448 | + goto out; | |
26449 | + error = -EFAULT; | |
26450 | + if (copy_from_user(&ldt_info, ptr, bytecount)) | |
26451 | + goto out; | |
26452 | + | |
26453 | + error = -EINVAL; | |
26454 | + if (ldt_info.entry_number >= LDT_ENTRIES) | |
26455 | + goto out; | |
26456 | + if (ldt_info.contents == 3) { | |
26457 | + if (oldmode) | |
26458 | + goto out; | |
26459 | + if (ldt_info.seg_not_present == 0) | |
26460 | + goto out; | |
26461 | + } | |
26462 | + | |
26463 | + down(&mm->context.sem); | |
26464 | + if (ldt_info.entry_number >= (unsigned)mm->context.size) { | |
26465 | + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | |
26466 | + if (error < 0) | |
26467 | + goto out_unlock; | |
26468 | + } | |
26469 | + | |
26470 | + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | |
26471 | + mach_lp = arbitrary_virt_to_machine(lp); | |
26472 | + | |
26473 | + /* Allow LDTs to be cleared by the user. */ | |
26474 | + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | |
26475 | + if (oldmode || LDT_empty(&ldt_info)) { | |
26476 | + entry_1 = 0; | |
26477 | + entry_2 = 0; | |
26478 | + goto install; | |
26479 | + } | |
26480 | + } | |
26481 | + | |
26482 | + entry_1 = LDT_entry_a(&ldt_info); | |
26483 | + entry_2 = LDT_entry_b(&ldt_info); | |
26484 | + if (oldmode) | |
26485 | + entry_2 &= ~(1 << 20); | |
26486 | + | |
26487 | + /* Install the new entry ... */ | |
26488 | +install: | |
26489 | + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); | |
26490 | + | |
26491 | +out_unlock: | |
26492 | + up(&mm->context.sem); | |
26493 | +out: | |
26494 | + return error; | |
26495 | +} | |
26496 | + | |
26497 | +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | |
26498 | +{ | |
26499 | + int ret = -ENOSYS; | |
26500 | + | |
26501 | + switch (func) { | |
26502 | + case 0: | |
26503 | + ret = read_ldt(ptr, bytecount); | |
26504 | + break; | |
26505 | + case 1: | |
26506 | + ret = write_ldt(ptr, bytecount, 1); | |
26507 | + break; | |
26508 | + case 2: | |
26509 | + ret = read_default_ldt(ptr, bytecount); | |
26510 | + break; | |
26511 | + case 0x11: | |
26512 | + ret = write_ldt(ptr, bytecount, 0); | |
26513 | + break; | |
26514 | + } | |
26515 | + return ret; | |
26516 | +} | |
26517 | Index: head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c | |
26518 | =================================================================== | |
26519 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
26520 | +++ head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200 | |
26521 | @@ -0,0 +1,1011 @@ | |
26522 | +/* | |
26523 | + * Intel Multiprocessor Specification 1.1 and 1.4 | |
26524 | + * compliant MP-table parsing routines. | |
26525 | + * | |
26526 | + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | |
26527 | + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | |
26528 | + * | |
26529 | + * Fixes | |
26530 | + * Erich Boleyn : MP v1.4 and additional changes. | |
26531 | + * Alan Cox : Added EBDA scanning | |
26532 | + * Ingo Molnar : various cleanups and rewrites | |
26533 | + * Maciej W. Rozycki: Bits for default MP configurations | |
26534 | + * Paul Diefenbaugh: Added full ACPI support | |
26535 | + */ | |
26536 | + | |
26537 | +#include <linux/mm.h> | |
26538 | +#include <linux/init.h> | |
26539 | +#include <linux/delay.h> | |
26540 | +#include <linux/bootmem.h> | |
26541 | +#include <linux/smp_lock.h> | |
26542 | +#include <linux/kernel_stat.h> | |
26543 | +#include <linux/mc146818rtc.h> | |
26544 | +#include <linux/acpi.h> | |
26545 | +#include <linux/module.h> | |
26546 | + | |
26547 | +#include <asm/smp.h> | |
26548 | +#include <asm/mtrr.h> | |
26549 | +#include <asm/mpspec.h> | |
26550 | +#include <asm/pgalloc.h> | |
26551 | +#include <asm/io_apic.h> | |
26552 | +#include <asm/proto.h> | |
26553 | +#include <asm/acpi.h> | |
26554 | + | |
26555 | +/* Have we found an MP table */ | |
26556 | +int smp_found_config; | |
26557 | +unsigned int __initdata maxcpus = NR_CPUS; | |
26558 | + | |
26559 | +int acpi_found_madt; | |
26560 | + | |
26561 | +/* | |
26562 | + * Various Linux-internal data structures created from the | |
26563 | + * MP-table. | |
26564 | + */ | |
26565 | +unsigned char apic_version [MAX_APICS]; | |
26566 | +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | |
26567 | +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | |
26568 | + | |
26569 | +static int mp_current_pci_id = 0; | |
26570 | +/* I/O APIC entries */ | |
26571 | +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | |
26572 | + | |
26573 | +/* # of MP IRQ source entries */ | |
26574 | +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | |
26575 | + | |
26576 | +/* MP IRQ source entries */ | |
26577 | +int mp_irq_entries; | |
26578 | + | |
26579 | +int nr_ioapics; | |
26580 | +int pic_mode; | |
26581 | +unsigned long mp_lapic_addr = 0; | |
26582 | + | |
26583 | + | |
26584 | + | |
26585 | +/* Processor that is doing the boot up */ | |
26586 | +unsigned int boot_cpu_id = -1U; | |
26587 | +/* Internal processor count */ | |
26588 | +unsigned int num_processors __initdata = 0; | |
26589 | + | |
26590 | +unsigned disabled_cpus __initdata; | |
26591 | + | |
26592 | +/* Bitmask of physically existing CPUs */ | |
26593 | +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | |
26594 | + | |
26595 | +/* ACPI MADT entry parsing functions */ | |
26596 | +#ifdef CONFIG_ACPI | |
26597 | +extern struct acpi_boot_flags acpi_boot; | |
26598 | +#ifdef CONFIG_X86_LOCAL_APIC | |
26599 | +extern int acpi_parse_lapic (acpi_table_entry_header *header); | |
26600 | +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); | |
26601 | +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); | |
26602 | +#endif /*CONFIG_X86_LOCAL_APIC*/ | |
26603 | +#ifdef CONFIG_X86_IO_APIC | |
26604 | +extern int acpi_parse_ioapic (acpi_table_entry_header *header); | |
26605 | +#endif /*CONFIG_X86_IO_APIC*/ | |
26606 | +#endif /*CONFIG_ACPI*/ | |
26607 | + | |
26608 | +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | |
26609 | + | |
26610 | + | |
26611 | +/* | |
26612 | + * Intel MP BIOS table parsing routines: | |
26613 | + */ | |
26614 | + | |
26615 | +/* | |
26616 | + * Checksum an MP configuration block. | |
26617 | + */ | |
26618 | + | |
26619 | +static int __init mpf_checksum(unsigned char *mp, int len) | |
26620 | +{ | |
26621 | + int sum = 0; | |
26622 | + | |
26623 | + while (len--) | |
26624 | + sum += *mp++; | |
26625 | + | |
26626 | + return sum & 0xFF; | |
26627 | +} | |
26628 | + | |
26629 | +#ifndef CONFIG_XEN | |
26630 | +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) | |
26631 | +{ | |
26632 | + int cpu; | |
26633 | + unsigned char ver; | |
26634 | + cpumask_t tmp_map; | |
26635 | + | |
26636 | + if (!(m->mpc_cpuflag & CPU_ENABLED)) { | |
26637 | + disabled_cpus++; | |
26638 | + return; | |
26639 | + } | |
26640 | + | |
26641 | + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", | |
26642 | + m->mpc_apicid, | |
26643 | + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, | |
26644 | + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, | |
26645 | + m->mpc_apicver); | |
26646 | + | |
26647 | + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | |
26648 | + Dprintk(" Bootup CPU\n"); | |
26649 | + boot_cpu_id = m->mpc_apicid; | |
26650 | + } | |
26651 | + if (num_processors >= NR_CPUS) { | |
26652 | + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | |
26653 | + " Processor ignored.\n", NR_CPUS); | |
26654 | + return; | |
26655 | + } | |
26656 | + | |
26657 | + num_processors++; | |
26658 | + cpus_complement(tmp_map, cpu_present_map); | |
26659 | + cpu = first_cpu(tmp_map); | |
26660 | + | |
26661 | +#if MAX_APICS < 255 | |
26662 | + if ((int)m->mpc_apicid > MAX_APICS) { | |
26663 | + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", | |
26664 | + m->mpc_apicid, MAX_APICS); | |
26665 | + return; | |
26666 | + } | |
26667 | +#endif | |
26668 | + ver = m->mpc_apicver; | |
26669 | + | |
26670 | + physid_set(m->mpc_apicid, phys_cpu_present_map); | |
26671 | + /* | |
26672 | + * Validate version | |
26673 | + */ | |
26674 | + if (ver == 0x0) { | |
26675 | + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); | |
26676 | + ver = 0x10; | |
26677 | + } | |
26678 | + apic_version[m->mpc_apicid] = ver; | |
26679 | + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | |
26680 | + /* | |
26681 | + * bios_cpu_apicid is required to have processors listed | |
26682 | + * in same order as logical cpu numbers. Hence the first | |
26683 | + * entry is BSP, and so on. | |
26684 | + */ | |
26685 | + cpu = 0; | |
26686 | + } | |
26687 | + bios_cpu_apicid[cpu] = m->mpc_apicid; | |
26688 | + x86_cpu_to_apicid[cpu] = m->mpc_apicid; | |
26689 | + | |
26690 | + cpu_set(cpu, cpu_possible_map); | |
26691 | + cpu_set(cpu, cpu_present_map); | |
26692 | +} | |
26693 | +#else | |
26694 | +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) | |
26695 | +{ | |
26696 | + num_processors++; | |
26697 | +} | |
26698 | +#endif /* CONFIG_XEN */ | |
26699 | + | |
26700 | +static void __init MP_bus_info (struct mpc_config_bus *m) | |
26701 | +{ | |
26702 | + char str[7]; | |
26703 | + | |
26704 | + memcpy(str, m->mpc_bustype, 6); | |
26705 | + str[6] = 0; | |
26706 | + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | |
26707 | + | |
26708 | + if (strncmp(str, "ISA", 3) == 0) { | |
26709 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | |
26710 | + } else if (strncmp(str, "EISA", 4) == 0) { | |
26711 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; | |
26712 | + } else if (strncmp(str, "PCI", 3) == 0) { | |
26713 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | |
26714 | + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | |
26715 | + mp_current_pci_id++; | |
26716 | + } else if (strncmp(str, "MCA", 3) == 0) { | |
26717 | + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; | |
26718 | + } else { | |
26719 | + printk(KERN_ERR "Unknown bustype %s\n", str); | |
26720 | + } | |
26721 | +} | |
26722 | + | |
26723 | +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | |
26724 | +{ | |
26725 | + if (!(m->mpc_flags & MPC_APIC_USABLE)) | |
26726 | + return; | |
26727 | + | |
26728 | + printk("I/O APIC #%d Version %d at 0x%X.\n", | |
26729 | + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | |
26730 | + if (nr_ioapics >= MAX_IO_APICS) { | |
26731 | + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", | |
26732 | + MAX_IO_APICS, nr_ioapics); | |
26733 | + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); | |
26734 | + } | |
26735 | + if (!m->mpc_apicaddr) { | |
26736 | + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" | |
26737 | + " found in MP table, skipping!\n"); | |
26738 | + return; | |
26739 | + } | |
26740 | + mp_ioapics[nr_ioapics] = *m; | |
26741 | + nr_ioapics++; | |
26742 | +} | |
26743 | + | |
26744 | +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | |
26745 | +{ | |
26746 | + mp_irqs [mp_irq_entries] = *m; | |
26747 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d," | |
26748 | + " IRQ %02x, APIC ID %x, APIC INT %02x\n", | |
26749 | + m->mpc_irqtype, m->mpc_irqflag & 3, | |
26750 | + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | |
26751 | + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | |
26752 | + if (++mp_irq_entries >= MAX_IRQ_SOURCES) | |
26753 | + panic("Max # of irq sources exceeded!!\n"); | |
26754 | +} | |
26755 | + | |
26756 | +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | |
26757 | +{ | |
26758 | + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | |
26759 | + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | |
26760 | + m->mpc_irqtype, m->mpc_irqflag & 3, | |
26761 | + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | |
26762 | + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | |
26763 | + /* | |
26764 | + * Well it seems all SMP boards in existence | |
26765 | + * use ExtINT/LVT1 == LINT0 and | |
26766 | + * NMI/LVT2 == LINT1 - the following check | |
26767 | + * will show us if this assumptions is false. | |
26768 | + * Until then we do not have to add baggage. | |
26769 | + */ | |
26770 | + if ((m->mpc_irqtype == mp_ExtINT) && | |
26771 | + (m->mpc_destapiclint != 0)) | |
26772 | + BUG(); | |
26773 | + if ((m->mpc_irqtype == mp_NMI) && | |
26774 | + (m->mpc_destapiclint != 1)) | |
26775 | + BUG(); | |
26776 | +} | |
26777 | + | |
26778 | +/* | |
26779 | + * Read/parse the MPC | |
26780 | + */ | |
26781 | + | |
26782 | +static int __init smp_read_mpc(struct mp_config_table *mpc) | |
26783 | +{ | |
26784 | + char str[16]; | |
26785 | + int count=sizeof(*mpc); | |
26786 | + unsigned char *mpt=((unsigned char *)mpc)+count; | |
26787 | + | |
26788 | + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | |
26789 | + printk("SMP mptable: bad signature [%c%c%c%c]!\n", | |
26790 | + mpc->mpc_signature[0], | |
26791 | + mpc->mpc_signature[1], | |
26792 | + mpc->mpc_signature[2], | |
26793 | + mpc->mpc_signature[3]); | |
26794 | + return 0; | |
26795 | + } | |
26796 | + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | |
26797 | + printk("SMP mptable: checksum error!\n"); | |
26798 | + return 0; | |
26799 | + } | |
26800 | + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | |
26801 | + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", | |
26802 | + mpc->mpc_spec); | |
26803 | + return 0; | |
26804 | + } | |
26805 | + if (!mpc->mpc_lapic) { | |
26806 | + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); | |
26807 | + return 0; | |
26808 | + } | |
26809 | + memcpy(str,mpc->mpc_oem,8); | |
26810 | + str[8]=0; | |
26811 | + printk(KERN_INFO "OEM ID: %s ",str); | |
26812 | + | |
26813 | + memcpy(str,mpc->mpc_productid,12); | |
26814 | + str[12]=0; | |
26815 | + printk("Product ID: %s ",str); | |
26816 | + | |
26817 | + printk("APIC at: 0x%X\n",mpc->mpc_lapic); | |
26818 | + | |
26819 | + /* save the local APIC address, it might be non-default */ | |
26820 | + if (!acpi_lapic) | |
26821 | + mp_lapic_addr = mpc->mpc_lapic; | |
26822 | + | |
26823 | + /* | |
26824 | + * Now process the configuration blocks. | |
26825 | + */ | |
26826 | + while (count < mpc->mpc_length) { | |
26827 | + switch(*mpt) { | |
26828 | + case MP_PROCESSOR: | |
26829 | + { | |
26830 | + struct mpc_config_processor *m= | |
26831 | + (struct mpc_config_processor *)mpt; | |
26832 | + if (!acpi_lapic) | |
26833 | + MP_processor_info(m); | |
26834 | + mpt += sizeof(*m); | |
26835 | + count += sizeof(*m); | |
26836 | + break; | |
26837 | + } | |
26838 | + case MP_BUS: | |
26839 | + { | |
26840 | + struct mpc_config_bus *m= | |
26841 | + (struct mpc_config_bus *)mpt; | |
26842 | + MP_bus_info(m); | |
26843 | + mpt += sizeof(*m); | |
26844 | + count += sizeof(*m); | |
26845 | + break; | |
26846 | + } | |
26847 | + case MP_IOAPIC: | |
26848 | + { | |
26849 | + struct mpc_config_ioapic *m= | |
26850 | + (struct mpc_config_ioapic *)mpt; | |
26851 | + MP_ioapic_info(m); | |
26852 | + mpt+=sizeof(*m); | |
26853 | + count+=sizeof(*m); | |
26854 | + break; | |
26855 | + } | |
26856 | + case MP_INTSRC: | |
26857 | + { | |
26858 | + struct mpc_config_intsrc *m= | |
26859 | + (struct mpc_config_intsrc *)mpt; | |
26860 | + | |
26861 | + MP_intsrc_info(m); | |
26862 | + mpt+=sizeof(*m); | |
26863 | + count+=sizeof(*m); | |
26864 | + break; | |
26865 | + } | |
26866 | + case MP_LINTSRC: | |
26867 | + { | |
26868 | + struct mpc_config_lintsrc *m= | |
26869 | + (struct mpc_config_lintsrc *)mpt; | |
26870 | + MP_lintsrc_info(m); | |
26871 | + mpt+=sizeof(*m); | |
26872 | + count+=sizeof(*m); | |
26873 | + break; | |
26874 | + } | |
26875 | + } | |
26876 | + } | |
26877 | + clustered_apic_check(); | |
26878 | + if (!num_processors) | |
26879 | + printk(KERN_ERR "SMP mptable: no processors registered!\n"); | |
26880 | + return num_processors; | |
26881 | +} | |
26882 | + | |
26883 | +static int __init ELCR_trigger(unsigned int irq) | |
26884 | +{ | |
26885 | + unsigned int port; | |
26886 | + | |
26887 | + port = 0x4d0 + (irq >> 3); | |
26888 | + return (inb(port) >> (irq & 7)) & 1; | |
26889 | +} | |
26890 | + | |
26891 | +static void __init construct_default_ioirq_mptable(int mpc_default_type) | |
26892 | +{ | |
26893 | + struct mpc_config_intsrc intsrc; | |
26894 | + int i; | |
26895 | + int ELCR_fallback = 0; | |
26896 | + | |
26897 | + intsrc.mpc_type = MP_INTSRC; | |
26898 | + intsrc.mpc_irqflag = 0; /* conforming */ | |
26899 | + intsrc.mpc_srcbus = 0; | |
26900 | + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | |
26901 | + | |
26902 | + intsrc.mpc_irqtype = mp_INT; | |
26903 | + | |
26904 | + /* | |
26905 | + * If true, we have an ISA/PCI system with no IRQ entries | |
26906 | + * in the MP table. To prevent the PCI interrupts from being set up | |
26907 | + * incorrectly, we try to use the ELCR. The sanity check to see if | |
26908 | + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | |
26909 | + * never be level sensitive, so we simply see if the ELCR agrees. | |
26910 | + * If it does, we assume it's valid. | |
26911 | + */ | |
26912 | + if (mpc_default_type == 5) { | |
26913 | + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | |
26914 | + | |
26915 | + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | |
26916 | + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); | |
26917 | + else { | |
26918 | + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | |
26919 | + ELCR_fallback = 1; | |
26920 | + } | |
26921 | + } | |
26922 | + | |
26923 | + for (i = 0; i < 16; i++) { | |
26924 | + switch (mpc_default_type) { | |
26925 | + case 2: | |
26926 | + if (i == 0 || i == 13) | |
26927 | + continue; /* IRQ0 & IRQ13 not connected */ | |
26928 | + /* fall through */ | |
26929 | + default: | |
26930 | + if (i == 2) | |
26931 | + continue; /* IRQ2 is never connected */ | |
26932 | + } | |
26933 | + | |
26934 | + if (ELCR_fallback) { | |
26935 | + /* | |
26936 | + * If the ELCR indicates a level-sensitive interrupt, we | |
26937 | + * copy that information over to the MP table in the | |
26938 | + * irqflag field (level sensitive, active high polarity). | |
26939 | + */ | |
26940 | + if (ELCR_trigger(i)) | |
26941 | + intsrc.mpc_irqflag = 13; | |
26942 | + else | |
26943 | + intsrc.mpc_irqflag = 0; | |
26944 | + } | |
26945 | + | |
26946 | + intsrc.mpc_srcbusirq = i; | |
26947 | + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | |
26948 | + MP_intsrc_info(&intsrc); | |
26949 | + } | |
26950 | + | |
26951 | + intsrc.mpc_irqtype = mp_ExtINT; | |
26952 | + intsrc.mpc_srcbusirq = 0; | |
26953 | + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | |
26954 | + MP_intsrc_info(&intsrc); | |
26955 | +} | |
26956 | + | |
26957 | +static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |
26958 | +{ | |
26959 | + struct mpc_config_processor processor; | |
26960 | + struct mpc_config_bus bus; | |
26961 | + struct mpc_config_ioapic ioapic; | |
26962 | + struct mpc_config_lintsrc lintsrc; | |
26963 | + int linttypes[2] = { mp_ExtINT, mp_NMI }; | |
26964 | + int i; | |
26965 | + | |
26966 | + /* | |
26967 | + * local APIC has default address | |
26968 | + */ | |
26969 | + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | |
26970 | + | |
26971 | + /* | |
26972 | + * 2 CPUs, numbered 0 & 1. | |
26973 | + */ | |
26974 | + processor.mpc_type = MP_PROCESSOR; | |
26975 | + /* Either an integrated APIC or a discrete 82489DX. */ | |
26976 | + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | |
26977 | + processor.mpc_cpuflag = CPU_ENABLED; | |
26978 | + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | |
26979 | + (boot_cpu_data.x86_model << 4) | | |
26980 | + boot_cpu_data.x86_mask; | |
26981 | + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | |
26982 | + processor.mpc_reserved[0] = 0; | |
26983 | + processor.mpc_reserved[1] = 0; | |
26984 | + for (i = 0; i < 2; i++) { | |
26985 | + processor.mpc_apicid = i; | |
26986 | + MP_processor_info(&processor); | |
26987 | + } | |
26988 | + | |
26989 | + bus.mpc_type = MP_BUS; | |
26990 | + bus.mpc_busid = 0; | |
26991 | + switch (mpc_default_type) { | |
26992 | + default: | |
26993 | + printk(KERN_ERR "???\nUnknown standard configuration %d\n", | |
26994 | + mpc_default_type); | |
26995 | + /* fall through */ | |
26996 | + case 1: | |
26997 | + case 5: | |
26998 | + memcpy(bus.mpc_bustype, "ISA ", 6); | |
26999 | + break; | |
27000 | + case 2: | |
27001 | + case 6: | |
27002 | + case 3: | |
27003 | + memcpy(bus.mpc_bustype, "EISA ", 6); | |
27004 | + break; | |
27005 | + case 4: | |
27006 | + case 7: | |
27007 | + memcpy(bus.mpc_bustype, "MCA ", 6); | |
27008 | + } | |
27009 | + MP_bus_info(&bus); | |
27010 | + if (mpc_default_type > 4) { | |
27011 | + bus.mpc_busid = 1; | |
27012 | + memcpy(bus.mpc_bustype, "PCI ", 6); | |
27013 | + MP_bus_info(&bus); | |
27014 | + } | |
27015 | + | |
27016 | + ioapic.mpc_type = MP_IOAPIC; | |
27017 | + ioapic.mpc_apicid = 2; | |
27018 | + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | |
27019 | + ioapic.mpc_flags = MPC_APIC_USABLE; | |
27020 | + ioapic.mpc_apicaddr = 0xFEC00000; | |
27021 | + MP_ioapic_info(&ioapic); | |
27022 | + | |
27023 | + /* | |
27024 | + * We set up most of the low 16 IO-APIC pins according to MPS rules. | |
27025 | + */ | |
27026 | + construct_default_ioirq_mptable(mpc_default_type); | |
27027 | + | |
27028 | + lintsrc.mpc_type = MP_LINTSRC; | |
27029 | + lintsrc.mpc_irqflag = 0; /* conforming */ | |
27030 | + lintsrc.mpc_srcbusid = 0; | |
27031 | + lintsrc.mpc_srcbusirq = 0; | |
27032 | + lintsrc.mpc_destapic = MP_APIC_ALL; | |
27033 | + for (i = 0; i < 2; i++) { | |
27034 | + lintsrc.mpc_irqtype = linttypes[i]; | |
27035 | + lintsrc.mpc_destapiclint = i; | |
27036 | + MP_lintsrc_info(&lintsrc); | |
27037 | + } | |
27038 | +} | |
27039 | + | |
27040 | +static struct intel_mp_floating *mpf_found; | |
27041 | + | |
27042 | +/* | |
27043 | + * Scan the memory blocks for an SMP configuration block. | |
27044 | + */ | |
27045 | +void __init get_smp_config (void) | |
27046 | +{ | |
27047 | + struct intel_mp_floating *mpf = mpf_found; | |
27048 | + | |
27049 | + /* | |
27050 | + * ACPI supports both logical (e.g. Hyper-Threading) and physical | |
27051 | + * processors, where MPS only supports physical. | |
27052 | + */ | |
27053 | + if (acpi_lapic && acpi_ioapic) { | |
27054 | + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | |
27055 | + return; | |
27056 | + } | |
27057 | + else if (acpi_lapic) | |
27058 | + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | |
27059 | + | |
27060 | + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | |
27061 | + if (mpf->mpf_feature2 & (1<<7)) { | |
27062 | + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | |
27063 | + pic_mode = 1; | |
27064 | + } else { | |
27065 | + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); | |
27066 | + pic_mode = 0; | |
27067 | + } | |
27068 | + | |
27069 | + /* | |
27070 | + * Now see if we need to read further. | |
27071 | + */ | |
27072 | + if (mpf->mpf_feature1 != 0) { | |
27073 | + | |
27074 | + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | |
27075 | + construct_default_ISA_mptable(mpf->mpf_feature1); | |
27076 | + | |
27077 | + } else if (mpf->mpf_physptr) { | |
27078 | + | |
27079 | + /* | |
27080 | + * Read the physical hardware table. Anything here will | |
27081 | + * override the defaults. | |
27082 | + */ | |
27083 | + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { | |
27084 | + smp_found_config = 0; | |
27085 | + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | |
27086 | + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | |
27087 | + return; | |
27088 | + } | |
27089 | + /* | |
27090 | + * If there are no explicit MP IRQ entries, then we are | |
27091 | + * broken. We set up most of the low 16 IO-APIC pins to | |
27092 | + * ISA defaults and hope it will work. | |
27093 | + */ | |
27094 | + if (!mp_irq_entries) { | |
27095 | + struct mpc_config_bus bus; | |
27096 | + | |
27097 | + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | |
27098 | + | |
27099 | + bus.mpc_type = MP_BUS; | |
27100 | + bus.mpc_busid = 0; | |
27101 | + memcpy(bus.mpc_bustype, "ISA ", 6); | |
27102 | + MP_bus_info(&bus); | |
27103 | + | |
27104 | + construct_default_ioirq_mptable(0); | |
27105 | + } | |
27106 | + | |
27107 | + } else | |
27108 | + BUG(); | |
27109 | + | |
27110 | + printk(KERN_INFO "Processors: %d\n", num_processors); | |
27111 | + /* | |
27112 | + * Only use the first configuration found. | |
27113 | + */ | |
27114 | +} | |
27115 | + | |
27116 | +static int __init smp_scan_config (unsigned long base, unsigned long length) | |
27117 | +{ | |
27118 | + extern void __bad_mpf_size(void); | |
27119 | + unsigned int *bp = isa_bus_to_virt(base); | |
27120 | + struct intel_mp_floating *mpf; | |
27121 | + | |
27122 | + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | |
27123 | + if (sizeof(*mpf) != 16) | |
27124 | + __bad_mpf_size(); | |
27125 | + | |
27126 | + while (length > 0) { | |
27127 | + mpf = (struct intel_mp_floating *)bp; | |
27128 | + if ((*bp == SMP_MAGIC_IDENT) && | |
27129 | + (mpf->mpf_length == 1) && | |
27130 | + !mpf_checksum((unsigned char *)bp, 16) && | |
27131 | + ((mpf->mpf_specification == 1) | |
27132 | + || (mpf->mpf_specification == 4)) ) { | |
27133 | + | |
27134 | + smp_found_config = 1; | |
27135 | + mpf_found = mpf; | |
27136 | + return 1; | |
27137 | + } | |
27138 | + bp += 4; | |
27139 | + length -= 16; | |
27140 | + } | |
27141 | + return 0; | |
27142 | +} | |
27143 | + | |
27144 | +void __init find_intel_smp (void) | |
27145 | +{ | |
27146 | + unsigned int address; | |
27147 | + | |
27148 | + /* | |
27149 | + * FIXME: Linux assumes you have 640K of base ram.. | |
27150 | + * this continues the error... | |
27151 | + * | |
27152 | + * 1) Scan the bottom 1K for a signature | |
27153 | + * 2) Scan the top 1K of base RAM | |
27154 | + * 3) Scan the 64K of bios | |
27155 | + */ | |
27156 | + if (smp_scan_config(0x0,0x400) || | |
27157 | + smp_scan_config(639*0x400,0x400) || | |
27158 | + smp_scan_config(0xF0000,0x10000)) | |
27159 | + return; | |
27160 | + /* | |
27161 | + * If it is an SMP machine we should know now, unless the | |
27162 | + * configuration is in an EISA/MCA bus machine with an | |
27163 | + * extended bios data area. | |
27164 | + * | |
27165 | + * there is a real-mode segmented pointer pointing to the | |
27166 | + * 4K EBDA area at 0x40E, calculate and scan it here. | |
27167 | + * | |
27168 | + * NOTE! There are Linux loaders that will corrupt the EBDA | |
27169 | + * area, and as such this kind of SMP config may be less | |
27170 | + * trustworthy, simply because the SMP table may have been | |
27171 | + * stomped on during early boot. These loaders are buggy and | |
27172 | + * should be fixed. | |
27173 | + */ | |
27174 | + | |
27175 | + address = *(unsigned short *)phys_to_virt(0x40E); | |
27176 | + address <<= 4; | |
27177 | + if (smp_scan_config(address, 0x1000)) | |
27178 | + return; | |
27179 | + | |
27180 | + /* If we have come this far, we did not find an MP table */ | |
27181 | + printk(KERN_INFO "No mptable found.\n"); | |
27182 | +} | |
27183 | + | |
27184 | +/* | |
27185 | + * - Intel MP Configuration Table | |
27186 | + */ | |
27187 | +void __init find_smp_config (void) | |
27188 | +{ | |
27189 | +#ifdef CONFIG_X86_LOCAL_APIC | |
27190 | + find_intel_smp(); | |
27191 | +#endif | |
27192 | +} | |
27193 | + | |
27194 | + | |
27195 | +/* -------------------------------------------------------------------------- | |
27196 | + ACPI-based MP Configuration | |
27197 | + -------------------------------------------------------------------------- */ | |
27198 | + | |
27199 | +#ifdef CONFIG_ACPI | |
27200 | + | |
27201 | +void __init mp_register_lapic_address ( | |
27202 | + u64 address) | |
27203 | +{ | |
27204 | +#ifndef CONFIG_XEN | |
27205 | + mp_lapic_addr = (unsigned long) address; | |
27206 | + | |
27207 | + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | |
27208 | + | |
27209 | + if (boot_cpu_id == -1U) | |
27210 | + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | |
27211 | + | |
27212 | + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); | |
27213 | +#endif | |
27214 | +} | |
27215 | + | |
27216 | + | |
27217 | +void __cpuinit mp_register_lapic ( | |
27218 | + u8 id, | |
27219 | + u8 enabled) | |
27220 | +{ | |
27221 | + struct mpc_config_processor processor; | |
27222 | + int boot_cpu = 0; | |
27223 | + | |
27224 | + if (id >= MAX_APICS) { | |
27225 | + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", | |
27226 | + id, MAX_APICS); | |
27227 | + return; | |
27228 | + } | |
27229 | + | |
27230 | + if (id == boot_cpu_physical_apicid) | |
27231 | + boot_cpu = 1; | |
27232 | + | |
27233 | +#ifndef CONFIG_XEN | |
27234 | + processor.mpc_type = MP_PROCESSOR; | |
27235 | + processor.mpc_apicid = id; | |
27236 | + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); | |
27237 | + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | |
27238 | + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | |
27239 | + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | |
27240 | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | |
27241 | + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | |
27242 | + processor.mpc_reserved[0] = 0; | |
27243 | + processor.mpc_reserved[1] = 0; | |
27244 | +#endif | |
27245 | + | |
27246 | + MP_processor_info(&processor); | |
27247 | +} | |
27248 | + | |
27249 | +#ifdef CONFIG_X86_IO_APIC | |
27250 | + | |
27251 | +#define MP_ISA_BUS 0 | |
27252 | +#define MP_MAX_IOAPIC_PIN 127 | |
27253 | + | |
27254 | +static struct mp_ioapic_routing { | |
27255 | + int apic_id; | |
27256 | + int gsi_start; | |
27257 | + int gsi_end; | |
27258 | + u32 pin_programmed[4]; | |
27259 | +} mp_ioapic_routing[MAX_IO_APICS]; | |
27260 | + | |
27261 | + | |
27262 | +static int mp_find_ioapic ( | |
27263 | + int gsi) | |
27264 | +{ | |
27265 | + int i = 0; | |
27266 | + | |
27267 | + /* Find the IOAPIC that manages this GSI. */ | |
27268 | + for (i = 0; i < nr_ioapics; i++) { | |
27269 | + if ((gsi >= mp_ioapic_routing[i].gsi_start) | |
27270 | + && (gsi <= mp_ioapic_routing[i].gsi_end)) | |
27271 | + return i; | |
27272 | + } | |
27273 | + | |
27274 | + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | |
27275 | + | |
27276 | + return -1; | |
27277 | +} | |
27278 | + | |
27279 | + | |
27280 | +void __init mp_register_ioapic ( | |
27281 | + u8 id, | |
27282 | + u32 address, | |
27283 | + u32 gsi_base) | |
27284 | +{ | |
27285 | + int idx = 0; | |
27286 | + | |
27287 | + if (nr_ioapics >= MAX_IO_APICS) { | |
27288 | + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | |
27289 | + "(found %d)\n", MAX_IO_APICS, nr_ioapics); | |
27290 | + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | |
27291 | + } | |
27292 | + if (!address) { | |
27293 | + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | |
27294 | + " found in MADT table, skipping!\n"); | |
27295 | + return; | |
27296 | + } | |
27297 | + | |
27298 | + idx = nr_ioapics++; | |
27299 | + | |
27300 | + mp_ioapics[idx].mpc_type = MP_IOAPIC; | |
27301 | + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | |
27302 | + mp_ioapics[idx].mpc_apicaddr = address; | |
27303 | + | |
27304 | +#ifndef CONFIG_XEN | |
27305 | + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | |
27306 | +#endif | |
27307 | + mp_ioapics[idx].mpc_apicid = id; | |
27308 | + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | |
27309 | + | |
27310 | + /* | |
27311 | + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups | |
27312 | + * and to prevent reprogramming of IOAPIC pins (PCI IRQs). | |
27313 | + */ | |
27314 | + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | |
27315 | + mp_ioapic_routing[idx].gsi_start = gsi_base; | |
27316 | + mp_ioapic_routing[idx].gsi_end = gsi_base + | |
27317 | + io_apic_get_redir_entries(idx); | |
27318 | + | |
27319 | + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | |
27320 | + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | |
27321 | + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | |
27322 | + mp_ioapic_routing[idx].gsi_start, | |
27323 | + mp_ioapic_routing[idx].gsi_end); | |
27324 | + | |
27325 | + return; | |
27326 | +} | |
27327 | + | |
27328 | + | |
27329 | +void __init mp_override_legacy_irq ( | |
27330 | + u8 bus_irq, | |
27331 | + u8 polarity, | |
27332 | + u8 trigger, | |
27333 | + u32 gsi) | |
27334 | +{ | |
27335 | + struct mpc_config_intsrc intsrc; | |
27336 | + int ioapic = -1; | |
27337 | + int pin = -1; | |
27338 | + | |
27339 | + /* | |
27340 | + * Convert 'gsi' to 'ioapic.pin'. | |
27341 | + */ | |
27342 | + ioapic = mp_find_ioapic(gsi); | |
27343 | + if (ioapic < 0) | |
27344 | + return; | |
27345 | + pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | |
27346 | + | |
27347 | + /* | |
27348 | + * TBD: This check is for faulty timer entries, where the override | |
27349 | + * erroneously sets the trigger to level, resulting in a HUGE | |
27350 | + * increase of timer interrupts! | |
27351 | + */ | |
27352 | + if ((bus_irq == 0) && (trigger == 3)) | |
27353 | + trigger = 1; | |
27354 | + | |
27355 | + intsrc.mpc_type = MP_INTSRC; | |
27356 | + intsrc.mpc_irqtype = mp_INT; | |
27357 | + intsrc.mpc_irqflag = (trigger << 2) | polarity; | |
27358 | + intsrc.mpc_srcbus = MP_ISA_BUS; | |
27359 | + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | |
27360 | + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | |
27361 | + intsrc.mpc_dstirq = pin; /* INTIN# */ | |
27362 | + | |
27363 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | |
27364 | + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | |
27365 | + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | |
27366 | + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | |
27367 | + | |
27368 | + mp_irqs[mp_irq_entries] = intsrc; | |
27369 | + if (++mp_irq_entries == MAX_IRQ_SOURCES) | |
27370 | + panic("Max # of irq sources exceeded!\n"); | |
27371 | + | |
27372 | + return; | |
27373 | +} | |
27374 | + | |
27375 | + | |
27376 | +void __init mp_config_acpi_legacy_irqs (void) | |
27377 | +{ | |
27378 | + struct mpc_config_intsrc intsrc; | |
27379 | + int i = 0; | |
27380 | + int ioapic = -1; | |
27381 | + | |
27382 | + /* | |
27383 | + * Fabricate the legacy ISA bus (bus #31). | |
27384 | + */ | |
27385 | + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | |
27386 | + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | |
27387 | + | |
27388 | + /* | |
27389 | + * Locate the IOAPIC that manages the ISA IRQs (0-15). | |
27390 | + */ | |
27391 | + ioapic = mp_find_ioapic(0); | |
27392 | + if (ioapic < 0) | |
27393 | + return; | |
27394 | + | |
27395 | + intsrc.mpc_type = MP_INTSRC; | |
27396 | + intsrc.mpc_irqflag = 0; /* Conforming */ | |
27397 | + intsrc.mpc_srcbus = MP_ISA_BUS; | |
27398 | + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | |
27399 | + | |
27400 | + /* | |
27401 | + * Use the default configuration for the IRQs 0-15. Unless | |
27402 | + * overridden by (MADT) interrupt source override entries. | |
27403 | + */ | |
27404 | + for (i = 0; i < 16; i++) { | |
27405 | + int idx; | |
27406 | + | |
27407 | + for (idx = 0; idx < mp_irq_entries; idx++) { | |
27408 | + struct mpc_config_intsrc *irq = mp_irqs + idx; | |
27409 | + | |
27410 | + /* Do we already have a mapping for this ISA IRQ? */ | |
27411 | + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | |
27412 | + break; | |
27413 | + | |
27414 | + /* Do we already have a mapping for this IOAPIC pin */ | |
27415 | + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | |
27416 | + (irq->mpc_dstirq == i)) | |
27417 | + break; | |
27418 | + } | |
27419 | + | |
27420 | + if (idx != mp_irq_entries) { | |
27421 | + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | |
27422 | + continue; /* IRQ already used */ | |
27423 | + } | |
27424 | + | |
27425 | + intsrc.mpc_irqtype = mp_INT; | |
27426 | + intsrc.mpc_srcbusirq = i; /* Identity mapped */ | |
27427 | + intsrc.mpc_dstirq = i; | |
27428 | + | |
27429 | + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | |
27430 | + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | |
27431 | + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | |
27432 | + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | |
27433 | + intsrc.mpc_dstirq); | |
27434 | + | |
27435 | + mp_irqs[mp_irq_entries] = intsrc; | |
27436 | + if (++mp_irq_entries == MAX_IRQ_SOURCES) | |
27437 | + panic("Max # of irq sources exceeded!\n"); | |
27438 | + } | |
27439 | + | |
27440 | + return; | |
27441 | +} | |
27442 | + | |
27443 | +#define MAX_GSI_NUM 4096 | |
27444 | + | |
27445 | +int mp_register_gsi(u32 gsi, int triggering, int polarity) | |
27446 | +{ | |
27447 | + int ioapic = -1; | |
27448 | + int ioapic_pin = 0; | |
27449 | + int idx, bit = 0; | |
27450 | + static int pci_irq = 16; | |
27451 | + /* | |
27452 | + * Mapping between Global System Interrupts, which | |
27453 | + * represent all possible interrupts, to the IRQs | |
27454 | + * assigned to actual devices. | |
27455 | + */ | |
27456 | + static int gsi_to_irq[MAX_GSI_NUM]; | |
27457 | + | |
27458 | + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | |
27459 | + return gsi; | |
27460 | + | |
27461 | + /* Don't set up the ACPI SCI because it's already set up */ | |
27462 | + if (acpi_fadt.sci_int == gsi) | |
27463 | + return gsi; | |
27464 | + | |
27465 | + ioapic = mp_find_ioapic(gsi); | |
27466 | + if (ioapic < 0) { | |
27467 | + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | |
27468 | + return gsi; | |
27469 | + } | |
27470 | + | |
27471 | + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | |
27472 | + | |
27473 | + /* | |
27474 | + * Avoid pin reprogramming. PRTs typically include entries | |
27475 | + * with redundant pin->gsi mappings (but unique PCI devices); | |
27476 | + * we only program the IOAPIC on the first. | |
27477 | + */ | |
27478 | + bit = ioapic_pin % 32; | |
27479 | + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | |
27480 | + if (idx > 3) { | |
27481 | + printk(KERN_ERR "Invalid reference to IOAPIC pin " | |
27482 | + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | |
27483 | + ioapic_pin); | |
27484 | + return gsi; | |
27485 | + } | |
27486 | + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | |
27487 | + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | |
27488 | + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | |
27489 | + return gsi_to_irq[gsi]; | |
27490 | + } | |
27491 | + | |
27492 | + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | |
27493 | + | |
27494 | + if (triggering == ACPI_LEVEL_SENSITIVE) { | |
27495 | + /* | |
27496 | + * For PCI devices assign IRQs in order, avoiding gaps | |
27497 | + * due to unused I/O APIC pins. | |
27498 | + */ | |
27499 | + int irq = gsi; | |
27500 | + if (gsi < MAX_GSI_NUM) { | |
27501 | + /* | |
27502 | + * Retain the VIA chipset work-around (gsi > 15), but | |
27503 | + * avoid a problem where the 8254 timer (IRQ0) is setup | |
27504 | + * via an override (so it's not on pin 0 of the ioapic), | |
27505 | + * and at the same time, the pin 0 interrupt is a PCI | |
27506 | + * type. The gsi > 15 test could cause these two pins | |
27507 | + * to be shared as IRQ0, and they are not shareable. | |
27508 | + * So test for this condition, and if necessary, avoid | |
27509 | + * the pin collision. | |
27510 | + */ | |
27511 | + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) | |
27512 | + gsi = pci_irq++; | |
27513 | + /* | |
27514 | + * Don't assign IRQ used by ACPI SCI | |
27515 | + */ | |
27516 | + if (gsi == acpi_fadt.sci_int) | |
27517 | + gsi = pci_irq++; | |
27518 | + gsi_to_irq[irq] = gsi; | |
27519 | + } else { | |
27520 | + printk(KERN_ERR "GSI %u is too high\n", gsi); | |
27521 | + return gsi; | |
27522 | + } | |
27523 | + } | |
27524 | + | |
27525 | + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | |
27526 | + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | |
27527 | + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | |
27528 | + return gsi; | |
27529 | +} | |
27530 | + | |
27531 | +#endif /*CONFIG_X86_IO_APIC*/ | |
27532 | +#endif /*CONFIG_ACPI*/ | |
27533 | Index: head-2008-11-25/arch/x86/kernel/process_64-xen.c | |
27534 | =================================================================== | |
27535 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
27536 | +++ head-2008-11-25/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200 | |
27537 | @@ -0,0 +1,848 @@ | |
27538 | +/* | |
27539 | + * linux/arch/x86-64/kernel/process.c | |
27540 | + * | |
27541 | + * Copyright (C) 1995 Linus Torvalds | |
27542 | + * | |
27543 | + * Pentium III FXSR, SSE support | |
27544 | + * Gareth Hughes <gareth@valinux.com>, May 2000 | |
27545 | + * | |
27546 | + * X86-64 port | |
27547 | + * Andi Kleen. | |
27548 | + * | |
27549 | + * CPU hotplug support - ashok.raj@intel.com | |
27550 | + * | |
27551 | + * Jun Nakajima <jun.nakajima@intel.com> | |
27552 | + * Modified for Xen | |
27553 | + */ | |
27554 | + | |
27555 | +/* | |
27556 | + * This file handles the architecture-dependent parts of process handling.. | |
27557 | + */ | |
27558 | + | |
27559 | +#include <stdarg.h> | |
27560 | + | |
27561 | +#include <linux/cpu.h> | |
27562 | +#include <linux/errno.h> | |
27563 | +#include <linux/sched.h> | |
27564 | +#include <linux/kernel.h> | |
27565 | +#include <linux/mm.h> | |
27566 | +#include <linux/elfcore.h> | |
27567 | +#include <linux/smp.h> | |
27568 | +#include <linux/slab.h> | |
27569 | +#include <linux/user.h> | |
27570 | +#include <linux/module.h> | |
27571 | +#include <linux/a.out.h> | |
27572 | +#include <linux/interrupt.h> | |
27573 | +#include <linux/delay.h> | |
27574 | +#include <linux/ptrace.h> | |
27575 | +#include <linux/utsname.h> | |
27576 | +#include <linux/random.h> | |
27577 | +#include <linux/notifier.h> | |
27578 | +#include <linux/kprobes.h> | |
27579 | + | |
27580 | +#include <asm/uaccess.h> | |
27581 | +#include <asm/pgtable.h> | |
27582 | +#include <asm/system.h> | |
27583 | +#include <asm/io.h> | |
27584 | +#include <asm/processor.h> | |
27585 | +#include <asm/i387.h> | |
27586 | +#include <asm/mmu_context.h> | |
27587 | +#include <asm/pda.h> | |
27588 | +#include <asm/prctl.h> | |
27589 | +#include <asm/kdebug.h> | |
27590 | +#include <xen/interface/platform.h> | |
27591 | +#include <xen/interface/physdev.h> | |
27592 | +#include <xen/interface/vcpu.h> | |
27593 | +#include <asm/desc.h> | |
27594 | +#include <asm/proto.h> | |
27595 | +#include <asm/hardirq.h> | |
27596 | +#include <asm/ia32.h> | |
27597 | +#include <asm/idle.h> | |
27598 | + | |
27599 | +#include <xen/cpu_hotplug.h> | |
27600 | + | |
27601 | +asmlinkage extern void ret_from_fork(void); | |
27602 | + | |
27603 | +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | |
27604 | + | |
27605 | +unsigned long boot_option_idle_override = 0; | |
27606 | +EXPORT_SYMBOL(boot_option_idle_override); | |
27607 | + | |
27608 | +/* | |
27609 | + * Powermanagement idle function, if any.. | |
27610 | + */ | |
27611 | +void (*pm_idle)(void); | |
27612 | +EXPORT_SYMBOL(pm_idle); | |
27613 | +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | |
27614 | + | |
27615 | +static ATOMIC_NOTIFIER_HEAD(idle_notifier); | |
27616 | + | |
27617 | +void idle_notifier_register(struct notifier_block *n) | |
27618 | +{ | |
27619 | + atomic_notifier_chain_register(&idle_notifier, n); | |
27620 | +} | |
27621 | +EXPORT_SYMBOL_GPL(idle_notifier_register); | |
27622 | + | |
27623 | +void idle_notifier_unregister(struct notifier_block *n) | |
27624 | +{ | |
27625 | + atomic_notifier_chain_unregister(&idle_notifier, n); | |
27626 | +} | |
27627 | +EXPORT_SYMBOL(idle_notifier_unregister); | |
27628 | + | |
27629 | +enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; | |
27630 | +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; | |
27631 | + | |
27632 | +void enter_idle(void) | |
27633 | +{ | |
27634 | + __get_cpu_var(idle_state) = CPU_IDLE; | |
27635 | + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); | |
27636 | +} | |
27637 | + | |
27638 | +static void __exit_idle(void) | |
27639 | +{ | |
27640 | + __get_cpu_var(idle_state) = CPU_NOT_IDLE; | |
27641 | + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); | |
27642 | +} | |
27643 | + | |
27644 | +/* Called from interrupts to signify idle end */ | |
27645 | +void exit_idle(void) | |
27646 | +{ | |
27647 | + if (current->pid | read_pda(irqcount)) | |
27648 | + return; | |
27649 | + __exit_idle(); | |
27650 | +} | |
27651 | + | |
27652 | +/* | |
27653 | + * On SMP it's slightly faster (but much more power-consuming!) | |
27654 | + * to poll the ->need_resched flag instead of waiting for the | |
27655 | + * cross-CPU IPI to arrive. Use this option with caution. | |
27656 | + */ | |
27657 | +static void poll_idle (void) | |
27658 | +{ | |
27659 | + local_irq_enable(); | |
27660 | + | |
27661 | + asm volatile( | |
27662 | + "2:" | |
27663 | + "testl %0,%1;" | |
27664 | + "rep; nop;" | |
27665 | + "je 2b;" | |
27666 | + : : | |
27667 | + "i" (_TIF_NEED_RESCHED), | |
27668 | + "m" (current_thread_info()->flags)); | |
27669 | +} | |
27670 | + | |
27671 | +static void xen_idle(void) | |
27672 | +{ | |
27673 | + local_irq_disable(); | |
27674 | + | |
27675 | + if (need_resched()) | |
27676 | + local_irq_enable(); | |
27677 | + else { | |
27678 | + current_thread_info()->status &= ~TS_POLLING; | |
27679 | + smp_mb__after_clear_bit(); | |
27680 | + safe_halt(); | |
27681 | + current_thread_info()->status |= TS_POLLING; | |
27682 | + } | |
27683 | +} | |
27684 | + | |
27685 | +#ifdef CONFIG_HOTPLUG_CPU | |
27686 | +static inline void play_dead(void) | |
27687 | +{ | |
27688 | + idle_task_exit(); | |
27689 | + local_irq_disable(); | |
27690 | + cpu_clear(smp_processor_id(), cpu_initialized); | |
27691 | + preempt_enable_no_resched(); | |
27692 | + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); | |
27693 | + cpu_bringup(); | |
27694 | +} | |
27695 | +#else | |
27696 | +static inline void play_dead(void) | |
27697 | +{ | |
27698 | + BUG(); | |
27699 | +} | |
27700 | +#endif /* CONFIG_HOTPLUG_CPU */ | |
27701 | + | |
27702 | +/* | |
27703 | + * The idle thread. There's no useful work to be | |
27704 | + * done, so just try to conserve power and have a | |
27705 | + * low exit latency (ie sit in a loop waiting for | |
27706 | + * somebody to say that they'd like to reschedule) | |
27707 | + */ | |
27708 | +void cpu_idle (void) | |
27709 | +{ | |
27710 | + current_thread_info()->status |= TS_POLLING; | |
27711 | + /* endless idle loop with no priority at all */ | |
27712 | + while (1) { | |
27713 | + while (!need_resched()) { | |
27714 | + void (*idle)(void); | |
27715 | + | |
27716 | + if (__get_cpu_var(cpu_idle_state)) | |
27717 | + __get_cpu_var(cpu_idle_state) = 0; | |
27718 | + rmb(); | |
27719 | + idle = xen_idle; /* no alternatives */ | |
27720 | + if (cpu_is_offline(smp_processor_id())) | |
27721 | + play_dead(); | |
27722 | + enter_idle(); | |
27723 | + idle(); | |
27724 | + __exit_idle(); | |
27725 | + } | |
27726 | + | |
27727 | + preempt_enable_no_resched(); | |
27728 | + schedule(); | |
27729 | + preempt_disable(); | |
27730 | + } | |
27731 | +} | |
27732 | + | |
27733 | +void cpu_idle_wait(void) | |
27734 | +{ | |
27735 | + unsigned int cpu, this_cpu = get_cpu(); | |
27736 | + cpumask_t map; | |
27737 | + | |
27738 | + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | |
27739 | + put_cpu(); | |
27740 | + | |
27741 | + cpus_clear(map); | |
27742 | + for_each_online_cpu(cpu) { | |
27743 | + per_cpu(cpu_idle_state, cpu) = 1; | |
27744 | + cpu_set(cpu, map); | |
27745 | + } | |
27746 | + | |
27747 | + __get_cpu_var(cpu_idle_state) = 0; | |
27748 | + | |
27749 | + wmb(); | |
27750 | + do { | |
27751 | + ssleep(1); | |
27752 | + for_each_online_cpu(cpu) { | |
27753 | + if (cpu_isset(cpu, map) && | |
27754 | + !per_cpu(cpu_idle_state, cpu)) | |
27755 | + cpu_clear(cpu, map); | |
27756 | + } | |
27757 | + cpus_and(map, map, cpu_online_map); | |
27758 | + } while (!cpus_empty(map)); | |
27759 | +} | |
27760 | +EXPORT_SYMBOL_GPL(cpu_idle_wait); | |
27761 | + | |
27762 | +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |
27763 | +{ | |
27764 | +} | |
27765 | + | |
27766 | +static int __init idle_setup (char *str) | |
27767 | +{ | |
27768 | + if (!strncmp(str, "poll", 4)) { | |
27769 | + printk("using polling idle threads.\n"); | |
27770 | + pm_idle = poll_idle; | |
27771 | + } | |
27772 | + | |
27773 | + boot_option_idle_override = 1; | |
27774 | + return 1; | |
27775 | +} | |
27776 | + | |
27777 | +__setup("idle=", idle_setup); | |
27778 | + | |
27779 | +/* Prints also some state that isn't saved in the pt_regs */ | |
27780 | +void __show_regs(struct pt_regs * regs) | |
27781 | +{ | |
27782 | + unsigned long fs, gs, shadowgs; | |
27783 | + unsigned int fsindex,gsindex; | |
27784 | + unsigned int ds,cs,es; | |
27785 | + | |
27786 | + printk("\n"); | |
27787 | + print_modules(); | |
27788 | + printk("Pid: %d, comm: %.20s %s %s %.*s\n", | |
27789 | + current->pid, current->comm, print_tainted(), | |
27790 | + system_utsname.release, | |
27791 | + (int)strcspn(system_utsname.version, " "), | |
27792 | + system_utsname.version); | |
27793 | + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | |
27794 | + printk_address(regs->rip); | |
27795 | + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, | |
27796 | + regs->eflags); | |
27797 | + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | |
27798 | + regs->rax, regs->rbx, regs->rcx); | |
27799 | + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | |
27800 | + regs->rdx, regs->rsi, regs->rdi); | |
27801 | + printk("RBP: %016lx R08: %016lx R09: %016lx\n", | |
27802 | + regs->rbp, regs->r8, regs->r9); | |
27803 | + printk("R10: %016lx R11: %016lx R12: %016lx\n", | |
27804 | + regs->r10, regs->r11, regs->r12); | |
27805 | + printk("R13: %016lx R14: %016lx R15: %016lx\n", | |
27806 | + regs->r13, regs->r14, regs->r15); | |
27807 | + | |
27808 | + asm("mov %%ds,%0" : "=r" (ds)); | |
27809 | + asm("mov %%cs,%0" : "=r" (cs)); | |
27810 | + asm("mov %%es,%0" : "=r" (es)); | |
27811 | + asm("mov %%fs,%0" : "=r" (fsindex)); | |
27812 | + asm("mov %%gs,%0" : "=r" (gsindex)); | |
27813 | + | |
27814 | + rdmsrl(MSR_FS_BASE, fs); | |
27815 | + rdmsrl(MSR_GS_BASE, gs); | |
27816 | + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | |
27817 | + | |
27818 | + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | |
27819 | + fs,fsindex,gs,gsindex,shadowgs); | |
27820 | + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es); | |
27821 | + | |
27822 | +} | |
27823 | + | |
27824 | +void show_regs(struct pt_regs *regs) | |
27825 | +{ | |
27826 | + printk("CPU %d:", smp_processor_id()); | |
27827 | + __show_regs(regs); | |
27828 | + show_trace(NULL, regs, (void *)(regs + 1)); | |
27829 | +} | |
27830 | + | |
27831 | +/* | |
27832 | + * Free current thread data structures etc.. | |
27833 | + */ | |
27834 | +void exit_thread(void) | |
27835 | +{ | |
27836 | + struct task_struct *me = current; | |
27837 | + struct thread_struct *t = &me->thread; | |
27838 | + | |
27839 | + if (me->thread.io_bitmap_ptr) { | |
27840 | +#ifndef CONFIG_X86_NO_TSS | |
27841 | + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | |
27842 | +#endif | |
27843 | +#ifdef CONFIG_XEN | |
27844 | + struct physdev_set_iobitmap iobmp_op; | |
27845 | + memset(&iobmp_op, 0, sizeof(iobmp_op)); | |
27846 | +#endif | |
27847 | + | |
27848 | + kfree(t->io_bitmap_ptr); | |
27849 | + t->io_bitmap_ptr = NULL; | |
27850 | + /* | |
27851 | + * Careful, clear this in the TSS too: | |
27852 | + */ | |
27853 | +#ifndef CONFIG_X86_NO_TSS | |
27854 | + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | |
27855 | + put_cpu(); | |
27856 | +#endif | |
27857 | +#ifdef CONFIG_XEN | |
27858 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, | |
27859 | + &iobmp_op)); | |
27860 | +#endif | |
27861 | + t->io_bitmap_max = 0; | |
27862 | + } | |
27863 | +} | |
27864 | + | |
27865 | +void load_gs_index(unsigned gs) | |
27866 | +{ | |
27867 | + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); | |
27868 | +} | |
27869 | + | |
27870 | +void flush_thread(void) | |
27871 | +{ | |
27872 | + struct task_struct *tsk = current; | |
27873 | + struct thread_info *t = current_thread_info(); | |
27874 | + | |
27875 | + if (t->flags & _TIF_ABI_PENDING) { | |
27876 | + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); | |
27877 | + if (t->flags & _TIF_IA32) | |
27878 | + current_thread_info()->status |= TS_COMPAT; | |
27879 | + } | |
27880 | + | |
27881 | + tsk->thread.debugreg0 = 0; | |
27882 | + tsk->thread.debugreg1 = 0; | |
27883 | + tsk->thread.debugreg2 = 0; | |
27884 | + tsk->thread.debugreg3 = 0; | |
27885 | + tsk->thread.debugreg6 = 0; | |
27886 | + tsk->thread.debugreg7 = 0; | |
27887 | + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | |
27888 | + /* | |
27889 | + * Forget coprocessor state.. | |
27890 | + */ | |
27891 | + clear_fpu(tsk); | |
27892 | + clear_used_math(); | |
27893 | +} | |
27894 | + | |
27895 | +void release_thread(struct task_struct *dead_task) | |
27896 | +{ | |
27897 | + if (dead_task->mm) { | |
27898 | + if (dead_task->mm->context.size) { | |
27899 | + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | |
27900 | + dead_task->comm, | |
27901 | + dead_task->mm->context.ldt, | |
27902 | + dead_task->mm->context.size); | |
27903 | + BUG(); | |
27904 | + } | |
27905 | + } | |
27906 | +} | |
27907 | + | |
27908 | +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | |
27909 | +{ | |
27910 | + struct user_desc ud = { | |
27911 | + .base_addr = addr, | |
27912 | + .limit = 0xfffff, | |
27913 | + .seg_32bit = 1, | |
27914 | + .limit_in_pages = 1, | |
27915 | + .useable = 1, | |
27916 | + }; | |
27917 | + struct n_desc_struct *desc = (void *)t->thread.tls_array; | |
27918 | + desc += tls; | |
27919 | + desc->a = LDT_entry_a(&ud); | |
27920 | + desc->b = LDT_entry_b(&ud); | |
27921 | +} | |
27922 | + | |
27923 | +static inline u32 read_32bit_tls(struct task_struct *t, int tls) | |
27924 | +{ | |
27925 | + struct desc_struct *desc = (void *)t->thread.tls_array; | |
27926 | + desc += tls; | |
27927 | + return desc->base0 | | |
27928 | + (((u32)desc->base1) << 16) | | |
27929 | + (((u32)desc->base2) << 24); | |
27930 | +} | |
27931 | + | |
27932 | +/* | |
27933 | + * This gets called before we allocate a new thread and copy | |
27934 | + * the current task into it. | |
27935 | + */ | |
27936 | +void prepare_to_copy(struct task_struct *tsk) | |
27937 | +{ | |
27938 | + unlazy_fpu(tsk); | |
27939 | +} | |
27940 | + | |
27941 | +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | |
27942 | + unsigned long unused, | |
27943 | + struct task_struct * p, struct pt_regs * regs) | |
27944 | +{ | |
27945 | + int err; | |
27946 | + struct pt_regs * childregs; | |
27947 | + struct task_struct *me = current; | |
27948 | + | |
27949 | + childregs = ((struct pt_regs *) | |
27950 | + (THREAD_SIZE + task_stack_page(p))) - 1; | |
27951 | + *childregs = *regs; | |
27952 | + | |
27953 | + childregs->rax = 0; | |
27954 | + childregs->rsp = rsp; | |
27955 | + if (rsp == ~0UL) | |
27956 | + childregs->rsp = (unsigned long)childregs; | |
27957 | + | |
27958 | + p->thread.rsp = (unsigned long) childregs; | |
27959 | + p->thread.rsp0 = (unsigned long) (childregs+1); | |
27960 | + p->thread.userrsp = me->thread.userrsp; | |
27961 | + | |
27962 | + set_tsk_thread_flag(p, TIF_FORK); | |
27963 | + | |
27964 | + p->thread.fs = me->thread.fs; | |
27965 | + p->thread.gs = me->thread.gs; | |
27966 | + | |
27967 | + asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); | |
27968 | + asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); | |
27969 | + asm("mov %%es,%0" : "=m" (p->thread.es)); | |
27970 | + asm("mov %%ds,%0" : "=m" (p->thread.ds)); | |
27971 | + | |
27972 | + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { | |
27973 | + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | |
27974 | + if (!p->thread.io_bitmap_ptr) { | |
27975 | + p->thread.io_bitmap_max = 0; | |
27976 | + return -ENOMEM; | |
27977 | + } | |
27978 | + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, | |
27979 | + IO_BITMAP_BYTES); | |
27980 | + } | |
27981 | + | |
27982 | + /* | |
27983 | + * Set a new TLS for the child thread? | |
27984 | + */ | |
27985 | + if (clone_flags & CLONE_SETTLS) { | |
27986 | +#ifdef CONFIG_IA32_EMULATION | |
27987 | + if (test_thread_flag(TIF_IA32)) | |
27988 | + err = ia32_child_tls(p, childregs); | |
27989 | + else | |
27990 | +#endif | |
27991 | + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | |
27992 | + if (err) | |
27993 | + goto out; | |
27994 | + } | |
27995 | + p->thread.iopl = current->thread.iopl; | |
27996 | + | |
27997 | + err = 0; | |
27998 | +out: | |
27999 | + if (err && p->thread.io_bitmap_ptr) { | |
28000 | + kfree(p->thread.io_bitmap_ptr); | |
28001 | + p->thread.io_bitmap_max = 0; | |
28002 | + } | |
28003 | + return err; | |
28004 | +} | |
28005 | + | |
28006 | +static inline void __save_init_fpu( struct task_struct *tsk ) | |
28007 | +{ | |
28008 | + asm volatile( "rex64 ; fxsave %0 ; fnclex" | |
28009 | + : "=m" (tsk->thread.i387.fxsave)); | |
28010 | + tsk->thread_info->status &= ~TS_USEDFPU; | |
28011 | +} | |
28012 | + | |
28013 | +/* | |
28014 | + * switch_to(x,y) should switch tasks from x to y. | |
28015 | + * | |
28016 | + * This could still be optimized: | |
28017 | + * - fold all the options into a flag word and test it with a single test. | |
28018 | + * - could test fs/gs bitsliced | |
28019 | + * | |
28020 | + * Kprobes not supported here. Set the probe on schedule instead. | |
28021 | + */ | |
28022 | +__kprobes struct task_struct * | |
28023 | +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |
28024 | +{ | |
28025 | + struct thread_struct *prev = &prev_p->thread, | |
28026 | + *next = &next_p->thread; | |
28027 | + int cpu = smp_processor_id(); | |
28028 | +#ifndef CONFIG_X86_NO_TSS | |
28029 | + struct tss_struct *tss = &per_cpu(init_tss, cpu); | |
28030 | +#endif | |
28031 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
28032 | + struct physdev_set_iopl iopl_op; | |
28033 | + struct physdev_set_iobitmap iobmp_op; | |
28034 | +#else | |
28035 | + struct physdev_op _pdo[2], *pdo = _pdo; | |
28036 | +#define iopl_op pdo->u.set_iopl | |
28037 | +#define iobmp_op pdo->u.set_iobitmap | |
28038 | +#endif | |
28039 | + multicall_entry_t _mcl[8], *mcl = _mcl; | |
28040 | + | |
28041 | + /* | |
28042 | + * This is basically '__unlazy_fpu', except that we queue a | |
28043 | + * multicall to indicate FPU task switch, rather than | |
28044 | + * synchronously trapping to Xen. | |
28045 | + * The AMD workaround requires it to be after DS reload, or | |
28046 | + * after DS has been cleared, which we do in __prepare_arch_switch. | |
28047 | + */ | |
28048 | + if (prev_p->thread_info->status & TS_USEDFPU) { | |
28049 | + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ | |
28050 | + mcl->op = __HYPERVISOR_fpu_taskswitch; | |
28051 | + mcl->args[0] = 1; | |
28052 | + mcl++; | |
28053 | + } | |
28054 | + | |
28055 | + /* | |
28056 | + * Reload esp0, LDT and the page table pointer: | |
28057 | + */ | |
28058 | + mcl->op = __HYPERVISOR_stack_switch; | |
28059 | + mcl->args[0] = __KERNEL_DS; | |
28060 | + mcl->args[1] = next->rsp0; | |
28061 | + mcl++; | |
28062 | + | |
28063 | + /* | |
28064 | + * Load the per-thread Thread-Local Storage descriptor. | |
28065 | + * This is load_TLS(next, cpu) with multicalls. | |
28066 | + */ | |
28067 | +#define C(i) do { \ | |
28068 | + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ | |
28069 | + mcl->op = __HYPERVISOR_update_descriptor; \ | |
28070 | + mcl->args[0] = virt_to_machine( \ | |
28071 | + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \ | |
28072 | + mcl->args[1] = next->tls_array[i]; \ | |
28073 | + mcl++; \ | |
28074 | + } \ | |
28075 | +} while (0) | |
28076 | + C(0); C(1); C(2); | |
28077 | +#undef C | |
28078 | + | |
28079 | + if (unlikely(prev->iopl != next->iopl)) { | |
28080 | + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl; | |
28081 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
28082 | + mcl->op = __HYPERVISOR_physdev_op; | |
28083 | + mcl->args[0] = PHYSDEVOP_set_iopl; | |
28084 | + mcl->args[1] = (unsigned long)&iopl_op; | |
28085 | +#else | |
28086 | + mcl->op = __HYPERVISOR_physdev_op_compat; | |
28087 | + pdo->cmd = PHYSDEVOP_set_iopl; | |
28088 | + mcl->args[0] = (unsigned long)pdo++; | |
28089 | +#endif | |
28090 | + mcl++; | |
28091 | + } | |
28092 | + | |
28093 | + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { | |
28094 | + set_xen_guest_handle(iobmp_op.bitmap, | |
28095 | + (char *)next->io_bitmap_ptr); | |
28096 | + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; | |
28097 | +#if CONFIG_XEN_COMPAT > 0x030002 | |
28098 | + mcl->op = __HYPERVISOR_physdev_op; | |
28099 | + mcl->args[0] = PHYSDEVOP_set_iobitmap; | |
28100 | + mcl->args[1] = (unsigned long)&iobmp_op; | |
28101 | +#else | |
28102 | + mcl->op = __HYPERVISOR_physdev_op_compat; | |
28103 | + pdo->cmd = PHYSDEVOP_set_iobitmap; | |
28104 | + mcl->args[0] = (unsigned long)pdo++; | |
28105 | +#endif | |
28106 | + mcl++; | |
28107 | + } | |
28108 | + | |
28109 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
28110 | + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); | |
28111 | +#endif | |
28112 | + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); | |
28113 | + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) | |
28114 | + BUG(); | |
28115 | + | |
28116 | + /* | |
28117 | + * Switch DS and ES. | |
28118 | + * This won't pick up thread selector changes, but I guess that is ok. | |
28119 | + */ | |
28120 | + if (unlikely(next->es)) | |
28121 | + loadsegment(es, next->es); | |
28122 | + | |
28123 | + if (unlikely(next->ds)) | |
28124 | + loadsegment(ds, next->ds); | |
28125 | + | |
28126 | + /* | |
28127 | + * Switch FS and GS. | |
28128 | + */ | |
28129 | + if (unlikely(next->fsindex)) | |
28130 | + loadsegment(fs, next->fsindex); | |
28131 | + | |
28132 | + if (next->fs) | |
28133 | + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); | |
28134 | + | |
28135 | + if (unlikely(next->gsindex)) | |
28136 | + load_gs_index(next->gsindex); | |
28137 | + | |
28138 | + if (next->gs) | |
28139 | + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); | |
28140 | + | |
28141 | + /* | |
28142 | + * Switch the PDA context. | |
28143 | + */ | |
28144 | + prev->userrsp = read_pda(oldrsp); | |
28145 | + write_pda(oldrsp, next->userrsp); | |
28146 | + write_pda(pcurrent, next_p); | |
28147 | + write_pda(kernelstack, | |
28148 | + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | |
28149 | + | |
28150 | + /* | |
28151 | + * Now maybe reload the debug registers | |
28152 | + */ | |
28153 | + if (unlikely(next->debugreg7)) { | |
28154 | + set_debugreg(next->debugreg0, 0); | |
28155 | + set_debugreg(next->debugreg1, 1); | |
28156 | + set_debugreg(next->debugreg2, 2); | |
28157 | + set_debugreg(next->debugreg3, 3); | |
28158 | + /* no 4 and 5 */ | |
28159 | + set_debugreg(next->debugreg6, 6); | |
28160 | + set_debugreg(next->debugreg7, 7); | |
28161 | + } | |
28162 | + | |
28163 | + return prev_p; | |
28164 | +} | |
28165 | + | |
28166 | +/* | |
28167 | + * sys_execve() executes a new program. | |
28168 | + */ | |
28169 | +asmlinkage | |
28170 | +long sys_execve(char __user *name, char __user * __user *argv, | |
28171 | + char __user * __user *envp, struct pt_regs regs) | |
28172 | +{ | |
28173 | + long error; | |
28174 | + char * filename; | |
28175 | + | |
28176 | + filename = getname(name); | |
28177 | + error = PTR_ERR(filename); | |
28178 | + if (IS_ERR(filename)) | |
28179 | + return error; | |
28180 | + error = do_execve(filename, argv, envp, ®s); | |
28181 | + if (error == 0) { | |
28182 | + task_lock(current); | |
28183 | + current->ptrace &= ~PT_DTRACE; | |
28184 | + task_unlock(current); | |
28185 | + } | |
28186 | + putname(filename); | |
28187 | + return error; | |
28188 | +} | |
28189 | + | |
28190 | +void set_personality_64bit(void) | |
28191 | +{ | |
28192 | + /* inherit personality from parent */ | |
28193 | + | |
28194 | + /* Make sure to be in 64bit mode */ | |
28195 | + clear_thread_flag(TIF_IA32); | |
28196 | + | |
28197 | + /* TBD: overwrites user setup. Should have two bits. | |
28198 | + But 64bit processes have always behaved this way, | |
28199 | + so it's not too bad. The main problem is just that | |
28200 | + 32bit childs are affected again. */ | |
28201 | + current->personality &= ~READ_IMPLIES_EXEC; | |
28202 | +} | |
28203 | + | |
28204 | +asmlinkage long sys_fork(struct pt_regs *regs) | |
28205 | +{ | |
28206 | + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | |
28207 | +} | |
28208 | + | |
28209 | +asmlinkage long | |
28210 | +sys_clone(unsigned long clone_flags, unsigned long newsp, | |
28211 | + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | |
28212 | +{ | |
28213 | + if (!newsp) | |
28214 | + newsp = regs->rsp; | |
28215 | + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | |
28216 | +} | |
28217 | + | |
28218 | +/* | |
28219 | + * This is trivial, and on the face of it looks like it | |
28220 | + * could equally well be done in user mode. | |
28221 | + * | |
28222 | + * Not so, for quite unobvious reasons - register pressure. | |
28223 | + * In user mode vfork() cannot have a stack frame, and if | |
28224 | + * done by calling the "clone()" system call directly, you | |
28225 | + * do not have enough call-clobbered registers to hold all | |
28226 | + * the information you need. | |
28227 | + */ | |
28228 | +asmlinkage long sys_vfork(struct pt_regs *regs) | |
28229 | +{ | |
28230 | + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | |
28231 | + NULL, NULL); | |
28232 | +} | |
28233 | + | |
28234 | +unsigned long get_wchan(struct task_struct *p) | |
28235 | +{ | |
28236 | + unsigned long stack; | |
28237 | + u64 fp,rip; | |
28238 | + int count = 0; | |
28239 | + | |
28240 | + if (!p || p == current || p->state==TASK_RUNNING) | |
28241 | + return 0; | |
28242 | + stack = (unsigned long)task_stack_page(p); | |
28243 | + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | |
28244 | + return 0; | |
28245 | + fp = *(u64 *)(p->thread.rsp); | |
28246 | + do { | |
28247 | + if (fp < (unsigned long)stack || | |
28248 | + fp > (unsigned long)stack+THREAD_SIZE) | |
28249 | + return 0; | |
28250 | + rip = *(u64 *)(fp+8); | |
28251 | + if (!in_sched_functions(rip)) | |
28252 | + return rip; | |
28253 | + fp = *(u64 *)fp; | |
28254 | + } while (count++ < 16); | |
28255 | + return 0; | |
28256 | +} | |
28257 | + | |
28258 | +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | |
28259 | +{ | |
28260 | + int ret = 0; | |
28261 | + int doit = task == current; | |
28262 | + int cpu; | |
28263 | + | |
28264 | + switch (code) { | |
28265 | + case ARCH_SET_GS: | |
28266 | + if (addr >= TASK_SIZE_OF(task)) | |
28267 | + return -EPERM; | |
28268 | + cpu = get_cpu(); | |
28269 | + /* handle small bases via the GDT because that's faster to | |
28270 | + switch. */ | |
28271 | + if (addr <= 0xffffffff) { | |
28272 | + set_32bit_tls(task, GS_TLS, addr); | |
28273 | + if (doit) { | |
28274 | + load_TLS(&task->thread, cpu); | |
28275 | + load_gs_index(GS_TLS_SEL); | |
28276 | + } | |
28277 | + task->thread.gsindex = GS_TLS_SEL; | |
28278 | + task->thread.gs = 0; | |
28279 | + } else { | |
28280 | + task->thread.gsindex = 0; | |
28281 | + task->thread.gs = addr; | |
28282 | + if (doit) { | |
28283 | + load_gs_index(0); | |
28284 | + ret = HYPERVISOR_set_segment_base( | |
28285 | + SEGBASE_GS_USER, addr); | |
28286 | + } | |
28287 | + } | |
28288 | + put_cpu(); | |
28289 | + break; | |
28290 | + case ARCH_SET_FS: | |
28291 | + /* Not strictly needed for fs, but do it for symmetry | |
28292 | + with gs */ | |
28293 | + if (addr >= TASK_SIZE_OF(task)) | |
28294 | + return -EPERM; | |
28295 | + cpu = get_cpu(); | |
28296 | + /* handle small bases via the GDT because that's faster to | |
28297 | + switch. */ | |
28298 | + if (addr <= 0xffffffff) { | |
28299 | + set_32bit_tls(task, FS_TLS, addr); | |
28300 | + if (doit) { | |
28301 | + load_TLS(&task->thread, cpu); | |
28302 | + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | |
28303 | + } | |
28304 | + task->thread.fsindex = FS_TLS_SEL; | |
28305 | + task->thread.fs = 0; | |
28306 | + } else { | |
28307 | + task->thread.fsindex = 0; | |
28308 | + task->thread.fs = addr; | |
28309 | + if (doit) { | |
28310 | + /* set the selector to 0 to not confuse | |
28311 | + __switch_to */ | |
28312 | + asm volatile("movl %0,%%fs" :: "r" (0)); | |
28313 | + ret = HYPERVISOR_set_segment_base(SEGBASE_FS, | |
28314 | + addr); | |
28315 | + } | |
28316 | + } | |
28317 | + put_cpu(); | |
28318 | + break; | |
28319 | + case ARCH_GET_FS: { | |
28320 | + unsigned long base; | |
28321 | + if (task->thread.fsindex == FS_TLS_SEL) | |
28322 | + base = read_32bit_tls(task, FS_TLS); | |
28323 | + else if (doit) | |
28324 | + rdmsrl(MSR_FS_BASE, base); | |
28325 | + else | |
28326 | + base = task->thread.fs; | |
28327 | + ret = put_user(base, (unsigned long __user *)addr); | |
28328 | + break; | |
28329 | + } | |
28330 | + case ARCH_GET_GS: { | |
28331 | + unsigned long base; | |
28332 | + unsigned gsindex; | |
28333 | + if (task->thread.gsindex == GS_TLS_SEL) | |
28334 | + base = read_32bit_tls(task, GS_TLS); | |
28335 | + else if (doit) { | |
28336 | + asm("movl %%gs,%0" : "=r" (gsindex)); | |
28337 | + if (gsindex) | |
28338 | + rdmsrl(MSR_KERNEL_GS_BASE, base); | |
28339 | + else | |
28340 | + base = task->thread.gs; | |
28341 | + } | |
28342 | + else | |
28343 | + base = task->thread.gs; | |
28344 | + ret = put_user(base, (unsigned long __user *)addr); | |
28345 | + break; | |
28346 | + } | |
28347 | + | |
28348 | + default: | |
28349 | + ret = -EINVAL; | |
28350 | + break; | |
28351 | + } | |
28352 | + | |
28353 | + return ret; | |
28354 | +} | |
28355 | + | |
28356 | +long sys_arch_prctl(int code, unsigned long addr) | |
28357 | +{ | |
28358 | + return do_arch_prctl(current, code, addr); | |
28359 | +} | |
28360 | + | |
28361 | +/* | |
28362 | + * Capture the user space registers if the task is not running (in user space) | |
28363 | + */ | |
28364 | +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | |
28365 | +{ | |
28366 | + struct pt_regs *pp, ptregs; | |
28367 | + | |
28368 | + pp = task_pt_regs(tsk); | |
28369 | + | |
28370 | + ptregs = *pp; | |
28371 | + ptregs.cs &= 0xffff; | |
28372 | + ptregs.ss &= 0xffff; | |
28373 | + | |
28374 | + elf_core_copy_regs(regs, &ptregs); | |
28375 | + | |
28376 | + boot_option_idle_override = 1; | |
28377 | + return 1; | |
28378 | +} | |
28379 | + | |
28380 | +unsigned long arch_align_stack(unsigned long sp) | |
28381 | +{ | |
28382 | + if (randomize_va_space) | |
28383 | + sp -= get_random_int() % 8192; | |
28384 | + return sp & ~0xf; | |
28385 | +} | |
28386 | Index: head-2008-11-25/arch/x86/kernel/setup_64-xen.c | |
28387 | =================================================================== | |
28388 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
28389 | +++ head-2008-11-25/arch/x86/kernel/setup_64-xen.c 2008-04-22 15:41:51.000000000 +0200 | |
28390 | @@ -0,0 +1,1652 @@ | |
28391 | +/* | |
28392 | + * linux/arch/x86-64/kernel/setup.c | |
28393 | + * | |
28394 | + * Copyright (C) 1995 Linus Torvalds | |
28395 | + * | |
28396 | + * Nov 2001 Dave Jones <davej@suse.de> | |
28397 | + * Forked from i386 setup code. | |
28398 | + */ | |
28399 | + | |
28400 | +/* | |
28401 | + * This file handles the architecture-dependent parts of initialization | |
28402 | + */ | |
28403 | + | |
28404 | +#include <linux/errno.h> | |
28405 | +#include <linux/sched.h> | |
28406 | +#include <linux/kernel.h> | |
28407 | +#include <linux/mm.h> | |
28408 | +#include <linux/stddef.h> | |
28409 | +#include <linux/unistd.h> | |
28410 | +#include <linux/ptrace.h> | |
28411 | +#include <linux/slab.h> | |
28412 | +#include <linux/user.h> | |
28413 | +#include <linux/a.out.h> | |
28414 | +#include <linux/screen_info.h> | |
28415 | +#include <linux/ioport.h> | |
28416 | +#include <linux/delay.h> | |
28417 | +#include <linux/init.h> | |
28418 | +#include <linux/initrd.h> | |
28419 | +#include <linux/highmem.h> | |
28420 | +#include <linux/bootmem.h> | |
28421 | +#include <linux/module.h> | |
28422 | +#include <asm/processor.h> | |
28423 | +#include <linux/console.h> | |
28424 | +#include <linux/seq_file.h> | |
28425 | +#include <linux/crash_dump.h> | |
28426 | +#include <linux/root_dev.h> | |
28427 | +#include <linux/pci.h> | |
28428 | +#include <linux/acpi.h> | |
28429 | +#include <linux/kallsyms.h> | |
28430 | +#include <linux/edd.h> | |
28431 | +#include <linux/mmzone.h> | |
28432 | +#include <linux/kexec.h> | |
28433 | +#include <linux/cpufreq.h> | |
28434 | +#include <linux/dmi.h> | |
28435 | +#include <linux/dma-mapping.h> | |
28436 | +#include <linux/ctype.h> | |
28437 | + | |
28438 | +#include <asm/mtrr.h> | |
28439 | +#include <asm/uaccess.h> | |
28440 | +#include <asm/system.h> | |
28441 | +#include <asm/io.h> | |
28442 | +#include <asm/smp.h> | |
28443 | +#include <asm/msr.h> | |
28444 | +#include <asm/desc.h> | |
28445 | +#include <video/edid.h> | |
28446 | +#include <asm/e820.h> | |
28447 | +#include <asm/dma.h> | |
28448 | +#include <asm/mpspec.h> | |
28449 | +#include <asm/mmu_context.h> | |
28450 | +#include <asm/bootsetup.h> | |
28451 | +#include <asm/proto.h> | |
28452 | +#include <asm/setup.h> | |
28453 | +#include <asm/mach_apic.h> | |
28454 | +#include <asm/numa.h> | |
28455 | +#include <asm/sections.h> | |
28456 | +#include <asm/dmi.h> | |
28457 | +#ifdef CONFIG_XEN | |
28458 | +#include <linux/percpu.h> | |
28459 | +#include <xen/interface/physdev.h> | |
28460 | +#include "setup_arch_pre.h" | |
28461 | +#include <asm/hypervisor.h> | |
28462 | +#include <xen/interface/nmi.h> | |
28463 | +#include <xen/features.h> | |
28464 | +#include <xen/firmware.h> | |
28465 | +#include <xen/xencons.h> | |
28466 | +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) | |
28467 | +#define PFN_PHYS(x) ((x) << PAGE_SHIFT) | |
28468 | +#include <asm/mach-xen/setup_arch_post.h> | |
28469 | +#include <xen/interface/memory.h> | |
28470 | + | |
28471 | +#ifdef CONFIG_XEN | |
28472 | +#include <xen/interface/kexec.h> | |
28473 | +#endif | |
28474 | + | |
28475 | +extern unsigned long start_pfn; | |
28476 | +extern struct edid_info edid_info; | |
28477 | + | |
28478 | +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; | |
28479 | +EXPORT_SYMBOL(HYPERVISOR_shared_info); | |
28480 | + | |
28481 | +extern char hypercall_page[PAGE_SIZE]; | |
28482 | +EXPORT_SYMBOL(hypercall_page); | |
28483 | + | |
28484 | +static int xen_panic_event(struct notifier_block *, unsigned long, void *); | |
28485 | +static struct notifier_block xen_panic_block = { | |
28486 | + xen_panic_event, NULL, 0 /* try to go last */ | |
28487 | +}; | |
28488 | + | |
28489 | +unsigned long *phys_to_machine_mapping; | |
28490 | +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512]; | |
28491 | + | |
28492 | +EXPORT_SYMBOL(phys_to_machine_mapping); | |
28493 | + | |
28494 | +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); | |
28495 | +DEFINE_PER_CPU(int, nr_multicall_ents); | |
28496 | + | |
28497 | +/* Raw start-of-day parameters from the hypervisor. */ | |
28498 | +start_info_t *xen_start_info; | |
28499 | +EXPORT_SYMBOL(xen_start_info); | |
28500 | +#endif | |
28501 | + | |
28502 | +/* | |
28503 | + * Machine setup.. | |
28504 | + */ | |
28505 | + | |
28506 | +struct cpuinfo_x86 boot_cpu_data __read_mostly; | |
28507 | +EXPORT_SYMBOL(boot_cpu_data); | |
28508 | + | |
28509 | +unsigned long mmu_cr4_features; | |
28510 | + | |
28511 | +int acpi_disabled; | |
28512 | +EXPORT_SYMBOL(acpi_disabled); | |
28513 | +#ifdef CONFIG_ACPI | |
28514 | +extern int __initdata acpi_ht; | |
28515 | +extern acpi_interrupt_flags acpi_sci_flags; | |
28516 | +int __initdata acpi_force = 0; | |
28517 | +#endif | |
28518 | + | |
28519 | +int acpi_numa __initdata; | |
28520 | + | |
28521 | +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ | |
28522 | +int bootloader_type; | |
28523 | + | |
28524 | +unsigned long saved_video_mode; | |
28525 | + | |
28526 | +/* | |
28527 | + * Early DMI memory | |
28528 | + */ | |
28529 | +int dmi_alloc_index; | |
28530 | +char dmi_alloc_data[DMI_MAX_DATA]; | |
28531 | + | |
28532 | +/* | |
28533 | + * Setup options | |
28534 | + */ | |
28535 | +struct screen_info screen_info; | |
28536 | +EXPORT_SYMBOL(screen_info); | |
28537 | +struct sys_desc_table_struct { | |
28538 | + unsigned short length; | |
28539 | + unsigned char table[0]; | |
28540 | +}; | |
28541 | + | |
28542 | +struct edid_info edid_info; | |
28543 | +EXPORT_SYMBOL_GPL(edid_info); | |
28544 | +struct e820map e820; | |
28545 | +#ifdef CONFIG_XEN | |
28546 | +struct e820map machine_e820; | |
28547 | +#endif | |
28548 | + | |
28549 | +extern int root_mountflags; | |
28550 | + | |
28551 | +char command_line[COMMAND_LINE_SIZE]; | |
28552 | + | |
28553 | +struct resource standard_io_resources[] = { | |
28554 | + { .name = "dma1", .start = 0x00, .end = 0x1f, | |
28555 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28556 | + { .name = "pic1", .start = 0x20, .end = 0x21, | |
28557 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28558 | + { .name = "timer0", .start = 0x40, .end = 0x43, | |
28559 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28560 | + { .name = "timer1", .start = 0x50, .end = 0x53, | |
28561 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28562 | + { .name = "keyboard", .start = 0x60, .end = 0x6f, | |
28563 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28564 | + { .name = "dma page reg", .start = 0x80, .end = 0x8f, | |
28565 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28566 | + { .name = "pic2", .start = 0xa0, .end = 0xa1, | |
28567 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28568 | + { .name = "dma2", .start = 0xc0, .end = 0xdf, | |
28569 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | |
28570 | + { .name = "fpu", .start = 0xf0, .end = 0xff, | |
28571 | + .flags = IORESOURCE_BUSY | IORESOURCE_IO } | |
28572 | +}; | |
28573 | + | |
28574 | +#define STANDARD_IO_RESOURCES \ | |
28575 | + (sizeof standard_io_resources / sizeof standard_io_resources[0]) | |
28576 | + | |
28577 | +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | |
28578 | + | |
28579 | +struct resource data_resource = { | |
28580 | + .name = "Kernel data", | |
28581 | + .start = 0, | |
28582 | + .end = 0, | |
28583 | + .flags = IORESOURCE_RAM, | |
28584 | +}; | |
28585 | +struct resource code_resource = { | |
28586 | + .name = "Kernel code", | |
28587 | + .start = 0, | |
28588 | + .end = 0, | |
28589 | + .flags = IORESOURCE_RAM, | |
28590 | +}; | |
28591 | + | |
28592 | +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) | |
28593 | + | |
28594 | +static struct resource system_rom_resource = { | |
28595 | + .name = "System ROM", | |
28596 | + .start = 0xf0000, | |
28597 | + .end = 0xfffff, | |
28598 | + .flags = IORESOURCE_ROM, | |
28599 | +}; | |
28600 | + | |
28601 | +static struct resource extension_rom_resource = { | |
28602 | + .name = "Extension ROM", | |
28603 | + .start = 0xe0000, | |
28604 | + .end = 0xeffff, | |
28605 | + .flags = IORESOURCE_ROM, | |
28606 | +}; | |
28607 | + | |
28608 | +static struct resource adapter_rom_resources[] = { | |
28609 | + { .name = "Adapter ROM", .start = 0xc8000, .end = 0, | |
28610 | + .flags = IORESOURCE_ROM }, | |
28611 | + { .name = "Adapter ROM", .start = 0, .end = 0, | |
28612 | + .flags = IORESOURCE_ROM }, | |
28613 | + { .name = "Adapter ROM", .start = 0, .end = 0, | |
28614 | + .flags = IORESOURCE_ROM }, | |
28615 | + { .name = "Adapter ROM", .start = 0, .end = 0, | |
28616 | + .flags = IORESOURCE_ROM }, | |
28617 | + { .name = "Adapter ROM", .start = 0, .end = 0, | |
28618 | + .flags = IORESOURCE_ROM }, | |
28619 | + { .name = "Adapter ROM", .start = 0, .end = 0, | |
28620 | + .flags = IORESOURCE_ROM } | |
28621 | +}; | |
28622 | + | |
28623 | +#define ADAPTER_ROM_RESOURCES \ | |
28624 | + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) | |
28625 | + | |
28626 | +static struct resource video_rom_resource = { | |
28627 | + .name = "Video ROM", | |
28628 | + .start = 0xc0000, | |
28629 | + .end = 0xc7fff, | |
28630 | + .flags = IORESOURCE_ROM, | |
28631 | +}; | |
28632 | + | |
28633 | +static struct resource video_ram_resource = { | |
28634 | + .name = "Video RAM area", | |
28635 | + .start = 0xa0000, | |
28636 | + .end = 0xbffff, | |
28637 | + .flags = IORESOURCE_RAM, | |
28638 | +}; | |
28639 | + | |
28640 | +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) | |
28641 | + | |
28642 | +static int __init romchecksum(unsigned char *rom, unsigned long length) | |
28643 | +{ | |
28644 | + unsigned char *p, sum = 0; | |
28645 | + | |
28646 | + for (p = rom; p < rom + length; p++) | |
28647 | + sum += *p; | |
28648 | + return sum == 0; | |
28649 | +} | |
28650 | + | |
28651 | +static void __init probe_roms(void) | |
28652 | +{ | |
28653 | + unsigned long start, length, upper; | |
28654 | + unsigned char *rom; | |
28655 | + int i; | |
28656 | + | |
28657 | +#ifdef CONFIG_XEN | |
28658 | + /* Nothing to do if not running in dom0. */ | |
28659 | + if (!is_initial_xendomain()) | |
28660 | + return; | |
28661 | +#endif | |
28662 | + | |
28663 | + /* video rom */ | |
28664 | + upper = adapter_rom_resources[0].start; | |
28665 | + for (start = video_rom_resource.start; start < upper; start += 2048) { | |
28666 | + rom = isa_bus_to_virt(start); | |
28667 | + if (!romsignature(rom)) | |
28668 | + continue; | |
28669 | + | |
28670 | + video_rom_resource.start = start; | |
28671 | + | |
28672 | + /* 0 < length <= 0x7f * 512, historically */ | |
28673 | + length = rom[2] * 512; | |
28674 | + | |
28675 | + /* if checksum okay, trust length byte */ | |
28676 | + if (length && romchecksum(rom, length)) | |
28677 | + video_rom_resource.end = start + length - 1; | |
28678 | + | |
28679 | + request_resource(&iomem_resource, &video_rom_resource); | |
28680 | + break; | |
28681 | + } | |
28682 | + | |
28683 | + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | |
28684 | + if (start < upper) | |
28685 | + start = upper; | |
28686 | + | |
28687 | + /* system rom */ | |
28688 | + request_resource(&iomem_resource, &system_rom_resource); | |
28689 | + upper = system_rom_resource.start; | |
28690 | + | |
28691 | + /* check for extension rom (ignore length byte!) */ | |
28692 | + rom = isa_bus_to_virt(extension_rom_resource.start); | |
28693 | + if (romsignature(rom)) { | |
28694 | + length = extension_rom_resource.end - extension_rom_resource.start + 1; | |
28695 | + if (romchecksum(rom, length)) { | |
28696 | + request_resource(&iomem_resource, &extension_rom_resource); | |
28697 | + upper = extension_rom_resource.start; | |
28698 | + } | |
28699 | + } | |
28700 | + | |
28701 | + /* check for adapter roms on 2k boundaries */ | |
28702 | + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { | |
28703 | + rom = isa_bus_to_virt(start); | |
28704 | + if (!romsignature(rom)) | |
28705 | + continue; | |
28706 | + | |
28707 | + /* 0 < length <= 0x7f * 512, historically */ | |
28708 | + length = rom[2] * 512; | |
28709 | + | |
28710 | + /* but accept any length that fits if checksum okay */ | |
28711 | + if (!length || start + length > upper || !romchecksum(rom, length)) | |
28712 | + continue; | |
28713 | + | |
28714 | + adapter_rom_resources[i].start = start; | |
28715 | + adapter_rom_resources[i].end = start + length - 1; | |
28716 | + request_resource(&iomem_resource, &adapter_rom_resources[i]); | |
28717 | + | |
28718 | + start = adapter_rom_resources[i++].end & ~2047UL; | |
28719 | + } | |
28720 | +} | |
28721 | + | |
28722 | +/* Check for full argument with no trailing characters */ | |
28723 | +static int fullarg(char *p, char *arg) | |
28724 | +{ | |
28725 | + int l = strlen(arg); | |
28726 | + return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); | |
28727 | +} | |
28728 | + | |
28729 | +static __init void parse_cmdline_early (char ** cmdline_p) | |
28730 | +{ | |
28731 | + char c = ' ', *to = command_line, *from = COMMAND_LINE; | |
28732 | + int len = 0; | |
28733 | + int userdef = 0; | |
28734 | + | |
28735 | + for (;;) { | |
28736 | + if (c != ' ') | |
28737 | + goto next_char; | |
28738 | + | |
28739 | +#ifdef CONFIG_SMP | |
28740 | + /* | |
28741 | + * If the BIOS enumerates physical processors before logical, | |
28742 | + * maxcpus=N at enumeration-time can be used to disable HT. | |
28743 | + */ | |
28744 | + else if (!memcmp(from, "maxcpus=", 8)) { | |
28745 | + extern unsigned int maxcpus; | |
28746 | + | |
28747 | + maxcpus = simple_strtoul(from + 8, NULL, 0); | |
28748 | + } | |
28749 | +#endif | |
28750 | +#ifdef CONFIG_ACPI | |
28751 | + /* "acpi=off" disables both ACPI table parsing and interpreter init */ | |
28752 | + if (fullarg(from,"acpi=off")) | |
28753 | + disable_acpi(); | |
28754 | + | |
28755 | + if (fullarg(from, "acpi=force")) { | |
28756 | + /* add later when we do DMI horrors: */ | |
28757 | + acpi_force = 1; | |
28758 | + acpi_disabled = 0; | |
28759 | + } | |
28760 | + | |
28761 | + /* acpi=ht just means: do ACPI MADT parsing | |
28762 | + at bootup, but don't enable the full ACPI interpreter */ | |
28763 | + if (fullarg(from, "acpi=ht")) { | |
28764 | + if (!acpi_force) | |
28765 | + disable_acpi(); | |
28766 | + acpi_ht = 1; | |
28767 | + } | |
28768 | + else if (fullarg(from, "pci=noacpi")) | |
28769 | + acpi_disable_pci(); | |
28770 | + else if (fullarg(from, "acpi=noirq")) | |
28771 | + acpi_noirq_set(); | |
28772 | + | |
28773 | + else if (fullarg(from, "acpi_sci=edge")) | |
28774 | + acpi_sci_flags.trigger = 1; | |
28775 | + else if (fullarg(from, "acpi_sci=level")) | |
28776 | + acpi_sci_flags.trigger = 3; | |
28777 | + else if (fullarg(from, "acpi_sci=high")) | |
28778 | + acpi_sci_flags.polarity = 1; | |
28779 | + else if (fullarg(from, "acpi_sci=low")) | |
28780 | + acpi_sci_flags.polarity = 3; | |
28781 | + | |
28782 | + /* acpi=strict disables out-of-spec workarounds */ | |
28783 | + else if (fullarg(from, "acpi=strict")) { | |
28784 | + acpi_strict = 1; | |
28785 | + } | |
28786 | +#ifdef CONFIG_X86_IO_APIC | |
28787 | + else if (fullarg(from, "acpi_skip_timer_override")) | |
28788 | + acpi_skip_timer_override = 1; | |
28789 | +#endif | |
28790 | +#endif | |
28791 | + | |
28792 | +#ifndef CONFIG_XEN | |
28793 | + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { | |
28794 | + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | |
28795 | + disable_apic = 1; | |
28796 | + } | |
28797 | + | |
28798 | + if (fullarg(from, "noapic")) | |
28799 | + skip_ioapic_setup = 1; | |
28800 | + | |
28801 | + if (fullarg(from,"apic")) { | |
28802 | + skip_ioapic_setup = 0; | |
28803 | + ioapic_force = 1; | |
28804 | + } | |
28805 | +#endif | |
28806 | + | |
28807 | + if (!memcmp(from, "mem=", 4)) | |
28808 | + parse_memopt(from+4, &from); | |
28809 | + | |
28810 | + if (!memcmp(from, "memmap=", 7)) { | |
28811 | + /* exactmap option is for used defined memory */ | |
28812 | + if (!memcmp(from+7, "exactmap", 8)) { | |
28813 | +#ifdef CONFIG_CRASH_DUMP | |
28814 | + /* If we are doing a crash dump, we | |
28815 | + * still need to know the real mem | |
28816 | + * size before original memory map is | |
28817 | + * reset. | |
28818 | + */ | |
28819 | + saved_max_pfn = e820_end_of_ram(); | |
28820 | +#endif | |
28821 | + from += 8+7; | |
28822 | + end_pfn_map = 0; | |
28823 | + e820.nr_map = 0; | |
28824 | + userdef = 1; | |
28825 | + } | |
28826 | + else { | |
28827 | + parse_memmapopt(from+7, &from); | |
28828 | + userdef = 1; | |
28829 | + } | |
28830 | + } | |
28831 | + | |
28832 | +#ifdef CONFIG_NUMA | |
28833 | + if (!memcmp(from, "numa=", 5)) | |
28834 | + numa_setup(from+5); | |
28835 | +#endif | |
28836 | + | |
28837 | + if (!memcmp(from,"iommu=",6)) { | |
28838 | + iommu_setup(from+6); | |
28839 | + } | |
28840 | + | |
28841 | + if (fullarg(from,"oops=panic")) | |
28842 | + panic_on_oops = 1; | |
28843 | + | |
28844 | + if (!memcmp(from, "noexec=", 7)) | |
28845 | + nonx_setup(from + 7); | |
28846 | + | |
28847 | +#ifdef CONFIG_KEXEC | |
28848 | + /* crashkernel=size@addr specifies the location to reserve for | |
28849 | + * a crash kernel. By reserving this memory we guarantee | |
28850 | + * that linux never set's it up as a DMA target. | |
28851 | + * Useful for holding code to do something appropriate | |
28852 | + * after a kernel panic. | |
28853 | + */ | |
28854 | + else if (!memcmp(from, "crashkernel=", 12)) { | |
28855 | +#ifndef CONFIG_XEN | |
28856 | + unsigned long size, base; | |
28857 | + size = memparse(from+12, &from); | |
28858 | + if (*from == '@') { | |
28859 | + base = memparse(from+1, &from); | |
28860 | + /* FIXME: Do I want a sanity check | |
28861 | + * to validate the memory range? | |
28862 | + */ | |
28863 | + crashk_res.start = base; | |
28864 | + crashk_res.end = base + size - 1; | |
28865 | + } | |
28866 | +#else | |
28867 | + printk("Ignoring crashkernel command line, " | |
28868 | + "parameter will be supplied by xen\n"); | |
28869 | +#endif | |
28870 | + } | |
28871 | +#endif | |
28872 | + | |
28873 | +#ifdef CONFIG_PROC_VMCORE | |
28874 | + /* elfcorehdr= specifies the location of elf core header | |
28875 | + * stored by the crashed kernel. This option will be passed | |
28876 | + * by kexec loader to the capture kernel. | |
28877 | + */ | |
28878 | + else if(!memcmp(from, "elfcorehdr=", 11)) | |
28879 | + elfcorehdr_addr = memparse(from+11, &from); | |
28880 | +#endif | |
28881 | + | |
28882 | +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) | |
28883 | + else if (!memcmp(from, "additional_cpus=", 16)) | |
28884 | + setup_additional_cpus(from+16); | |
28885 | +#endif | |
28886 | + | |
28887 | + next_char: | |
28888 | + c = *(from++); | |
28889 | + if (!c) | |
28890 | + break; | |
28891 | + if (COMMAND_LINE_SIZE <= ++len) | |
28892 | + break; | |
28893 | + *(to++) = c; | |
28894 | + } | |
28895 | + if (userdef) { | |
28896 | + printk(KERN_INFO "user-defined physical RAM map:\n"); | |
28897 | + e820_print_map("user"); | |
28898 | + } | |
28899 | + *to = '\0'; | |
28900 | + *cmdline_p = command_line; | |
28901 | +} | |
28902 | + | |
28903 | +#ifndef CONFIG_NUMA | |
28904 | +static void __init | |
28905 | +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |
28906 | +{ | |
28907 | + unsigned long bootmap_size, bootmap; | |
28908 | + | |
28909 | + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | |
28910 | + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | |
28911 | + if (bootmap == -1L) | |
28912 | + panic("Cannot find bootmem map of size %ld\n",bootmap_size); | |
28913 | + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | |
28914 | +#ifdef CONFIG_XEN | |
28915 | + e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT); | |
28916 | +#else | |
28917 | + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); | |
28918 | +#endif | |
28919 | + reserve_bootmem(bootmap, bootmap_size); | |
28920 | +} | |
28921 | +#endif | |
28922 | + | |
28923 | +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | |
28924 | +struct edd edd; | |
28925 | +#ifdef CONFIG_EDD_MODULE | |
28926 | +EXPORT_SYMBOL(edd); | |
28927 | +#endif | |
28928 | +#ifndef CONFIG_XEN | |
28929 | +/** | |
28930 | + * copy_edd() - Copy the BIOS EDD information | |
28931 | + * from boot_params into a safe place. | |
28932 | + * | |
28933 | + */ | |
28934 | +static inline void copy_edd(void) | |
28935 | +{ | |
28936 | + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); | |
28937 | + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); | |
28938 | + edd.mbr_signature_nr = EDD_MBR_SIG_NR; | |
28939 | + edd.edd_info_nr = EDD_NR; | |
28940 | +} | |
28941 | +#endif | |
28942 | +#else | |
28943 | +static inline void copy_edd(void) | |
28944 | +{ | |
28945 | +} | |
28946 | +#endif | |
28947 | + | |
28948 | +#ifndef CONFIG_XEN | |
28949 | +#define EBDA_ADDR_POINTER 0x40E | |
28950 | + | |
28951 | +unsigned __initdata ebda_addr; | |
28952 | +unsigned __initdata ebda_size; | |
28953 | + | |
28954 | +static void discover_ebda(void) | |
28955 | +{ | |
28956 | + /* | |
28957 | + * there is a real-mode segmented pointer pointing to the | |
28958 | + * 4K EBDA area at 0x40E | |
28959 | + */ | |
28960 | + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; | |
28961 | + ebda_addr <<= 4; | |
28962 | + | |
28963 | + ebda_size = *(unsigned short *)(unsigned long)ebda_addr; | |
28964 | + | |
28965 | + /* Round EBDA up to pages */ | |
28966 | + if (ebda_size == 0) | |
28967 | + ebda_size = 1; | |
28968 | + ebda_size <<= 10; | |
28969 | + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); | |
28970 | + if (ebda_size > 64*1024) | |
28971 | + ebda_size = 64*1024; | |
28972 | +} | |
28973 | +#else | |
28974 | +#define discover_ebda() ((void)0) | |
28975 | +#endif | |
28976 | + | |
28977 | +void __init setup_arch(char **cmdline_p) | |
28978 | +{ | |
28979 | +#ifdef CONFIG_XEN | |
28980 | + /* Register a call for panic conditions. */ | |
28981 | + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); | |
28982 | + | |
28983 | + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); | |
28984 | + screen_info = SCREEN_INFO; | |
28985 | + | |
28986 | + if (is_initial_xendomain()) { | |
28987 | + const struct dom0_vga_console_info *info = | |
28988 | + (void *)((char *)xen_start_info + | |
28989 | + xen_start_info->console.dom0.info_off); | |
28990 | + | |
28991 | + dom0_init_screen_info(info, | |
28992 | + xen_start_info->console.dom0.info_size); | |
28993 | + xen_start_info->console.domU.mfn = 0; | |
28994 | + xen_start_info->console.domU.evtchn = 0; | |
28995 | + } else | |
28996 | + screen_info.orig_video_isVGA = 0; | |
28997 | + | |
28998 | + copy_edid(); | |
28999 | + | |
29000 | + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, | |
29001 | + VMASST_TYPE_writable_pagetables)); | |
29002 | + | |
29003 | + ARCH_SETUP | |
29004 | +#else | |
29005 | + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); | |
29006 | + screen_info = SCREEN_INFO; | |
29007 | + edid_info = EDID_INFO; | |
29008 | +#endif /* !CONFIG_XEN */ | |
29009 | + saved_video_mode = SAVED_VIDEO_MODE; | |
29010 | + bootloader_type = LOADER_TYPE; | |
29011 | + | |
29012 | +#ifdef CONFIG_BLK_DEV_RAM | |
29013 | + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | |
29014 | + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | |
29015 | + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | |
29016 | +#endif | |
29017 | + setup_memory_region(); | |
29018 | + copy_edd(); | |
29019 | + | |
29020 | + if (!MOUNT_ROOT_RDONLY) | |
29021 | + root_mountflags &= ~MS_RDONLY; | |
29022 | + init_mm.start_code = (unsigned long) &_text; | |
29023 | + init_mm.end_code = (unsigned long) &_etext; | |
29024 | + init_mm.end_data = (unsigned long) &_edata; | |
29025 | + init_mm.brk = (unsigned long) &_end; | |
29026 | + | |
29027 | + code_resource.start = virt_to_phys(&_text); | |
29028 | + code_resource.end = virt_to_phys(&_etext)-1; | |
29029 | + data_resource.start = virt_to_phys(&_etext); | |
29030 | + data_resource.end = virt_to_phys(&_edata)-1; | |
29031 | + | |
29032 | + parse_cmdline_early(cmdline_p); | |
29033 | + | |
29034 | + early_identify_cpu(&boot_cpu_data); | |
29035 | + | |
29036 | + /* | |
29037 | + * partially used pages are not usable - thus | |
29038 | + * we are rounding upwards: | |
29039 | + */ | |
29040 | + end_pfn = e820_end_of_ram(); | |
29041 | + num_physpages = end_pfn; /* for pfn_valid */ | |
29042 | + | |
29043 | + check_efer(); | |
29044 | + | |
29045 | + discover_ebda(); | |
29046 | + | |
29047 | + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); | |
29048 | + | |
29049 | + if (is_initial_xendomain()) | |
29050 | + dmi_scan_machine(); | |
29051 | + | |
29052 | +#ifdef CONFIG_ACPI_NUMA | |
29053 | + /* | |
29054 | + * Parse SRAT to discover nodes. | |
29055 | + */ | |
29056 | + acpi_numa_init(); | |
29057 | +#endif | |
29058 | + | |
29059 | +#ifdef CONFIG_NUMA | |
29060 | + numa_initmem_init(0, end_pfn); | |
29061 | +#else | |
29062 | + contig_initmem_init(0, end_pfn); | |
29063 | +#endif | |
29064 | + | |
29065 | +#ifdef CONFIG_XEN | |
29066 | + /* | |
29067 | + * Reserve kernel, physmap, start info, initial page tables, and | |
29068 | + * direct mapping. | |
29069 | + */ | |
29070 | + reserve_bootmem_generic(__pa_symbol(&_text), | |
29071 | + (table_end << PAGE_SHIFT) - __pa_symbol(&_text)); | |
29072 | +#else | |
29073 | + /* Reserve direct mapping */ | |
29074 | + reserve_bootmem_generic(table_start << PAGE_SHIFT, | |
29075 | + (table_end - table_start) << PAGE_SHIFT); | |
29076 | + | |
29077 | + /* reserve kernel */ | |
29078 | + reserve_bootmem_generic(__pa_symbol(&_text), | |
29079 | + __pa_symbol(&_end) - __pa_symbol(&_text)); | |
29080 | + | |
29081 | + /* | |
29082 | + * reserve physical page 0 - it's a special BIOS page on many boxes, | |
29083 | + * enabling clean reboots, SMP operation, laptop functions. | |
29084 | + */ | |
29085 | + reserve_bootmem_generic(0, PAGE_SIZE); | |
29086 | + | |
29087 | + /* reserve ebda region */ | |
29088 | + if (ebda_addr) | |
29089 | + reserve_bootmem_generic(ebda_addr, ebda_size); | |
29090 | + | |
29091 | +#ifdef CONFIG_SMP | |
29092 | + /* | |
29093 | + * But first pinch a few for the stack/trampoline stuff | |
29094 | + * FIXME: Don't need the extra page at 4K, but need to fix | |
29095 | + * trampoline before removing it. (see the GDT stuff) | |
29096 | + */ | |
29097 | + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); | |
29098 | + | |
29099 | + /* Reserve SMP trampoline */ | |
29100 | + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); | |
29101 | +#endif | |
29102 | +#endif | |
29103 | + | |
29104 | +#ifdef CONFIG_ACPI_SLEEP | |
29105 | + /* | |
29106 | + * Reserve low memory region for sleep support. | |
29107 | + */ | |
29108 | + acpi_reserve_bootmem(); | |
29109 | +#endif | |
29110 | +#ifdef CONFIG_XEN | |
29111 | +#ifdef CONFIG_BLK_DEV_INITRD | |
29112 | + if (xen_start_info->mod_start) { | |
29113 | + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { | |
29114 | + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/ | |
29115 | + initrd_start = INITRD_START + PAGE_OFFSET; | |
29116 | + initrd_end = initrd_start+INITRD_SIZE; | |
29117 | + initrd_below_start_ok = 1; | |
29118 | + } else { | |
29119 | + printk(KERN_ERR "initrd extends beyond end of memory " | |
29120 | + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | |
29121 | + (unsigned long)(INITRD_START + INITRD_SIZE), | |
29122 | + (unsigned long)(end_pfn << PAGE_SHIFT)); | |
29123 | + initrd_start = 0; | |
29124 | + } | |
29125 | + } | |
29126 | +#endif | |
29127 | +#else /* CONFIG_XEN */ | |
29128 | +#ifdef CONFIG_BLK_DEV_INITRD | |
29129 | + if (LOADER_TYPE && INITRD_START) { | |
29130 | + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { | |
29131 | + reserve_bootmem_generic(INITRD_START, INITRD_SIZE); | |
29132 | + initrd_start = | |
29133 | + INITRD_START ? INITRD_START + PAGE_OFFSET : 0; | |
29134 | + initrd_end = initrd_start+INITRD_SIZE; | |
29135 | + } | |
29136 | + else { | |
29137 | + printk(KERN_ERR "initrd extends beyond end of memory " | |
29138 | + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | |
29139 | + (unsigned long)(INITRD_START + INITRD_SIZE), | |
29140 | + (unsigned long)(end_pfn << PAGE_SHIFT)); | |
29141 | + initrd_start = 0; | |
29142 | + } | |
29143 | + } | |
29144 | +#endif | |
29145 | +#endif /* !CONFIG_XEN */ | |
29146 | +#ifdef CONFIG_KEXEC | |
29147 | +#ifdef CONFIG_XEN | |
29148 | + xen_machine_kexec_setup_resources(); | |
29149 | +#else | |
29150 | + if (crashk_res.start != crashk_res.end) { | |
29151 | + reserve_bootmem_generic(crashk_res.start, | |
29152 | + crashk_res.end - crashk_res.start + 1); | |
29153 | + } | |
29154 | +#endif | |
29155 | +#endif | |
29156 | + | |
29157 | + paging_init(); | |
29158 | +#ifdef CONFIG_X86_LOCAL_APIC | |
29159 | + /* | |
29160 | + * Find and reserve possible boot-time SMP configuration: | |
29161 | + */ | |
29162 | + find_smp_config(); | |
29163 | +#endif | |
29164 | +#ifdef CONFIG_XEN | |
29165 | + { | |
29166 | + int i, j, k, fpp; | |
29167 | + unsigned long p2m_pages; | |
29168 | + | |
29169 | + p2m_pages = end_pfn; | |
29170 | + if (xen_start_info->nr_pages > end_pfn) { | |
29171 | + /* | |
29172 | + * the end_pfn was shrunk (probably by mem= or highmem= | |
29173 | + * kernel parameter); shrink reservation with the HV | |
29174 | + */ | |
29175 | + struct xen_memory_reservation reservation = { | |
29176 | + .address_bits = 0, | |
29177 | + .extent_order = 0, | |
29178 | + .domid = DOMID_SELF | |
29179 | + }; | |
29180 | + unsigned int difference; | |
29181 | + int ret; | |
29182 | + | |
29183 | + difference = xen_start_info->nr_pages - end_pfn; | |
29184 | + | |
29185 | + set_xen_guest_handle(reservation.extent_start, | |
29186 | + ((unsigned long *)xen_start_info->mfn_list) + end_pfn); | |
29187 | + reservation.nr_extents = difference; | |
29188 | + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
29189 | + &reservation); | |
29190 | + BUG_ON (ret != difference); | |
29191 | + } | |
29192 | + else if (end_pfn > xen_start_info->nr_pages) | |
29193 | + p2m_pages = xen_start_info->nr_pages; | |
29194 | + | |
29195 | + if (!xen_feature(XENFEAT_auto_translated_physmap)) { | |
29196 | + /* Make sure we have a large enough P->M table. */ | |
29197 | + phys_to_machine_mapping = alloc_bootmem_pages( | |
29198 | + end_pfn * sizeof(unsigned long)); | |
29199 | + memset(phys_to_machine_mapping, ~0, | |
29200 | + end_pfn * sizeof(unsigned long)); | |
29201 | + memcpy(phys_to_machine_mapping, | |
29202 | + (unsigned long *)xen_start_info->mfn_list, | |
29203 | + p2m_pages * sizeof(unsigned long)); | |
29204 | + free_bootmem( | |
29205 | + __pa(xen_start_info->mfn_list), | |
29206 | + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * | |
29207 | + sizeof(unsigned long)))); | |
29208 | + | |
29209 | + /* | |
29210 | + * Initialise the list of the frames that specify the | |
29211 | + * list of frames that make up the p2m table. Used by | |
29212 | + * save/restore. | |
29213 | + */ | |
29214 | + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE); | |
29215 | + | |
29216 | + fpp = PAGE_SIZE/sizeof(unsigned long); | |
29217 | + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) { | |
29218 | + if ((j % fpp) == 0) { | |
29219 | + k++; | |
29220 | + BUG_ON(k>=fpp); | |
29221 | + pfn_to_mfn_frame_list[k] = | |
29222 | + alloc_bootmem_pages(PAGE_SIZE); | |
29223 | + pfn_to_mfn_frame_list_list[k] = | |
29224 | + virt_to_mfn(pfn_to_mfn_frame_list[k]); | |
29225 | + j=0; | |
29226 | + } | |
29227 | + pfn_to_mfn_frame_list[k][j] = | |
29228 | + virt_to_mfn(&phys_to_machine_mapping[i]); | |
29229 | + } | |
29230 | + HYPERVISOR_shared_info->arch.max_pfn = end_pfn; | |
29231 | + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | |
29232 | + virt_to_mfn(pfn_to_mfn_frame_list_list); | |
29233 | + } | |
29234 | + | |
29235 | + /* Mark all ISA DMA channels in-use - using them wouldn't work. */ | |
29236 | + for (i = 0; i < MAX_DMA_CHANNELS; ++i) | |
29237 | + if (i != 4 && request_dma(i, "xen") != 0) | |
29238 | + BUG(); | |
29239 | + } | |
29240 | + | |
29241 | + if (!is_initial_xendomain()) { | |
29242 | + acpi_disabled = 1; | |
29243 | +#ifdef CONFIG_ACPI | |
29244 | + acpi_ht = 0; | |
29245 | +#endif | |
29246 | + } | |
29247 | +#endif | |
29248 | + | |
29249 | +#ifndef CONFIG_XEN | |
29250 | + check_ioapic(); | |
29251 | +#endif | |
29252 | + | |
29253 | + zap_low_mappings(0); | |
29254 | + | |
29255 | + /* | |
29256 | + * set this early, so we dont allocate cpu0 | |
29257 | + * if MADT list doesnt list BSP first | |
29258 | + * mpparse.c/MP_processor_info() allocates logical cpu numbers. | |
29259 | + */ | |
29260 | + cpu_set(0, cpu_present_map); | |
29261 | +#ifdef CONFIG_ACPI | |
29262 | + /* | |
29263 | + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | |
29264 | + * Call this early for SRAT node setup. | |
29265 | + */ | |
29266 | + acpi_boot_table_init(); | |
29267 | + | |
29268 | + /* | |
29269 | + * Read APIC and some other early information from ACPI tables. | |
29270 | + */ | |
29271 | + acpi_boot_init(); | |
29272 | +#endif | |
29273 | + | |
29274 | + init_cpu_to_node(); | |
29275 | + | |
29276 | +#ifdef CONFIG_X86_LOCAL_APIC | |
29277 | + /* | |
29278 | + * get boot-time SMP configuration: | |
29279 | + */ | |
29280 | + if (smp_found_config) | |
29281 | + get_smp_config(); | |
29282 | +#ifndef CONFIG_XEN | |
29283 | + init_apic_mappings(); | |
29284 | +#endif | |
29285 | +#endif | |
29286 | +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) | |
29287 | + prefill_possible_map(); | |
29288 | +#endif | |
29289 | + | |
29290 | + /* | |
29291 | + * Request address space for all standard RAM and ROM resources | |
29292 | + * and also for regions reported as reserved by the e820. | |
29293 | + */ | |
29294 | + probe_roms(); | |
29295 | +#ifdef CONFIG_XEN | |
29296 | + if (is_initial_xendomain()) | |
29297 | + e820_reserve_resources(machine_e820.map, machine_e820.nr_map); | |
29298 | +#else | |
29299 | + e820_reserve_resources(e820.map, e820.nr_map); | |
29300 | +#endif | |
29301 | + | |
29302 | + request_resource(&iomem_resource, &video_ram_resource); | |
29303 | + | |
29304 | + { | |
29305 | + unsigned i; | |
29306 | + /* request I/O space for devices used on all i[345]86 PCs */ | |
29307 | + for (i = 0; i < STANDARD_IO_RESOURCES; i++) | |
29308 | + request_resource(&ioport_resource, &standard_io_resources[i]); | |
29309 | + } | |
29310 | + | |
29311 | +#ifdef CONFIG_XEN | |
29312 | + if (is_initial_xendomain()) | |
29313 | + e820_setup_gap(machine_e820.map, machine_e820.nr_map); | |
29314 | +#else | |
29315 | + e820_setup_gap(e820.map, e820.nr_map); | |
29316 | +#endif | |
29317 | + | |
29318 | +#ifdef CONFIG_XEN | |
29319 | + { | |
29320 | + struct physdev_set_iopl set_iopl; | |
29321 | + | |
29322 | + set_iopl.iopl = 1; | |
29323 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
29324 | + | |
29325 | + if (is_initial_xendomain()) { | |
29326 | +#ifdef CONFIG_VT | |
29327 | +#if defined(CONFIG_VGA_CONSOLE) | |
29328 | + conswitchp = &vga_con; | |
29329 | +#elif defined(CONFIG_DUMMY_CONSOLE) | |
29330 | + conswitchp = &dummy_con; | |
29331 | +#endif | |
29332 | +#endif | |
29333 | + } else { | |
29334 | +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE) | |
29335 | + conswitchp = &dummy_con; | |
29336 | +#endif | |
29337 | + } | |
29338 | + } | |
29339 | +#else /* CONFIG_XEN */ | |
29340 | + | |
29341 | +#ifdef CONFIG_VT | |
29342 | +#if defined(CONFIG_VGA_CONSOLE) | |
29343 | + conswitchp = &vga_con; | |
29344 | +#elif defined(CONFIG_DUMMY_CONSOLE) | |
29345 | + conswitchp = &dummy_con; | |
29346 | +#endif | |
29347 | +#endif | |
29348 | + | |
29349 | +#endif /* !CONFIG_XEN */ | |
29350 | +} | |
29351 | + | |
29352 | +#ifdef CONFIG_XEN | |
29353 | +static int | |
29354 | +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) | |
29355 | +{ | |
29356 | + HYPERVISOR_shutdown(SHUTDOWN_crash); | |
29357 | + /* we're never actually going to get here... */ | |
29358 | + return NOTIFY_DONE; | |
29359 | +} | |
29360 | +#endif /* !CONFIG_XEN */ | |
29361 | + | |
29362 | + | |
29363 | +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) | |
29364 | +{ | |
29365 | + unsigned int *v; | |
29366 | + | |
29367 | + if (c->extended_cpuid_level < 0x80000004) | |
29368 | + return 0; | |
29369 | + | |
29370 | + v = (unsigned int *) c->x86_model_id; | |
29371 | + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | |
29372 | + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | |
29373 | + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | |
29374 | + c->x86_model_id[48] = 0; | |
29375 | + return 1; | |
29376 | +} | |
29377 | + | |
29378 | + | |
29379 | +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | |
29380 | +{ | |
29381 | + unsigned int n, dummy, eax, ebx, ecx, edx; | |
29382 | + | |
29383 | + n = c->extended_cpuid_level; | |
29384 | + | |
29385 | + if (n >= 0x80000005) { | |
29386 | + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | |
29387 | + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | |
29388 | + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | |
29389 | + c->x86_cache_size=(ecx>>24)+(edx>>24); | |
29390 | + /* On K8 L1 TLB is inclusive, so don't count it */ | |
29391 | + c->x86_tlbsize = 0; | |
29392 | + } | |
29393 | + | |
29394 | + if (n >= 0x80000006) { | |
29395 | + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | |
29396 | + ecx = cpuid_ecx(0x80000006); | |
29397 | + c->x86_cache_size = ecx >> 16; | |
29398 | + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | |
29399 | + | |
29400 | + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | |
29401 | + c->x86_cache_size, ecx & 0xFF); | |
29402 | + } | |
29403 | + | |
29404 | + if (n >= 0x80000007) | |
29405 | + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | |
29406 | + if (n >= 0x80000008) { | |
29407 | + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | |
29408 | + c->x86_virt_bits = (eax >> 8) & 0xff; | |
29409 | + c->x86_phys_bits = eax & 0xff; | |
29410 | + } | |
29411 | +} | |
29412 | + | |
29413 | +#ifdef CONFIG_NUMA | |
29414 | +static int nearby_node(int apicid) | |
29415 | +{ | |
29416 | + int i; | |
29417 | + for (i = apicid - 1; i >= 0; i--) { | |
29418 | + int node = apicid_to_node[i]; | |
29419 | + if (node != NUMA_NO_NODE && node_online(node)) | |
29420 | + return node; | |
29421 | + } | |
29422 | + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | |
29423 | + int node = apicid_to_node[i]; | |
29424 | + if (node != NUMA_NO_NODE && node_online(node)) | |
29425 | + return node; | |
29426 | + } | |
29427 | + return first_node(node_online_map); /* Shouldn't happen */ | |
29428 | +} | |
29429 | +#endif | |
29430 | + | |
29431 | +/* | |
29432 | + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. | |
29433 | + * Assumes number of cores is a power of two. | |
29434 | + */ | |
29435 | +static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | |
29436 | +{ | |
29437 | +#ifdef CONFIG_SMP | |
29438 | + unsigned bits; | |
29439 | +#ifdef CONFIG_NUMA | |
29440 | + int cpu = smp_processor_id(); | |
29441 | + int node = 0; | |
29442 | + unsigned apicid = hard_smp_processor_id(); | |
29443 | +#endif | |
29444 | + unsigned ecx = cpuid_ecx(0x80000008); | |
29445 | + | |
29446 | + c->x86_max_cores = (ecx & 0xff) + 1; | |
29447 | + | |
29448 | + /* CPU telling us the core id bits shift? */ | |
29449 | + bits = (ecx >> 12) & 0xF; | |
29450 | + | |
29451 | + /* Otherwise recompute */ | |
29452 | + if (bits == 0) { | |
29453 | + while ((1 << bits) < c->x86_max_cores) | |
29454 | + bits++; | |
29455 | + } | |
29456 | + | |
29457 | + /* Low order bits define the core id (index of core in socket) */ | |
29458 | + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); | |
29459 | + /* Convert the APIC ID into the socket ID */ | |
29460 | + c->phys_proc_id = phys_pkg_id(bits); | |
29461 | + | |
29462 | +#ifdef CONFIG_NUMA | |
29463 | + node = c->phys_proc_id; | |
29464 | + if (apicid_to_node[apicid] != NUMA_NO_NODE) | |
29465 | + node = apicid_to_node[apicid]; | |
29466 | + if (!node_online(node)) { | |
29467 | + /* Two possibilities here: | |
29468 | + - The CPU is missing memory and no node was created. | |
29469 | + In that case try picking one from a nearby CPU | |
29470 | + - The APIC IDs differ from the HyperTransport node IDs | |
29471 | + which the K8 northbridge parsing fills in. | |
29472 | + Assume they are all increased by a constant offset, | |
29473 | + but in the same order as the HT nodeids. | |
29474 | + If that doesn't result in a usable node fall back to the | |
29475 | + path for the previous case. */ | |
29476 | + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); | |
29477 | + if (ht_nodeid >= 0 && | |
29478 | + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | |
29479 | + node = apicid_to_node[ht_nodeid]; | |
29480 | + /* Pick a nearby node */ | |
29481 | + if (!node_online(node)) | |
29482 | + node = nearby_node(apicid); | |
29483 | + } | |
29484 | + numa_set_node(cpu, node); | |
29485 | + | |
29486 | + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | |
29487 | +#endif | |
29488 | +#endif | |
29489 | +} | |
29490 | + | |
29491 | +static void __init init_amd(struct cpuinfo_x86 *c) | |
29492 | +{ | |
29493 | + unsigned level; | |
29494 | + | |
29495 | +#ifdef CONFIG_SMP | |
29496 | + unsigned long value; | |
29497 | + | |
29498 | + /* | |
29499 | + * Disable TLB flush filter by setting HWCR.FFDIS on K8 | |
29500 | + * bit 6 of msr C001_0015 | |
29501 | + * | |
29502 | + * Errata 63 for SH-B3 steppings | |
29503 | + * Errata 122 for all steppings (F+ have it disabled by default) | |
29504 | + */ | |
29505 | + if (c->x86 == 15) { | |
29506 | + rdmsrl(MSR_K8_HWCR, value); | |
29507 | + value |= 1 << 6; | |
29508 | + wrmsrl(MSR_K8_HWCR, value); | |
29509 | + } | |
29510 | +#endif | |
29511 | + | |
29512 | + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | |
29513 | + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ | |
29514 | + clear_bit(0*32+31, &c->x86_capability); | |
29515 | + | |
29516 | + /* On C+ stepping K8 rep microcode works well for copy/memset */ | |
29517 | + level = cpuid_eax(1); | |
29518 | + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) | |
29519 | + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); | |
29520 | + | |
29521 | + /* Enable workaround for FXSAVE leak */ | |
29522 | + if (c->x86 >= 6) | |
29523 | + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); | |
29524 | + | |
29525 | + level = get_model_name(c); | |
29526 | + if (!level) { | |
29527 | + switch (c->x86) { | |
29528 | + case 15: | |
29529 | + /* Should distinguish Models here, but this is only | |
29530 | + a fallback anyways. */ | |
29531 | + strcpy(c->x86_model_id, "Hammer"); | |
29532 | + break; | |
29533 | + } | |
29534 | + } | |
29535 | + display_cacheinfo(c); | |
29536 | + | |
29537 | + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | |
29538 | + if (c->x86_power & (1<<8)) | |
29539 | + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | |
29540 | + | |
29541 | + /* Multi core CPU? */ | |
29542 | + if (c->extended_cpuid_level >= 0x80000008) | |
29543 | + amd_detect_cmp(c); | |
29544 | + | |
29545 | + /* Fix cpuid4 emulation for more */ | |
29546 | + num_cache_leaves = 3; | |
29547 | +} | |
29548 | + | |
29549 | +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) | |
29550 | +{ | |
29551 | +#ifdef CONFIG_SMP | |
29552 | + u32 eax, ebx, ecx, edx; | |
29553 | + int index_msb, core_bits; | |
29554 | + | |
29555 | + cpuid(1, &eax, &ebx, &ecx, &edx); | |
29556 | + | |
29557 | + | |
29558 | + if (!cpu_has(c, X86_FEATURE_HT)) | |
29559 | + return; | |
29560 | + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | |
29561 | + goto out; | |
29562 | + | |
29563 | + smp_num_siblings = (ebx & 0xff0000) >> 16; | |
29564 | + | |
29565 | + if (smp_num_siblings == 1) { | |
29566 | + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | |
29567 | + } else if (smp_num_siblings > 1 ) { | |
29568 | + | |
29569 | + if (smp_num_siblings > NR_CPUS) { | |
29570 | + printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); | |
29571 | + smp_num_siblings = 1; | |
29572 | + return; | |
29573 | + } | |
29574 | + | |
29575 | + index_msb = get_count_order(smp_num_siblings); | |
29576 | + c->phys_proc_id = phys_pkg_id(index_msb); | |
29577 | + | |
29578 | + smp_num_siblings = smp_num_siblings / c->x86_max_cores; | |
29579 | + | |
29580 | + index_msb = get_count_order(smp_num_siblings) ; | |
29581 | + | |
29582 | + core_bits = get_count_order(c->x86_max_cores); | |
29583 | + | |
29584 | + c->cpu_core_id = phys_pkg_id(index_msb) & | |
29585 | + ((1 << core_bits) - 1); | |
29586 | + } | |
29587 | +out: | |
29588 | + if ((c->x86_max_cores * smp_num_siblings) > 1) { | |
29589 | + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); | |
29590 | + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); | |
29591 | + } | |
29592 | + | |
29593 | +#endif | |
29594 | +} | |
29595 | + | |
29596 | +/* | |
29597 | + * find out the number of processor cores on the die | |
29598 | + */ | |
29599 | +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | |
29600 | +{ | |
29601 | + unsigned int eax, t; | |
29602 | + | |
29603 | + if (c->cpuid_level < 4) | |
29604 | + return 1; | |
29605 | + | |
29606 | + cpuid_count(4, 0, &eax, &t, &t, &t); | |
29607 | + | |
29608 | + if (eax & 0x1f) | |
29609 | + return ((eax >> 26) + 1); | |
29610 | + else | |
29611 | + return 1; | |
29612 | +} | |
29613 | + | |
29614 | +static void srat_detect_node(void) | |
29615 | +{ | |
29616 | +#ifdef CONFIG_NUMA | |
29617 | + unsigned node; | |
29618 | + int cpu = smp_processor_id(); | |
29619 | + int apicid = hard_smp_processor_id(); | |
29620 | + | |
29621 | + /* Don't do the funky fallback heuristics the AMD version employs | |
29622 | + for now. */ | |
29623 | + node = apicid_to_node[apicid]; | |
29624 | + if (node == NUMA_NO_NODE) | |
29625 | + node = first_node(node_online_map); | |
29626 | + numa_set_node(cpu, node); | |
29627 | + | |
29628 | + if (acpi_numa > 0) | |
29629 | + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | |
29630 | +#endif | |
29631 | +} | |
29632 | + | |
29633 | +static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |
29634 | +{ | |
29635 | + /* Cache sizes */ | |
29636 | + unsigned n; | |
29637 | + | |
29638 | + init_intel_cacheinfo(c); | |
29639 | + if (c->cpuid_level > 9 ) { | |
29640 | + unsigned eax = cpuid_eax(10); | |
29641 | + /* Check for version and the number of counters */ | |
29642 | + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | |
29643 | + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); | |
29644 | + } | |
29645 | + | |
29646 | + n = c->extended_cpuid_level; | |
29647 | + if (n >= 0x80000008) { | |
29648 | + unsigned eax = cpuid_eax(0x80000008); | |
29649 | + c->x86_virt_bits = (eax >> 8) & 0xff; | |
29650 | + c->x86_phys_bits = eax & 0xff; | |
29651 | + /* CPUID workaround for Intel 0F34 CPU */ | |
29652 | + if (c->x86_vendor == X86_VENDOR_INTEL && | |
29653 | + c->x86 == 0xF && c->x86_model == 0x3 && | |
29654 | + c->x86_mask == 0x4) | |
29655 | + c->x86_phys_bits = 36; | |
29656 | + } | |
29657 | + | |
29658 | + if (c->x86 == 15) | |
29659 | + c->x86_cache_alignment = c->x86_clflush_size * 2; | |
29660 | + if ((c->x86 == 0xf && c->x86_model >= 0x03) || | |
29661 | + (c->x86 == 0x6 && c->x86_model >= 0x0e)) | |
29662 | + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); | |
29663 | + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); | |
29664 | + c->x86_max_cores = intel_num_cpu_cores(c); | |
29665 | + | |
29666 | + srat_detect_node(); | |
29667 | +} | |
29668 | + | |
29669 | +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | |
29670 | +{ | |
29671 | + char *v = c->x86_vendor_id; | |
29672 | + | |
29673 | + if (!strcmp(v, "AuthenticAMD")) | |
29674 | + c->x86_vendor = X86_VENDOR_AMD; | |
29675 | + else if (!strcmp(v, "GenuineIntel")) | |
29676 | + c->x86_vendor = X86_VENDOR_INTEL; | |
29677 | + else | |
29678 | + c->x86_vendor = X86_VENDOR_UNKNOWN; | |
29679 | +} | |
29680 | + | |
29681 | +struct cpu_model_info { | |
29682 | + int vendor; | |
29683 | + int family; | |
29684 | + char *model_names[16]; | |
29685 | +}; | |
29686 | + | |
29687 | +/* Do some early cpuid on the boot CPU to get some parameter that are | |
29688 | + needed before check_bugs. Everything advanced is in identify_cpu | |
29689 | + below. */ | |
29690 | +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | |
29691 | +{ | |
29692 | + u32 tfms; | |
29693 | + | |
29694 | + c->loops_per_jiffy = loops_per_jiffy; | |
29695 | + c->x86_cache_size = -1; | |
29696 | + c->x86_vendor = X86_VENDOR_UNKNOWN; | |
29697 | + c->x86_model = c->x86_mask = 0; /* So far unknown... */ | |
29698 | + c->x86_vendor_id[0] = '\0'; /* Unset */ | |
29699 | + c->x86_model_id[0] = '\0'; /* Unset */ | |
29700 | + c->x86_clflush_size = 64; | |
29701 | + c->x86_cache_alignment = c->x86_clflush_size; | |
29702 | + c->x86_max_cores = 1; | |
29703 | + c->extended_cpuid_level = 0; | |
29704 | + memset(&c->x86_capability, 0, sizeof c->x86_capability); | |
29705 | + | |
29706 | + /* Get vendor name */ | |
29707 | + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | |
29708 | + (unsigned int *)&c->x86_vendor_id[0], | |
29709 | + (unsigned int *)&c->x86_vendor_id[8], | |
29710 | + (unsigned int *)&c->x86_vendor_id[4]); | |
29711 | + | |
29712 | + get_cpu_vendor(c); | |
29713 | + | |
29714 | + /* Initialize the standard set of capabilities */ | |
29715 | + /* Note that the vendor-specific code below might override */ | |
29716 | + | |
29717 | + /* Intel-defined flags: level 0x00000001 */ | |
29718 | + if (c->cpuid_level >= 0x00000001) { | |
29719 | + __u32 misc; | |
29720 | + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | |
29721 | + &c->x86_capability[0]); | |
29722 | + c->x86 = (tfms >> 8) & 0xf; | |
29723 | + c->x86_model = (tfms >> 4) & 0xf; | |
29724 | + c->x86_mask = tfms & 0xf; | |
29725 | + if (c->x86 == 0xf) | |
29726 | + c->x86 += (tfms >> 20) & 0xff; | |
29727 | + if (c->x86 >= 0x6) | |
29728 | + c->x86_model += ((tfms >> 16) & 0xF) << 4; | |
29729 | + if (c->x86_capability[0] & (1<<19)) | |
29730 | + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | |
29731 | + } else { | |
29732 | + /* Have CPUID level 0 only - unheard of */ | |
29733 | + c->x86 = 4; | |
29734 | + } | |
29735 | + | |
29736 | +#ifdef CONFIG_SMP | |
29737 | + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; | |
29738 | +#endif | |
29739 | +} | |
29740 | + | |
29741 | +/* | |
29742 | + * This does the hard work of actually picking apart the CPU stuff... | |
29743 | + */ | |
29744 | +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |
29745 | +{ | |
29746 | + int i; | |
29747 | + u32 xlvl; | |
29748 | + | |
29749 | + early_identify_cpu(c); | |
29750 | + | |
29751 | + /* AMD-defined flags: level 0x80000001 */ | |
29752 | + xlvl = cpuid_eax(0x80000000); | |
29753 | + c->extended_cpuid_level = xlvl; | |
29754 | + if ((xlvl & 0xffff0000) == 0x80000000) { | |
29755 | + if (xlvl >= 0x80000001) { | |
29756 | + c->x86_capability[1] = cpuid_edx(0x80000001); | |
29757 | + c->x86_capability[6] = cpuid_ecx(0x80000001); | |
29758 | + } | |
29759 | + if (xlvl >= 0x80000004) | |
29760 | + get_model_name(c); /* Default name */ | |
29761 | + } | |
29762 | + | |
29763 | + /* Transmeta-defined flags: level 0x80860001 */ | |
29764 | + xlvl = cpuid_eax(0x80860000); | |
29765 | + if ((xlvl & 0xffff0000) == 0x80860000) { | |
29766 | + /* Don't set x86_cpuid_level here for now to not confuse. */ | |
29767 | + if (xlvl >= 0x80860001) | |
29768 | + c->x86_capability[2] = cpuid_edx(0x80860001); | |
29769 | + } | |
29770 | + | |
29771 | + c->apicid = phys_pkg_id(0); | |
29772 | + | |
29773 | + /* | |
29774 | + * Vendor-specific initialization. In this section we | |
29775 | + * canonicalize the feature flags, meaning if there are | |
29776 | + * features a certain CPU supports which CPUID doesn't | |
29777 | + * tell us, CPUID claiming incorrect flags, or other bugs, | |
29778 | + * we handle them here. | |
29779 | + * | |
29780 | + * At the end of this section, c->x86_capability better | |
29781 | + * indicate the features this CPU genuinely supports! | |
29782 | + */ | |
29783 | + switch (c->x86_vendor) { | |
29784 | + case X86_VENDOR_AMD: | |
29785 | + init_amd(c); | |
29786 | + break; | |
29787 | + | |
29788 | + case X86_VENDOR_INTEL: | |
29789 | + init_intel(c); | |
29790 | + break; | |
29791 | + | |
29792 | + case X86_VENDOR_UNKNOWN: | |
29793 | + default: | |
29794 | + display_cacheinfo(c); | |
29795 | + break; | |
29796 | + } | |
29797 | + | |
29798 | + select_idle_routine(c); | |
29799 | + detect_ht(c); | |
29800 | + | |
29801 | + /* | |
29802 | + * On SMP, boot_cpu_data holds the common feature set between | |
29803 | + * all CPUs; so make sure that we indicate which features are | |
29804 | + * common between the CPUs. The first time this routine gets | |
29805 | + * executed, c == &boot_cpu_data. | |
29806 | + */ | |
29807 | + if (c != &boot_cpu_data) { | |
29808 | + /* AND the already accumulated flags with these */ | |
29809 | + for (i = 0 ; i < NCAPINTS ; i++) | |
29810 | + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | |
29811 | + } | |
29812 | + | |
29813 | +#ifdef CONFIG_X86_MCE | |
29814 | + mcheck_init(c); | |
29815 | +#endif | |
29816 | + if (c == &boot_cpu_data) | |
29817 | + mtrr_bp_init(); | |
29818 | + else | |
29819 | + mtrr_ap_init(); | |
29820 | +#ifdef CONFIG_NUMA | |
29821 | + numa_add_cpu(smp_processor_id()); | |
29822 | +#endif | |
29823 | +} | |
29824 | + | |
29825 | + | |
29826 | +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |
29827 | +{ | |
29828 | + if (c->x86_model_id[0]) | |
29829 | + printk("%s", c->x86_model_id); | |
29830 | + | |
29831 | + if (c->x86_mask || c->cpuid_level >= 0) | |
29832 | + printk(" stepping %02x\n", c->x86_mask); | |
29833 | + else | |
29834 | + printk("\n"); | |
29835 | +} | |
29836 | + | |
29837 | +/* | |
29838 | + * Get CPU information for use by the procfs. | |
29839 | + */ | |
29840 | + | |
29841 | +static int show_cpuinfo(struct seq_file *m, void *v) | |
29842 | +{ | |
29843 | + struct cpuinfo_x86 *c = v; | |
29844 | + | |
29845 | + /* | |
29846 | + * These flag bits must match the definitions in <asm/cpufeature.h>. | |
29847 | + * NULL means this bit is undefined or reserved; either way it doesn't | |
29848 | + * have meaning as far as Linux is concerned. Note that it's important | |
29849 | + * to realize there is a difference between this table and CPUID -- if | |
29850 | + * applications want to get the raw CPUID data, they should access | |
29851 | + * /dev/cpu/<cpu_nr>/cpuid instead. | |
29852 | + */ | |
29853 | + static char *x86_cap_flags[] = { | |
29854 | + /* Intel-defined */ | |
29855 | + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | |
29856 | + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | |
29857 | + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | |
29858 | + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, | |
29859 | + | |
29860 | + /* AMD-defined */ | |
29861 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29862 | + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | |
29863 | + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | |
29864 | + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", | |
29865 | + | |
29866 | + /* Transmeta-defined */ | |
29867 | + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | |
29868 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29869 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29870 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29871 | + | |
29872 | + /* Other (Linux-defined) */ | |
29873 | + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, | |
29874 | + "constant_tsc", NULL, NULL, | |
29875 | + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29876 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29877 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29878 | + | |
29879 | + /* Intel-defined (#2) */ | |
29880 | + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", | |
29881 | + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, | |
29882 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29883 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29884 | + | |
29885 | + /* VIA/Cyrix/Centaur-defined */ | |
29886 | + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", | |
29887 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29888 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29889 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29890 | + | |
29891 | + /* AMD-defined (#2) */ | |
29892 | + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, | |
29893 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29894 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29895 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
29896 | + }; | |
29897 | + static char *x86_power_flags[] = { | |
29898 | + "ts", /* temperature sensor */ | |
29899 | + "fid", /* frequency id control */ | |
29900 | + "vid", /* voltage id control */ | |
29901 | + "ttp", /* thermal trip */ | |
29902 | + "tm", | |
29903 | + "stc", | |
29904 | + NULL, | |
29905 | + /* nothing */ /* constant_tsc - moved to flags */ | |
29906 | + }; | |
29907 | + | |
29908 | + | |
29909 | +#ifdef CONFIG_SMP | |
29910 | + if (!cpu_online(c-cpu_data)) | |
29911 | + return 0; | |
29912 | +#endif | |
29913 | + | |
29914 | + seq_printf(m,"processor\t: %u\n" | |
29915 | + "vendor_id\t: %s\n" | |
29916 | + "cpu family\t: %d\n" | |
29917 | + "model\t\t: %d\n" | |
29918 | + "model name\t: %s\n", | |
29919 | + (unsigned)(c-cpu_data), | |
29920 | + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | |
29921 | + c->x86, | |
29922 | + (int)c->x86_model, | |
29923 | + c->x86_model_id[0] ? c->x86_model_id : "unknown"); | |
29924 | + | |
29925 | + if (c->x86_mask || c->cpuid_level >= 0) | |
29926 | + seq_printf(m, "stepping\t: %d\n", c->x86_mask); | |
29927 | + else | |
29928 | + seq_printf(m, "stepping\t: unknown\n"); | |
29929 | + | |
29930 | + if (cpu_has(c,X86_FEATURE_TSC)) { | |
29931 | + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); | |
29932 | + if (!freq) | |
29933 | + freq = cpu_khz; | |
29934 | + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | |
29935 | + freq / 1000, (freq % 1000)); | |
29936 | + } | |
29937 | + | |
29938 | + /* Cache size */ | |
29939 | + if (c->x86_cache_size >= 0) | |
29940 | + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | |
29941 | + | |
29942 | +#ifdef CONFIG_SMP | |
29943 | + if (smp_num_siblings * c->x86_max_cores > 1) { | |
29944 | + int cpu = c - cpu_data; | |
29945 | + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); | |
29946 | + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); | |
29947 | + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); | |
29948 | + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); | |
29949 | + } | |
29950 | +#endif | |
29951 | + | |
29952 | + seq_printf(m, | |
29953 | + "fpu\t\t: yes\n" | |
29954 | + "fpu_exception\t: yes\n" | |
29955 | + "cpuid level\t: %d\n" | |
29956 | + "wp\t\t: yes\n" | |
29957 | + "flags\t\t:", | |
29958 | + c->cpuid_level); | |
29959 | + | |
29960 | + { | |
29961 | + int i; | |
29962 | + for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | |
29963 | + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) | |
29964 | + seq_printf(m, " %s", x86_cap_flags[i]); | |
29965 | + } | |
29966 | + | |
29967 | + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | |
29968 | + c->loops_per_jiffy/(500000/HZ), | |
29969 | + (c->loops_per_jiffy/(5000/HZ)) % 100); | |
29970 | + | |
29971 | + if (c->x86_tlbsize > 0) | |
29972 | + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | |
29973 | + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | |
29974 | + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | |
29975 | + | |
29976 | + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | |
29977 | + c->x86_phys_bits, c->x86_virt_bits); | |
29978 | + | |
29979 | + seq_printf(m, "power management:"); | |
29980 | + { | |
29981 | + unsigned i; | |
29982 | + for (i = 0; i < 32; i++) | |
29983 | + if (c->x86_power & (1 << i)) { | |
29984 | + if (i < ARRAY_SIZE(x86_power_flags) && | |
29985 | + x86_power_flags[i]) | |
29986 | + seq_printf(m, "%s%s", | |
29987 | + x86_power_flags[i][0]?" ":"", | |
29988 | + x86_power_flags[i]); | |
29989 | + else | |
29990 | + seq_printf(m, " [%d]", i); | |
29991 | + } | |
29992 | + } | |
29993 | + | |
29994 | + seq_printf(m, "\n\n"); | |
29995 | + | |
29996 | + return 0; | |
29997 | +} | |
29998 | + | |
29999 | +static void *c_start(struct seq_file *m, loff_t *pos) | |
30000 | +{ | |
30001 | + return *pos < NR_CPUS ? cpu_data + *pos : NULL; | |
30002 | +} | |
30003 | + | |
30004 | +static void *c_next(struct seq_file *m, void *v, loff_t *pos) | |
30005 | +{ | |
30006 | + ++*pos; | |
30007 | + return c_start(m, pos); | |
30008 | +} | |
30009 | + | |
30010 | +static void c_stop(struct seq_file *m, void *v) | |
30011 | +{ | |
30012 | +} | |
30013 | + | |
30014 | +struct seq_operations cpuinfo_op = { | |
30015 | + .start =c_start, | |
30016 | + .next = c_next, | |
30017 | + .stop = c_stop, | |
30018 | + .show = show_cpuinfo, | |
30019 | +}; | |
30020 | + | |
30021 | +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) | |
30022 | +#include <linux/platform_device.h> | |
30023 | +static __init int add_pcspkr(void) | |
30024 | +{ | |
30025 | + struct platform_device *pd; | |
30026 | + int ret; | |
30027 | + | |
30028 | + if (!is_initial_xendomain()) | |
30029 | + return 0; | |
30030 | + | |
30031 | + pd = platform_device_alloc("pcspkr", -1); | |
30032 | + if (!pd) | |
30033 | + return -ENOMEM; | |
30034 | + | |
30035 | + ret = platform_device_add(pd); | |
30036 | + if (ret) | |
30037 | + platform_device_put(pd); | |
30038 | + | |
30039 | + return ret; | |
30040 | +} | |
30041 | +device_initcall(add_pcspkr); | |
30042 | +#endif | |
30043 | Index: head-2008-11-25/arch/x86/kernel/setup64-xen.c | |
30044 | =================================================================== | |
30045 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
30046 | +++ head-2008-11-25/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100 | |
30047 | @@ -0,0 +1,367 @@ | |
30048 | +/* | |
30049 | + * X86-64 specific CPU setup. | |
30050 | + * Copyright (C) 1995 Linus Torvalds | |
30051 | + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | |
30052 | + * See setup.c for older changelog. | |
30053 | + * | |
30054 | + * Jun Nakajima <jun.nakajima@intel.com> | |
30055 | + * Modified for Xen | |
30056 | + * | |
30057 | + */ | |
30058 | +#include <linux/init.h> | |
30059 | +#include <linux/kernel.h> | |
30060 | +#include <linux/sched.h> | |
30061 | +#include <linux/string.h> | |
30062 | +#include <linux/bootmem.h> | |
30063 | +#include <linux/bitops.h> | |
30064 | +#include <linux/module.h> | |
30065 | +#include <asm/bootsetup.h> | |
30066 | +#include <asm/pda.h> | |
30067 | +#include <asm/pgtable.h> | |
30068 | +#include <asm/processor.h> | |
30069 | +#include <asm/desc.h> | |
30070 | +#include <asm/atomic.h> | |
30071 | +#include <asm/mmu_context.h> | |
30072 | +#include <asm/smp.h> | |
30073 | +#include <asm/i387.h> | |
30074 | +#include <asm/percpu.h> | |
30075 | +#include <asm/proto.h> | |
30076 | +#include <asm/sections.h> | |
30077 | +#ifdef CONFIG_XEN | |
30078 | +#include <asm/hypervisor.h> | |
30079 | +#endif | |
30080 | + | |
30081 | +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; | |
30082 | + | |
30083 | +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | |
30084 | + | |
30085 | +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; | |
30086 | +EXPORT_SYMBOL(_cpu_pda); | |
30087 | +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; | |
30088 | + | |
30089 | +#ifndef CONFIG_X86_NO_IDT | |
30090 | +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | |
30091 | +#endif | |
30092 | + | |
30093 | +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | |
30094 | + | |
30095 | +unsigned long __supported_pte_mask __read_mostly = ~0UL; | |
30096 | +EXPORT_SYMBOL(__supported_pte_mask); | |
30097 | +static int do_not_nx __cpuinitdata = 0; | |
30098 | + | |
30099 | +/* noexec=on|off | |
30100 | +Control non executable mappings for 64bit processes. | |
30101 | + | |
30102 | +on Enable(default) | |
30103 | +off Disable | |
30104 | +*/ | |
30105 | +int __init nonx_setup(char *str) | |
30106 | +{ | |
30107 | + if (!strncmp(str, "on", 2)) { | |
30108 | + __supported_pte_mask |= _PAGE_NX; | |
30109 | + do_not_nx = 0; | |
30110 | + } else if (!strncmp(str, "off", 3)) { | |
30111 | + do_not_nx = 1; | |
30112 | + __supported_pte_mask &= ~_PAGE_NX; | |
30113 | + } | |
30114 | + return 1; | |
30115 | +} | |
30116 | +__setup("noexec=", nonx_setup); /* parsed early actually */ | |
30117 | + | |
30118 | +int force_personality32 = 0; | |
30119 | + | |
30120 | +/* noexec32=on|off | |
30121 | +Control non executable heap for 32bit processes. | |
30122 | +To control the stack too use noexec=off | |
30123 | + | |
30124 | +on PROT_READ does not imply PROT_EXEC for 32bit processes | |
30125 | +off PROT_READ implies PROT_EXEC (default) | |
30126 | +*/ | |
30127 | +static int __init nonx32_setup(char *str) | |
30128 | +{ | |
30129 | + if (!strcmp(str, "on")) | |
30130 | + force_personality32 &= ~READ_IMPLIES_EXEC; | |
30131 | + else if (!strcmp(str, "off")) | |
30132 | + force_personality32 |= READ_IMPLIES_EXEC; | |
30133 | + return 1; | |
30134 | +} | |
30135 | +__setup("noexec32=", nonx32_setup); | |
30136 | + | |
30137 | +/* | |
30138 | + * Great future plan: | |
30139 | + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | |
30140 | + * Always point %gs to its beginning | |
30141 | + */ | |
30142 | +void __init setup_per_cpu_areas(void) | |
30143 | +{ | |
30144 | + int i; | |
30145 | + unsigned long size; | |
30146 | + | |
30147 | +#ifdef CONFIG_HOTPLUG_CPU | |
30148 | + prefill_possible_map(); | |
30149 | +#endif | |
30150 | + | |
30151 | + /* Copy section for each CPU (we discard the original) */ | |
30152 | + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); | |
30153 | +#ifdef CONFIG_MODULES | |
30154 | + if (size < PERCPU_ENOUGH_ROOM) | |
30155 | + size = PERCPU_ENOUGH_ROOM; | |
30156 | +#endif | |
30157 | + | |
30158 | + for_each_cpu_mask (i, cpu_possible_map) { | |
30159 | + char *ptr; | |
30160 | + | |
30161 | + if (!NODE_DATA(cpu_to_node(i))) { | |
30162 | + printk("cpu with no node %d, num_online_nodes %d\n", | |
30163 | + i, num_online_nodes()); | |
30164 | + ptr = alloc_bootmem(size); | |
30165 | + } else { | |
30166 | + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); | |
30167 | + } | |
30168 | + if (!ptr) | |
30169 | + panic("Cannot allocate cpu data for CPU %d\n", i); | |
30170 | + cpu_pda(i)->data_offset = ptr - __per_cpu_start; | |
30171 | + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | |
30172 | + } | |
30173 | +} | |
30174 | + | |
30175 | +#ifdef CONFIG_XEN | |
30176 | +static void switch_pt(void) | |
30177 | +{ | |
30178 | + xen_pt_switch(__pa_symbol(init_level4_pgt)); | |
30179 | + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); | |
30180 | +} | |
30181 | + | |
30182 | +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) | |
30183 | +{ | |
30184 | + unsigned long frames[16]; | |
30185 | + unsigned long va; | |
30186 | + int f; | |
30187 | + | |
30188 | + for (va = gdt_descr->address, f = 0; | |
30189 | + va < gdt_descr->address + gdt_descr->size; | |
30190 | + va += PAGE_SIZE, f++) { | |
30191 | + frames[f] = virt_to_mfn(va); | |
30192 | + make_page_readonly( | |
30193 | + (void *)va, XENFEAT_writable_descriptor_tables); | |
30194 | + } | |
30195 | + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / | |
30196 | + sizeof (struct desc_struct))) | |
30197 | + BUG(); | |
30198 | +} | |
30199 | +#else | |
30200 | +static void switch_pt(void) | |
30201 | +{ | |
30202 | + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); | |
30203 | +} | |
30204 | + | |
30205 | +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) | |
30206 | +{ | |
30207 | + asm volatile("lgdt %0" :: "m" (*gdt_descr)); | |
30208 | + asm volatile("lidt %0" :: "m" (idt_descr)); | |
30209 | +} | |
30210 | +#endif | |
30211 | + | |
30212 | +void pda_init(int cpu) | |
30213 | +{ | |
30214 | + struct x8664_pda *pda = cpu_pda(cpu); | |
30215 | + | |
30216 | + /* Setup up data that may be needed in __get_free_pages early */ | |
30217 | + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | |
30218 | +#ifndef CONFIG_XEN | |
30219 | + wrmsrl(MSR_GS_BASE, pda); | |
30220 | +#else | |
30221 | + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, | |
30222 | + (unsigned long)pda)) | |
30223 | + BUG(); | |
30224 | +#endif | |
30225 | + pda->cpunumber = cpu; | |
30226 | + pda->irqcount = -1; | |
30227 | + pda->kernelstack = | |
30228 | + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | |
30229 | + pda->active_mm = &init_mm; | |
30230 | + pda->mmu_state = 0; | |
30231 | + | |
30232 | + if (cpu == 0) { | |
30233 | +#ifdef CONFIG_XEN | |
30234 | + xen_init_pt(); | |
30235 | +#endif | |
30236 | + /* others are initialized in smpboot.c */ | |
30237 | + pda->pcurrent = &init_task; | |
30238 | + pda->irqstackptr = boot_cpu_stack; | |
30239 | + } else { | |
30240 | + pda->irqstackptr = (char *) | |
30241 | + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | |
30242 | + if (!pda->irqstackptr) | |
30243 | + panic("cannot allocate irqstack for cpu %d", cpu); | |
30244 | + } | |
30245 | + | |
30246 | + switch_pt(); | |
30247 | + | |
30248 | + pda->irqstackptr += IRQSTACKSIZE-64; | |
30249 | +} | |
30250 | + | |
30251 | +#ifndef CONFIG_X86_NO_TSS | |
30252 | +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] | |
30253 | +__attribute__((section(".bss.page_aligned"))); | |
30254 | +#endif | |
30255 | + | |
30256 | +/* May not be marked __init: used by software suspend */ | |
30257 | +void syscall_init(void) | |
30258 | +{ | |
30259 | +#ifndef CONFIG_XEN | |
30260 | + /* | |
30261 | + * LSTAR and STAR live in a bit strange symbiosis. | |
30262 | + * They both write to the same internal register. STAR allows to set CS/DS | |
30263 | + * but only a 32bit target. LSTAR sets the 64bit rip. | |
30264 | + */ | |
30265 | + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | |
30266 | + wrmsrl(MSR_LSTAR, system_call); | |
30267 | + | |
30268 | + /* Flags to clear on syscall */ | |
30269 | + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | |
30270 | +#endif | |
30271 | +#ifdef CONFIG_IA32_EMULATION | |
30272 | + syscall32_cpu_init (); | |
30273 | +#endif | |
30274 | +} | |
30275 | + | |
30276 | +void __cpuinit check_efer(void) | |
30277 | +{ | |
30278 | + unsigned long efer; | |
30279 | + | |
30280 | + rdmsrl(MSR_EFER, efer); | |
30281 | + if (!(efer & EFER_NX) || do_not_nx) { | |
30282 | + __supported_pte_mask &= ~_PAGE_NX; | |
30283 | + } | |
30284 | +} | |
30285 | + | |
30286 | +unsigned long kernel_eflags; | |
30287 | + | |
30288 | +/* | |
30289 | + * cpu_init() initializes state that is per-CPU. Some data is already | |
30290 | + * initialized (naturally) in the bootstrap process, such as the GDT | |
30291 | + * and IDT. We reload them nevertheless, this function acts as a | |
30292 | + * 'CPU state barrier', nothing should get across. | |
30293 | + * A lot of state is already set up in PDA init. | |
30294 | + */ | |
30295 | +void __cpuinit cpu_init (void) | |
30296 | +{ | |
30297 | + int cpu = stack_smp_processor_id(); | |
30298 | +#ifndef CONFIG_X86_NO_TSS | |
30299 | + struct tss_struct *t = &per_cpu(init_tss, cpu); | |
30300 | + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | |
30301 | + unsigned long v; | |
30302 | + char *estacks = NULL; | |
30303 | + unsigned i; | |
30304 | +#endif | |
30305 | + struct task_struct *me; | |
30306 | + | |
30307 | + /* CPU 0 is initialised in head64.c */ | |
30308 | + if (cpu != 0) { | |
30309 | + pda_init(cpu); | |
30310 | + zap_low_mappings(cpu); | |
30311 | + } | |
30312 | +#ifndef CONFIG_X86_NO_TSS | |
30313 | + else | |
30314 | + estacks = boot_exception_stacks; | |
30315 | +#endif | |
30316 | + | |
30317 | + me = current; | |
30318 | + | |
30319 | + if (cpu_test_and_set(cpu, cpu_initialized)) | |
30320 | + panic("CPU#%d already initialized!\n", cpu); | |
30321 | + | |
30322 | + printk("Initializing CPU#%d\n", cpu); | |
30323 | + | |
30324 | + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | |
30325 | + | |
30326 | + /* | |
30327 | + * Initialize the per-CPU GDT with the boot GDT, | |
30328 | + * and set up the GDT descriptor: | |
30329 | + */ | |
30330 | +#ifndef CONFIG_XEN | |
30331 | + if (cpu) | |
30332 | + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); | |
30333 | +#endif | |
30334 | + | |
30335 | + cpu_gdt_descr[cpu].size = GDT_SIZE; | |
30336 | + cpu_gdt_init(&cpu_gdt_descr[cpu]); | |
30337 | + | |
30338 | + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | |
30339 | + syscall_init(); | |
30340 | + | |
30341 | + wrmsrl(MSR_FS_BASE, 0); | |
30342 | + wrmsrl(MSR_KERNEL_GS_BASE, 0); | |
30343 | + barrier(); | |
30344 | + | |
30345 | + check_efer(); | |
30346 | + | |
30347 | +#ifndef CONFIG_X86_NO_TSS | |
30348 | + /* | |
30349 | + * set up and load the per-CPU TSS | |
30350 | + */ | |
30351 | + for (v = 0; v < N_EXCEPTION_STACKS; v++) { | |
30352 | + if (cpu) { | |
30353 | + static const unsigned int order[N_EXCEPTION_STACKS] = { | |
30354 | + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | |
30355 | + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | |
30356 | + }; | |
30357 | + | |
30358 | + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | |
30359 | + if (!estacks) | |
30360 | + panic("Cannot allocate exception stack %ld %d\n", | |
30361 | + v, cpu); | |
30362 | + } | |
30363 | + switch (v + 1) { | |
30364 | +#if DEBUG_STKSZ > EXCEPTION_STKSZ | |
30365 | + case DEBUG_STACK: | |
30366 | + cpu_pda(cpu)->debugstack = (unsigned long)estacks; | |
30367 | + estacks += DEBUG_STKSZ; | |
30368 | + break; | |
30369 | +#endif | |
30370 | + default: | |
30371 | + estacks += EXCEPTION_STKSZ; | |
30372 | + break; | |
30373 | + } | |
30374 | + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; | |
30375 | + } | |
30376 | + | |
30377 | + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | |
30378 | + /* | |
30379 | + * <= is required because the CPU will access up to | |
30380 | + * 8 bits beyond the end of the IO permission bitmap. | |
30381 | + */ | |
30382 | + for (i = 0; i <= IO_BITMAP_LONGS; i++) | |
30383 | + t->io_bitmap[i] = ~0UL; | |
30384 | +#endif | |
30385 | + | |
30386 | + atomic_inc(&init_mm.mm_count); | |
30387 | + me->active_mm = &init_mm; | |
30388 | + if (me->mm) | |
30389 | + BUG(); | |
30390 | + enter_lazy_tlb(&init_mm, me); | |
30391 | + | |
30392 | +#ifndef CONFIG_X86_NO_TSS | |
30393 | + set_tss_desc(cpu, t); | |
30394 | +#endif | |
30395 | +#ifndef CONFIG_XEN | |
30396 | + load_TR_desc(); | |
30397 | +#endif | |
30398 | + load_LDT(&init_mm.context); | |
30399 | + | |
30400 | + /* | |
30401 | + * Clear all 6 debug registers: | |
30402 | + */ | |
30403 | + | |
30404 | + set_debugreg(0UL, 0); | |
30405 | + set_debugreg(0UL, 1); | |
30406 | + set_debugreg(0UL, 2); | |
30407 | + set_debugreg(0UL, 3); | |
30408 | + set_debugreg(0UL, 6); | |
30409 | + set_debugreg(0UL, 7); | |
30410 | + | |
30411 | + fpu_init(); | |
30412 | + | |
30413 | + raw_local_save_flags(kernel_eflags); | |
30414 | +} | |
30415 | Index: head-2008-11-25/arch/x86/kernel/smp_64-xen.c | |
30416 | =================================================================== | |
30417 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
30418 | +++ head-2008-11-25/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200 | |
30419 | @@ -0,0 +1,575 @@ | |
30420 | +/* | |
30421 | + * Intel SMP support routines. | |
30422 | + * | |
30423 | + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | |
30424 | + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | |
30425 | + * (c) 2002,2003 Andi Kleen, SuSE Labs. | |
30426 | + * | |
30427 | + * This code is released under the GNU General Public License version 2 or | |
30428 | + * later. | |
30429 | + */ | |
30430 | + | |
30431 | +#include <linux/init.h> | |
30432 | + | |
30433 | +#include <linux/mm.h> | |
30434 | +#include <linux/delay.h> | |
30435 | +#include <linux/spinlock.h> | |
30436 | +#include <linux/smp_lock.h> | |
30437 | +#include <linux/smp.h> | |
30438 | +#include <linux/kernel_stat.h> | |
30439 | +#include <linux/mc146818rtc.h> | |
30440 | +#include <linux/interrupt.h> | |
30441 | + | |
30442 | +#include <asm/mtrr.h> | |
30443 | +#include <asm/pgalloc.h> | |
30444 | +#include <asm/tlbflush.h> | |
30445 | +#include <asm/mach_apic.h> | |
30446 | +#include <asm/mmu_context.h> | |
30447 | +#include <asm/proto.h> | |
30448 | +#include <asm/apicdef.h> | |
30449 | +#include <asm/idle.h> | |
30450 | +#ifdef CONFIG_XEN | |
30451 | +#include <xen/evtchn.h> | |
30452 | +#endif | |
30453 | + | |
30454 | +#ifndef CONFIG_XEN | |
30455 | +/* | |
30456 | + * Smarter SMP flushing macros. | |
30457 | + * c/o Linus Torvalds. | |
30458 | + * | |
30459 | + * These mean you can really definitely utterly forget about | |
30460 | + * writing to user space from interrupts. (Its not allowed anyway). | |
30461 | + * | |
30462 | + * Optimizations Manfred Spraul <manfred@colorfullife.com> | |
30463 | + * | |
30464 | + * More scalable flush, from Andi Kleen | |
30465 | + * | |
30466 | + * To avoid global state use 8 different call vectors. | |
30467 | + * Each CPU uses a specific vector to trigger flushes on other | |
30468 | + * CPUs. Depending on the received vector the target CPUs look into | |
30469 | + * the right per cpu variable for the flush data. | |
30470 | + * | |
30471 | + * With more than 8 CPUs they are hashed to the 8 available | |
30472 | + * vectors. The limited global vector space forces us to this right now. | |
30473 | + * In future when interrupts are split into per CPU domains this could be | |
30474 | + * fixed, at the cost of triggering multiple IPIs in some cases. | |
30475 | + */ | |
30476 | + | |
30477 | +union smp_flush_state { | |
30478 | + struct { | |
30479 | + cpumask_t flush_cpumask; | |
30480 | + struct mm_struct *flush_mm; | |
30481 | + unsigned long flush_va; | |
30482 | +#define FLUSH_ALL -1ULL | |
30483 | + spinlock_t tlbstate_lock; | |
30484 | + }; | |
30485 | + char pad[SMP_CACHE_BYTES]; | |
30486 | +} ____cacheline_aligned; | |
30487 | + | |
30488 | +/* State is put into the per CPU data section, but padded | |
30489 | + to a full cache line because other CPUs can access it and we don't | |
30490 | + want false sharing in the per cpu data segment. */ | |
30491 | +static DEFINE_PER_CPU(union smp_flush_state, flush_state); | |
30492 | + | |
30493 | +/* | |
30494 | + * We cannot call mmdrop() because we are in interrupt context, | |
30495 | + * instead update mm->cpu_vm_mask. | |
30496 | + */ | |
30497 | +static inline void leave_mm(unsigned long cpu) | |
30498 | +{ | |
30499 | + if (read_pda(mmu_state) == TLBSTATE_OK) | |
30500 | + BUG(); | |
30501 | + cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); | |
30502 | + load_cr3(swapper_pg_dir); | |
30503 | +} | |
30504 | + | |
30505 | +/* | |
30506 | + * | |
30507 | + * The flush IPI assumes that a thread switch happens in this order: | |
30508 | + * [cpu0: the cpu that switches] | |
30509 | + * 1) switch_mm() either 1a) or 1b) | |
30510 | + * 1a) thread switch to a different mm | |
30511 | + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | |
30512 | + * Stop ipi delivery for the old mm. This is not synchronized with | |
30513 | + * the other cpus, but smp_invalidate_interrupt ignore flush ipis | |
30514 | + * for the wrong mm, and in the worst case we perform a superfluous | |
30515 | + * tlb flush. | |
30516 | + * 1a2) set cpu mmu_state to TLBSTATE_OK | |
30517 | + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | |
30518 | + * was in lazy tlb mode. | |
30519 | + * 1a3) update cpu active_mm | |
30520 | + * Now cpu0 accepts tlb flushes for the new mm. | |
30521 | + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | |
30522 | + * Now the other cpus will send tlb flush ipis. | |
30523 | + * 1a4) change cr3. | |
30524 | + * 1b) thread switch without mm change | |
30525 | + * cpu active_mm is correct, cpu0 already handles | |
30526 | + * flush ipis. | |
30527 | + * 1b1) set cpu mmu_state to TLBSTATE_OK | |
30528 | + * 1b2) test_and_set the cpu bit in cpu_vm_mask. | |
30529 | + * Atomically set the bit [other cpus will start sending flush ipis], | |
30530 | + * and test the bit. | |
30531 | + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | |
30532 | + * 2) switch %%esp, ie current | |
30533 | + * | |
30534 | + * The interrupt must handle 2 special cases: | |
30535 | + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | |
30536 | + * - the cpu performs speculative tlb reads, i.e. even if the cpu only | |
30537 | + * runs in kernel space, the cpu could load tlb entries for user space | |
30538 | + * pages. | |
30539 | + * | |
30540 | + * The good news is that cpu mmu_state is local to each cpu, no | |
30541 | + * write/read ordering problems. | |
30542 | + */ | |
30543 | + | |
30544 | +/* | |
30545 | + * TLB flush IPI: | |
30546 | + * | |
30547 | + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | |
30548 | + * 2) Leave the mm if we are in the lazy tlb mode. | |
30549 | + * | |
30550 | + * Interrupts are disabled. | |
30551 | + */ | |
30552 | + | |
30553 | +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) | |
30554 | +{ | |
30555 | + int cpu; | |
30556 | + int sender; | |
30557 | + union smp_flush_state *f; | |
30558 | + | |
30559 | + cpu = smp_processor_id(); | |
30560 | + /* | |
30561 | + * orig_rax contains the negated interrupt vector. | |
30562 | + * Use that to determine where the sender put the data. | |
30563 | + */ | |
30564 | + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; | |
30565 | + f = &per_cpu(flush_state, sender); | |
30566 | + | |
30567 | + if (!cpu_isset(cpu, f->flush_cpumask)) | |
30568 | + goto out; | |
30569 | + /* | |
30570 | + * This was a BUG() but until someone can quote me the | |
30571 | + * line from the intel manual that guarantees an IPI to | |
30572 | + * multiple CPUs is retried _only_ on the erroring CPUs | |
30573 | + * its staying as a return | |
30574 | + * | |
30575 | + * BUG(); | |
30576 | + */ | |
30577 | + | |
30578 | + if (f->flush_mm == read_pda(active_mm)) { | |
30579 | + if (read_pda(mmu_state) == TLBSTATE_OK) { | |
30580 | + if (f->flush_va == FLUSH_ALL) | |
30581 | + local_flush_tlb(); | |
30582 | + else | |
30583 | + __flush_tlb_one(f->flush_va); | |
30584 | + } else | |
30585 | + leave_mm(cpu); | |
30586 | + } | |
30587 | +out: | |
30588 | + ack_APIC_irq(); | |
30589 | + cpu_clear(cpu, f->flush_cpumask); | |
30590 | +} | |
30591 | + | |
30592 | +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | |
30593 | + unsigned long va) | |
30594 | +{ | |
30595 | + int sender; | |
30596 | + union smp_flush_state *f; | |
30597 | + | |
30598 | + /* Caller has disabled preemption */ | |
30599 | + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | |
30600 | + f = &per_cpu(flush_state, sender); | |
30601 | + | |
30602 | + /* Could avoid this lock when | |
30603 | + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | |
30604 | + probably not worth checking this for a cache-hot lock. */ | |
30605 | + spin_lock(&f->tlbstate_lock); | |
30606 | + | |
30607 | + f->flush_mm = mm; | |
30608 | + f->flush_va = va; | |
30609 | + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); | |
30610 | + | |
30611 | + /* | |
30612 | + * We have to send the IPI only to | |
30613 | + * CPUs affected. | |
30614 | + */ | |
30615 | + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); | |
30616 | + | |
30617 | + while (!cpus_empty(f->flush_cpumask)) | |
30618 | + cpu_relax(); | |
30619 | + | |
30620 | + f->flush_mm = NULL; | |
30621 | + f->flush_va = 0; | |
30622 | + spin_unlock(&f->tlbstate_lock); | |
30623 | +} | |
30624 | + | |
30625 | +int __cpuinit init_smp_flush(void) | |
30626 | +{ | |
30627 | + int i; | |
30628 | + for_each_cpu_mask(i, cpu_possible_map) { | |
30629 | + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); | |
30630 | + } | |
30631 | + return 0; | |
30632 | +} | |
30633 | + | |
30634 | +core_initcall(init_smp_flush); | |
30635 | + | |
30636 | +void flush_tlb_current_task(void) | |
30637 | +{ | |
30638 | + struct mm_struct *mm = current->mm; | |
30639 | + cpumask_t cpu_mask; | |
30640 | + | |
30641 | + preempt_disable(); | |
30642 | + cpu_mask = mm->cpu_vm_mask; | |
30643 | + cpu_clear(smp_processor_id(), cpu_mask); | |
30644 | + | |
30645 | + local_flush_tlb(); | |
30646 | + if (!cpus_empty(cpu_mask)) | |
30647 | + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
30648 | + preempt_enable(); | |
30649 | +} | |
30650 | +EXPORT_SYMBOL(flush_tlb_current_task); | |
30651 | + | |
30652 | +void flush_tlb_mm (struct mm_struct * mm) | |
30653 | +{ | |
30654 | + cpumask_t cpu_mask; | |
30655 | + | |
30656 | + preempt_disable(); | |
30657 | + cpu_mask = mm->cpu_vm_mask; | |
30658 | + cpu_clear(smp_processor_id(), cpu_mask); | |
30659 | + | |
30660 | + if (current->active_mm == mm) { | |
30661 | + if (current->mm) | |
30662 | + local_flush_tlb(); | |
30663 | + else | |
30664 | + leave_mm(smp_processor_id()); | |
30665 | + } | |
30666 | + if (!cpus_empty(cpu_mask)) | |
30667 | + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | |
30668 | + | |
30669 | + preempt_enable(); | |
30670 | +} | |
30671 | +EXPORT_SYMBOL(flush_tlb_mm); | |
30672 | + | |
30673 | +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | |
30674 | +{ | |
30675 | + struct mm_struct *mm = vma->vm_mm; | |
30676 | + cpumask_t cpu_mask; | |
30677 | + | |
30678 | + preempt_disable(); | |
30679 | + cpu_mask = mm->cpu_vm_mask; | |
30680 | + cpu_clear(smp_processor_id(), cpu_mask); | |
30681 | + | |
30682 | + if (current->active_mm == mm) { | |
30683 | + if(current->mm) | |
30684 | + __flush_tlb_one(va); | |
30685 | + else | |
30686 | + leave_mm(smp_processor_id()); | |
30687 | + } | |
30688 | + | |
30689 | + if (!cpus_empty(cpu_mask)) | |
30690 | + flush_tlb_others(cpu_mask, mm, va); | |
30691 | + | |
30692 | + preempt_enable(); | |
30693 | +} | |
30694 | +EXPORT_SYMBOL(flush_tlb_page); | |
30695 | + | |
30696 | +static void do_flush_tlb_all(void* info) | |
30697 | +{ | |
30698 | + unsigned long cpu = smp_processor_id(); | |
30699 | + | |
30700 | + __flush_tlb_all(); | |
30701 | + if (read_pda(mmu_state) == TLBSTATE_LAZY) | |
30702 | + leave_mm(cpu); | |
30703 | +} | |
30704 | + | |
30705 | +void flush_tlb_all(void) | |
30706 | +{ | |
30707 | + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | |
30708 | +} | |
30709 | +#endif /* Xen */ | |
30710 | + | |
30711 | +/* | |
30712 | + * this function sends a 'reschedule' IPI to another CPU. | |
30713 | + * it goes straight through and wastes no time serializing | |
30714 | + * anything. Worst case is that we lose a reschedule ... | |
30715 | + */ | |
30716 | + | |
30717 | +void smp_send_reschedule(int cpu) | |
30718 | +{ | |
30719 | + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | |
30720 | +} | |
30721 | + | |
30722 | +/* | |
30723 | + * Structure and data for smp_call_function(). This is designed to minimise | |
30724 | + * static memory requirements. It also looks cleaner. | |
30725 | + */ | |
30726 | +static DEFINE_SPINLOCK(call_lock); | |
30727 | + | |
30728 | +struct call_data_struct { | |
30729 | + void (*func) (void *info); | |
30730 | + void *info; | |
30731 | + atomic_t started; | |
30732 | + atomic_t finished; | |
30733 | + int wait; | |
30734 | +}; | |
30735 | + | |
30736 | +static struct call_data_struct * call_data; | |
30737 | + | |
30738 | +void lock_ipi_call_lock(void) | |
30739 | +{ | |
30740 | + spin_lock_irq(&call_lock); | |
30741 | +} | |
30742 | + | |
30743 | +void unlock_ipi_call_lock(void) | |
30744 | +{ | |
30745 | + spin_unlock_irq(&call_lock); | |
30746 | +} | |
30747 | + | |
30748 | +/* | |
30749 | + * this function sends a 'generic call function' IPI to one other CPU | |
30750 | + * in the system. | |
30751 | + * | |
30752 | + * cpu is a standard Linux logical CPU number. | |
30753 | + */ | |
30754 | +static void | |
30755 | +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, | |
30756 | + int nonatomic, int wait) | |
30757 | +{ | |
30758 | + struct call_data_struct data; | |
30759 | + int cpus = 1; | |
30760 | + | |
30761 | + data.func = func; | |
30762 | + data.info = info; | |
30763 | + atomic_set(&data.started, 0); | |
30764 | + data.wait = wait; | |
30765 | + if (wait) | |
30766 | + atomic_set(&data.finished, 0); | |
30767 | + | |
30768 | + call_data = &data; | |
30769 | + wmb(); | |
30770 | + /* Send a message to all other CPUs and wait for them to respond */ | |
30771 | + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); | |
30772 | + | |
30773 | + /* Wait for response */ | |
30774 | + while (atomic_read(&data.started) != cpus) | |
30775 | + cpu_relax(); | |
30776 | + | |
30777 | + if (!wait) | |
30778 | + return; | |
30779 | + | |
30780 | + while (atomic_read(&data.finished) != cpus) | |
30781 | + cpu_relax(); | |
30782 | +} | |
30783 | + | |
30784 | +/* | |
30785 | + * smp_call_function_single - Run a function on another CPU | |
30786 | + * @func: The function to run. This must be fast and non-blocking. | |
30787 | + * @info: An arbitrary pointer to pass to the function. | |
30788 | + * @nonatomic: Currently unused. | |
30789 | + * @wait: If true, wait until function has completed on other CPUs. | |
30790 | + * | |
30791 | + * Retrurns 0 on success, else a negative status code. | |
30792 | + * | |
30793 | + * Does not return until the remote CPU is nearly ready to execute <func> | |
30794 | + * or is or has executed. | |
30795 | + */ | |
30796 | + | |
30797 | +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, | |
30798 | + int nonatomic, int wait) | |
30799 | +{ | |
30800 | + /* prevent preemption and reschedule on another processor */ | |
30801 | + int me = get_cpu(); | |
30802 | + if (cpu == me) { | |
30803 | + WARN_ON(1); | |
30804 | + put_cpu(); | |
30805 | + return -EBUSY; | |
30806 | + } | |
30807 | + spin_lock_bh(&call_lock); | |
30808 | + __smp_call_function_single(cpu, func, info, nonatomic, wait); | |
30809 | + spin_unlock_bh(&call_lock); | |
30810 | + put_cpu(); | |
30811 | + return 0; | |
30812 | +} | |
30813 | + | |
30814 | +/* | |
30815 | + * this function sends a 'generic call function' IPI to all other CPUs | |
30816 | + * in the system. | |
30817 | + */ | |
30818 | +static void __smp_call_function (void (*func) (void *info), void *info, | |
30819 | + int nonatomic, int wait) | |
30820 | +{ | |
30821 | + struct call_data_struct data; | |
30822 | + int cpus = num_online_cpus()-1; | |
30823 | + | |
30824 | + if (!cpus) | |
30825 | + return; | |
30826 | + | |
30827 | + data.func = func; | |
30828 | + data.info = info; | |
30829 | + atomic_set(&data.started, 0); | |
30830 | + data.wait = wait; | |
30831 | + if (wait) | |
30832 | + atomic_set(&data.finished, 0); | |
30833 | + | |
30834 | + call_data = &data; | |
30835 | + wmb(); | |
30836 | + /* Send a message to all other CPUs and wait for them to respond */ | |
30837 | + send_IPI_allbutself(CALL_FUNCTION_VECTOR); | |
30838 | + | |
30839 | + /* Wait for response */ | |
30840 | + while (atomic_read(&data.started) != cpus) | |
30841 | + cpu_relax(); | |
30842 | + | |
30843 | + if (!wait) | |
30844 | + return; | |
30845 | + | |
30846 | + while (atomic_read(&data.finished) != cpus) | |
30847 | + cpu_relax(); | |
30848 | +} | |
30849 | + | |
30850 | +/* | |
30851 | + * smp_call_function - run a function on all other CPUs. | |
30852 | + * @func: The function to run. This must be fast and non-blocking. | |
30853 | + * @info: An arbitrary pointer to pass to the function. | |
30854 | + * @nonatomic: currently unused. | |
30855 | + * @wait: If true, wait (atomically) until function has completed on other | |
30856 | + * CPUs. | |
30857 | + * | |
30858 | + * Returns 0 on success, else a negative status code. Does not return until | |
30859 | + * remote CPUs are nearly ready to execute func or are or have executed. | |
30860 | + * | |
30861 | + * You must not call this function with disabled interrupts or from a | |
30862 | + * hardware interrupt handler or from a bottom half handler. | |
30863 | + * Actually there are a few legal cases, like panic. | |
30864 | + */ | |
30865 | +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | |
30866 | + int wait) | |
30867 | +{ | |
30868 | + spin_lock(&call_lock); | |
30869 | + __smp_call_function(func,info,nonatomic,wait); | |
30870 | + spin_unlock(&call_lock); | |
30871 | + return 0; | |
30872 | +} | |
30873 | +EXPORT_SYMBOL(smp_call_function); | |
30874 | + | |
30875 | +void smp_stop_cpu(void) | |
30876 | +{ | |
30877 | + unsigned long flags; | |
30878 | + /* | |
30879 | + * Remove this CPU: | |
30880 | + */ | |
30881 | + cpu_clear(smp_processor_id(), cpu_online_map); | |
30882 | + local_irq_save(flags); | |
30883 | + disable_all_local_evtchn(); | |
30884 | + local_irq_restore(flags); | |
30885 | +} | |
30886 | + | |
30887 | +static void smp_really_stop_cpu(void *dummy) | |
30888 | +{ | |
30889 | + smp_stop_cpu(); | |
30890 | + for (;;) | |
30891 | + halt(); | |
30892 | +} | |
30893 | + | |
30894 | +void smp_send_stop(void) | |
30895 | +{ | |
30896 | + int nolock = 0; | |
30897 | +#ifndef CONFIG_XEN | |
30898 | + if (reboot_force) | |
30899 | + return; | |
30900 | +#endif | |
30901 | + /* Don't deadlock on the call lock in panic */ | |
30902 | + if (!spin_trylock(&call_lock)) { | |
30903 | + /* ignore locking because we have panicked anyways */ | |
30904 | + nolock = 1; | |
30905 | + } | |
30906 | + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); | |
30907 | + if (!nolock) | |
30908 | + spin_unlock(&call_lock); | |
30909 | + | |
30910 | + local_irq_disable(); | |
30911 | + disable_all_local_evtchn(); | |
30912 | + local_irq_enable(); | |
30913 | +} | |
30914 | + | |
30915 | +/* | |
30916 | + * Reschedule call back. Nothing to do, | |
30917 | + * all the work is done automatically when | |
30918 | + * we return from the interrupt. | |
30919 | + */ | |
30920 | +#ifndef CONFIG_XEN | |
30921 | +asmlinkage void smp_reschedule_interrupt(void) | |
30922 | +#else | |
30923 | +asmlinkage irqreturn_t smp_reschedule_interrupt(void) | |
30924 | +#endif | |
30925 | +{ | |
30926 | +#ifndef CONFIG_XEN | |
30927 | + ack_APIC_irq(); | |
30928 | +#else | |
30929 | + return IRQ_HANDLED; | |
30930 | +#endif | |
30931 | +} | |
30932 | + | |
30933 | +#ifndef CONFIG_XEN | |
30934 | +asmlinkage void smp_call_function_interrupt(void) | |
30935 | +#else | |
30936 | +asmlinkage irqreturn_t smp_call_function_interrupt(void) | |
30937 | +#endif | |
30938 | +{ | |
30939 | + void (*func) (void *info) = call_data->func; | |
30940 | + void *info = call_data->info; | |
30941 | + int wait = call_data->wait; | |
30942 | + | |
30943 | +#ifndef CONFIG_XEN | |
30944 | + ack_APIC_irq(); | |
30945 | +#endif | |
30946 | + /* | |
30947 | + * Notify initiating CPU that I've grabbed the data and am | |
30948 | + * about to execute the function | |
30949 | + */ | |
30950 | + mb(); | |
30951 | + atomic_inc(&call_data->started); | |
30952 | + /* | |
30953 | + * At this point the info structure may be out of scope unless wait==1 | |
30954 | + */ | |
30955 | + exit_idle(); | |
30956 | + irq_enter(); | |
30957 | + (*func)(info); | |
30958 | + irq_exit(); | |
30959 | + if (wait) { | |
30960 | + mb(); | |
30961 | + atomic_inc(&call_data->finished); | |
30962 | + } | |
30963 | +#ifdef CONFIG_XEN | |
30964 | + return IRQ_HANDLED; | |
30965 | +#endif | |
30966 | +} | |
30967 | + | |
30968 | +int safe_smp_processor_id(void) | |
30969 | +{ | |
30970 | +#ifdef CONFIG_XEN | |
30971 | + return smp_processor_id(); | |
30972 | +#else | |
30973 | + unsigned apicid, i; | |
30974 | + | |
30975 | + if (disable_apic) | |
30976 | + return 0; | |
30977 | + | |
30978 | + apicid = hard_smp_processor_id(); | |
30979 | + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) | |
30980 | + return apicid; | |
30981 | + | |
30982 | + for (i = 0; i < NR_CPUS; ++i) { | |
30983 | + if (x86_cpu_to_apicid[i] == apicid) | |
30984 | + return i; | |
30985 | + } | |
30986 | + | |
30987 | + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, | |
30988 | + * or called too early. Either way, we must be CPU 0. */ | |
30989 | + if (x86_cpu_to_apicid[0] == BAD_APICID) | |
30990 | + return 0; | |
30991 | + | |
30992 | + return 0; /* Should not happen */ | |
30993 | +#endif | |
30994 | +} | |
30995 | Index: head-2008-11-25/arch/x86/kernel/traps_64-xen.c | |
30996 | =================================================================== | |
30997 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
30998 | +++ head-2008-11-25/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200 | |
30999 | @@ -0,0 +1,1173 @@ | |
31000 | +/* | |
31001 | + * linux/arch/x86-64/traps.c | |
31002 | + * | |
31003 | + * Copyright (C) 1991, 1992 Linus Torvalds | |
31004 | + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | |
31005 | + * | |
31006 | + * Pentium III FXSR, SSE support | |
31007 | + * Gareth Hughes <gareth@valinux.com>, May 2000 | |
31008 | + */ | |
31009 | + | |
31010 | +/* | |
31011 | + * 'Traps.c' handles hardware traps and faults after we have saved some | |
31012 | + * state in 'entry.S'. | |
31013 | + */ | |
31014 | +#include <linux/sched.h> | |
31015 | +#include <linux/kernel.h> | |
31016 | +#include <linux/string.h> | |
31017 | +#include <linux/errno.h> | |
31018 | +#include <linux/ptrace.h> | |
31019 | +#include <linux/timer.h> | |
31020 | +#include <linux/mm.h> | |
31021 | +#include <linux/init.h> | |
31022 | +#include <linux/delay.h> | |
31023 | +#include <linux/spinlock.h> | |
31024 | +#include <linux/interrupt.h> | |
31025 | +#include <linux/module.h> | |
31026 | +#include <linux/moduleparam.h> | |
31027 | +#include <linux/nmi.h> | |
31028 | +#include <linux/kprobes.h> | |
31029 | +#include <linux/kexec.h> | |
31030 | +#include <linux/unwind.h> | |
31031 | + | |
31032 | +#include <asm/system.h> | |
31033 | +#include <asm/uaccess.h> | |
31034 | +#include <asm/io.h> | |
31035 | +#include <asm/atomic.h> | |
31036 | +#include <asm/debugreg.h> | |
31037 | +#include <asm/desc.h> | |
31038 | +#include <asm/i387.h> | |
31039 | +#include <asm/kdebug.h> | |
31040 | +#include <asm/processor.h> | |
31041 | +#include <asm/unwind.h> | |
31042 | +#include <asm/smp.h> | |
31043 | +#include <asm/pgalloc.h> | |
31044 | +#include <asm/pda.h> | |
31045 | +#include <asm/proto.h> | |
31046 | +#include <asm/nmi.h> | |
31047 | + | |
31048 | +asmlinkage void divide_error(void); | |
31049 | +asmlinkage void debug(void); | |
31050 | +asmlinkage void nmi(void); | |
31051 | +asmlinkage void int3(void); | |
31052 | +asmlinkage void overflow(void); | |
31053 | +asmlinkage void bounds(void); | |
31054 | +asmlinkage void invalid_op(void); | |
31055 | +asmlinkage void device_not_available(void); | |
31056 | +asmlinkage void double_fault(void); | |
31057 | +asmlinkage void coprocessor_segment_overrun(void); | |
31058 | +asmlinkage void invalid_TSS(void); | |
31059 | +asmlinkage void segment_not_present(void); | |
31060 | +asmlinkage void stack_segment(void); | |
31061 | +asmlinkage void general_protection(void); | |
31062 | +asmlinkage void page_fault(void); | |
31063 | +asmlinkage void coprocessor_error(void); | |
31064 | +asmlinkage void simd_coprocessor_error(void); | |
31065 | +asmlinkage void reserved(void); | |
31066 | +asmlinkage void alignment_check(void); | |
31067 | +asmlinkage void machine_check(void); | |
31068 | +asmlinkage void spurious_interrupt_bug(void); | |
31069 | + | |
31070 | +ATOMIC_NOTIFIER_HEAD(die_chain); | |
31071 | +EXPORT_SYMBOL(die_chain); | |
31072 | + | |
31073 | +int register_die_notifier(struct notifier_block *nb) | |
31074 | +{ | |
31075 | + vmalloc_sync_all(); | |
31076 | + return atomic_notifier_chain_register(&die_chain, nb); | |
31077 | +} | |
31078 | +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ | |
31079 | + | |
31080 | +int unregister_die_notifier(struct notifier_block *nb) | |
31081 | +{ | |
31082 | + return atomic_notifier_chain_unregister(&die_chain, nb); | |
31083 | +} | |
31084 | +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ | |
31085 | + | |
31086 | +static inline void conditional_sti(struct pt_regs *regs) | |
31087 | +{ | |
31088 | + if (regs->eflags & X86_EFLAGS_IF) | |
31089 | + local_irq_enable(); | |
31090 | +} | |
31091 | + | |
31092 | +static inline void preempt_conditional_sti(struct pt_regs *regs) | |
31093 | +{ | |
31094 | + preempt_disable(); | |
31095 | + if (regs->eflags & X86_EFLAGS_IF) | |
31096 | + local_irq_enable(); | |
31097 | +} | |
31098 | + | |
31099 | +static inline void preempt_conditional_cli(struct pt_regs *regs) | |
31100 | +{ | |
31101 | + if (regs->eflags & X86_EFLAGS_IF) | |
31102 | + local_irq_disable(); | |
31103 | + /* Make sure to not schedule here because we could be running | |
31104 | + on an exception stack. */ | |
31105 | + preempt_enable_no_resched(); | |
31106 | +} | |
31107 | + | |
31108 | +static int kstack_depth_to_print = 12; | |
31109 | +#ifdef CONFIG_STACK_UNWIND | |
31110 | +static int call_trace = 1; | |
31111 | +#else | |
31112 | +#define call_trace (-1) | |
31113 | +#endif | |
31114 | + | |
31115 | +#ifdef CONFIG_KALLSYMS | |
31116 | +# include <linux/kallsyms.h> | |
31117 | +void printk_address(unsigned long address) | |
31118 | +{ | |
31119 | + unsigned long offset = 0, symsize; | |
31120 | + const char *symname; | |
31121 | + char *modname; | |
31122 | + char *delim = ":"; | |
31123 | + char namebuf[128]; | |
31124 | + | |
31125 | + symname = kallsyms_lookup(address, &symsize, &offset, | |
31126 | + &modname, namebuf); | |
31127 | + if (!symname) { | |
31128 | + printk(" [<%016lx>]\n", address); | |
31129 | + return; | |
31130 | + } | |
31131 | + if (!modname) | |
31132 | + modname = delim = ""; | |
31133 | + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", | |
31134 | + address, delim, modname, delim, symname, offset, symsize); | |
31135 | +} | |
31136 | +#else | |
31137 | +void printk_address(unsigned long address) | |
31138 | +{ | |
31139 | + printk(" [<%016lx>]\n", address); | |
31140 | +} | |
31141 | +#endif | |
31142 | + | |
31143 | +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |
31144 | + unsigned *usedp, const char **idp) | |
31145 | +{ | |
31146 | +#ifndef CONFIG_X86_NO_TSS | |
31147 | + static char ids[][8] = { | |
31148 | + [DEBUG_STACK - 1] = "#DB", | |
31149 | + [NMI_STACK - 1] = "NMI", | |
31150 | + [DOUBLEFAULT_STACK - 1] = "#DF", | |
31151 | + [STACKFAULT_STACK - 1] = "#SS", | |
31152 | + [MCE_STACK - 1] = "#MC", | |
31153 | +#if DEBUG_STKSZ > EXCEPTION_STKSZ | |
31154 | + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | |
31155 | +#endif | |
31156 | + }; | |
31157 | + unsigned k; | |
31158 | + | |
31159 | + /* | |
31160 | + * Iterate over all exception stacks, and figure out whether | |
31161 | + * 'stack' is in one of them: | |
31162 | + */ | |
31163 | + for (k = 0; k < N_EXCEPTION_STACKS; k++) { | |
31164 | + unsigned long end; | |
31165 | + | |
31166 | + /* | |
31167 | + * set 'end' to the end of the exception stack. | |
31168 | + */ | |
31169 | + switch (k + 1) { | |
31170 | + /* | |
31171 | + * TODO: this block is not needed i think, because | |
31172 | + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] | |
31173 | + * properly too. | |
31174 | + */ | |
31175 | +#if DEBUG_STKSZ > EXCEPTION_STKSZ | |
31176 | + case DEBUG_STACK: | |
31177 | + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; | |
31178 | + break; | |
31179 | +#endif | |
31180 | + default: | |
31181 | + end = per_cpu(orig_ist, cpu).ist[k]; | |
31182 | + break; | |
31183 | + } | |
31184 | + /* | |
31185 | + * Is 'stack' above this exception frame's end? | |
31186 | + * If yes then skip to the next frame. | |
31187 | + */ | |
31188 | + if (stack >= end) | |
31189 | + continue; | |
31190 | + /* | |
31191 | + * Is 'stack' above this exception frame's start address? | |
31192 | + * If yes then we found the right frame. | |
31193 | + */ | |
31194 | + if (stack >= end - EXCEPTION_STKSZ) { | |
31195 | + /* | |
31196 | + * Make sure we only iterate through an exception | |
31197 | + * stack once. If it comes up for the second time | |
31198 | + * then there's something wrong going on - just | |
31199 | + * break out and return NULL: | |
31200 | + */ | |
31201 | + if (*usedp & (1U << k)) | |
31202 | + break; | |
31203 | + *usedp |= 1U << k; | |
31204 | + *idp = ids[k]; | |
31205 | + return (unsigned long *)end; | |
31206 | + } | |
31207 | + /* | |
31208 | + * If this is a debug stack, and if it has a larger size than | |
31209 | + * the usual exception stacks, then 'stack' might still | |
31210 | + * be within the lower portion of the debug stack: | |
31211 | + */ | |
31212 | +#if DEBUG_STKSZ > EXCEPTION_STKSZ | |
31213 | + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | |
31214 | + unsigned j = N_EXCEPTION_STACKS - 1; | |
31215 | + | |
31216 | + /* | |
31217 | + * Black magic. A large debug stack is composed of | |
31218 | + * multiple exception stack entries, which we | |
31219 | + * iterate through now. Dont look: | |
31220 | + */ | |
31221 | + do { | |
31222 | + ++j; | |
31223 | + end -= EXCEPTION_STKSZ; | |
31224 | + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | |
31225 | + } while (stack < end - EXCEPTION_STKSZ); | |
31226 | + if (*usedp & (1U << j)) | |
31227 | + break; | |
31228 | + *usedp |= 1U << j; | |
31229 | + *idp = ids[j]; | |
31230 | + return (unsigned long *)end; | |
31231 | + } | |
31232 | +#endif | |
31233 | + } | |
31234 | +#endif | |
31235 | + return NULL; | |
31236 | +} | |
31237 | + | |
31238 | +static int show_trace_unwind(struct unwind_frame_info *info, void *context) | |
31239 | +{ | |
31240 | + int n = 0; | |
31241 | + | |
31242 | + while (unwind(info) == 0 && UNW_PC(info)) { | |
31243 | + n++; | |
31244 | + printk_address(UNW_PC(info)); | |
31245 | + if (arch_unw_user_mode(info)) | |
31246 | + break; | |
31247 | + } | |
31248 | + return n; | |
31249 | +} | |
31250 | + | |
31251 | +/* | |
31252 | + * x86-64 can have upto three kernel stacks: | |
31253 | + * process stack | |
31254 | + * interrupt stack | |
31255 | + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | |
31256 | + */ | |
31257 | + | |
31258 | +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) | |
31259 | +{ | |
31260 | + const unsigned cpu = safe_smp_processor_id(); | |
31261 | + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; | |
31262 | + unsigned used = 0; | |
31263 | + | |
31264 | + printk("\nCall Trace:\n"); | |
31265 | + | |
31266 | + if (!tsk) | |
31267 | + tsk = current; | |
31268 | + | |
31269 | + if (call_trace >= 0) { | |
31270 | + int unw_ret = 0; | |
31271 | + struct unwind_frame_info info; | |
31272 | + | |
31273 | + if (regs) { | |
31274 | + if (unwind_init_frame_info(&info, tsk, regs) == 0) | |
31275 | + unw_ret = show_trace_unwind(&info, NULL); | |
31276 | + } else if (tsk == current) | |
31277 | + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); | |
31278 | + else { | |
31279 | + if (unwind_init_blocked(&info, tsk) == 0) | |
31280 | + unw_ret = show_trace_unwind(&info, NULL); | |
31281 | + } | |
31282 | + if (unw_ret > 0) { | |
31283 | + if (call_trace == 1 && !arch_unw_user_mode(&info)) { | |
31284 | + print_symbol("DWARF2 unwinder stuck at %s\n", | |
31285 | + UNW_PC(&info)); | |
31286 | + if ((long)UNW_SP(&info) < 0) { | |
31287 | + printk("Leftover inexact backtrace:\n"); | |
31288 | + stack = (unsigned long *)UNW_SP(&info); | |
31289 | + } else | |
31290 | + printk("Full inexact backtrace again:\n"); | |
31291 | + } else if (call_trace >= 1) | |
31292 | + return; | |
31293 | + else | |
31294 | + printk("Full inexact backtrace again:\n"); | |
31295 | + } else | |
31296 | + printk("Inexact backtrace:\n"); | |
31297 | + } | |
31298 | + | |
31299 | + /* | |
31300 | + * Print function call entries within a stack. 'cond' is the | |
31301 | + * "end of stackframe" condition, that the 'stack++' | |
31302 | + * iteration will eventually trigger. | |
31303 | + */ | |
31304 | +#define HANDLE_STACK(cond) \ | |
31305 | + do while (cond) { \ | |
31306 | + unsigned long addr = *stack++; \ | |
31307 | + if (kernel_text_address(addr)) { \ | |
31308 | + /* \ | |
31309 | + * If the address is either in the text segment of the \ | |
31310 | + * kernel, or in the region which contains vmalloc'ed \ | |
31311 | + * memory, it *may* be the address of a calling \ | |
31312 | + * routine; if so, print it so that someone tracing \ | |
31313 | + * down the cause of the crash will be able to figure \ | |
31314 | + * out the call path that was taken. \ | |
31315 | + */ \ | |
31316 | + printk_address(addr); \ | |
31317 | + } \ | |
31318 | + } while (0) | |
31319 | + | |
31320 | + /* | |
31321 | + * Print function call entries in all stacks, starting at the | |
31322 | + * current stack address. If the stacks consist of nested | |
31323 | + * exceptions | |
31324 | + */ | |
31325 | + for ( ; ; ) { | |
31326 | + const char *id; | |
31327 | + unsigned long *estack_end; | |
31328 | + estack_end = in_exception_stack(cpu, (unsigned long)stack, | |
31329 | + &used, &id); | |
31330 | + | |
31331 | + if (estack_end) { | |
31332 | + printk(" <%s>", id); | |
31333 | + HANDLE_STACK (stack < estack_end); | |
31334 | + printk(" <EOE>"); | |
31335 | + /* | |
31336 | + * We link to the next stack via the | |
31337 | + * second-to-last pointer (index -2 to end) in the | |
31338 | + * exception stack: | |
31339 | + */ | |
31340 | + stack = (unsigned long *) estack_end[-2]; | |
31341 | + continue; | |
31342 | + } | |
31343 | + if (irqstack_end) { | |
31344 | + unsigned long *irqstack; | |
31345 | + irqstack = irqstack_end - | |
31346 | + (IRQSTACKSIZE - 64) / sizeof(*irqstack); | |
31347 | + | |
31348 | + if (stack >= irqstack && stack < irqstack_end) { | |
31349 | + printk(" <IRQ>"); | |
31350 | + HANDLE_STACK (stack < irqstack_end); | |
31351 | + /* | |
31352 | + * We link to the next stack (which would be | |
31353 | + * the process stack normally) the last | |
31354 | + * pointer (index -1 to end) in the IRQ stack: | |
31355 | + */ | |
31356 | + stack = (unsigned long *) (irqstack_end[-1]); | |
31357 | + irqstack_end = NULL; | |
31358 | + printk(" <EOI>"); | |
31359 | + continue; | |
31360 | + } | |
31361 | + } | |
31362 | + break; | |
31363 | + } | |
31364 | + | |
31365 | + /* | |
31366 | + * This prints the process stack: | |
31367 | + */ | |
31368 | + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); | |
31369 | +#undef HANDLE_STACK | |
31370 | + | |
31371 | + printk("\n"); | |
31372 | +} | |
31373 | + | |
31374 | +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) | |
31375 | +{ | |
31376 | + unsigned long *stack; | |
31377 | + int i; | |
31378 | + const int cpu = safe_smp_processor_id(); | |
31379 | + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); | |
31380 | + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | |
31381 | + | |
31382 | + // debugging aid: "show_stack(NULL, NULL);" prints the | |
31383 | + // back trace for this cpu. | |
31384 | + | |
31385 | + if (rsp == NULL) { | |
31386 | + if (tsk) | |
31387 | + rsp = (unsigned long *)tsk->thread.rsp; | |
31388 | + else | |
31389 | + rsp = (unsigned long *)&rsp; | |
31390 | + } | |
31391 | + | |
31392 | + stack = rsp; | |
31393 | + for(i=0; i < kstack_depth_to_print; i++) { | |
31394 | + if (stack >= irqstack && stack <= irqstack_end) { | |
31395 | + if (stack == irqstack_end) { | |
31396 | + stack = (unsigned long *) (irqstack_end[-1]); | |
31397 | + printk(" <EOI> "); | |
31398 | + } | |
31399 | + } else { | |
31400 | + if (((long) stack & (THREAD_SIZE-1)) == 0) | |
31401 | + break; | |
31402 | + } | |
31403 | + if (i && ((i % 4) == 0)) | |
31404 | + printk("\n"); | |
31405 | + printk(" %016lx", *stack++); | |
31406 | + touch_nmi_watchdog(); | |
31407 | + } | |
31408 | + show_trace(tsk, regs, rsp); | |
31409 | +} | |
31410 | + | |
31411 | +void show_stack(struct task_struct *tsk, unsigned long * rsp) | |
31412 | +{ | |
31413 | + _show_stack(tsk, NULL, rsp); | |
31414 | +} | |
31415 | + | |
31416 | +/* | |
31417 | + * The architecture-independent dump_stack generator | |
31418 | + */ | |
31419 | +void dump_stack(void) | |
31420 | +{ | |
31421 | + unsigned long dummy; | |
31422 | + show_trace(NULL, NULL, &dummy); | |
31423 | +} | |
31424 | + | |
31425 | +EXPORT_SYMBOL(dump_stack); | |
31426 | + | |
31427 | +void show_registers(struct pt_regs *regs) | |
31428 | +{ | |
31429 | + int i; | |
31430 | + int in_kernel = !user_mode(regs); | |
31431 | + unsigned long rsp; | |
31432 | + const int cpu = safe_smp_processor_id(); | |
31433 | + struct task_struct *cur = cpu_pda(cpu)->pcurrent; | |
31434 | + | |
31435 | + rsp = regs->rsp; | |
31436 | + | |
31437 | + printk("CPU %d ", cpu); | |
31438 | + __show_regs(regs); | |
31439 | + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | |
31440 | + cur->comm, cur->pid, task_thread_info(cur), cur); | |
31441 | + | |
31442 | + /* | |
31443 | + * When in-kernel, we also print out the stack and code at the | |
31444 | + * time of the fault.. | |
31445 | + */ | |
31446 | + if (in_kernel) { | |
31447 | + | |
31448 | + printk("Stack: "); | |
31449 | + _show_stack(NULL, regs, (unsigned long*)rsp); | |
31450 | + | |
31451 | + printk("\nCode: "); | |
31452 | + if (regs->rip < PAGE_OFFSET) | |
31453 | + goto bad; | |
31454 | + | |
31455 | + for (i=0; i<20; i++) { | |
31456 | + unsigned char c; | |
31457 | + if (__get_user(c, &((unsigned char*)regs->rip)[i])) { | |
31458 | +bad: | |
31459 | + printk(" Bad RIP value."); | |
31460 | + break; | |
31461 | + } | |
31462 | + printk("%02x ", c); | |
31463 | + } | |
31464 | + } | |
31465 | + printk("\n"); | |
31466 | +} | |
31467 | + | |
31468 | +void handle_BUG(struct pt_regs *regs) | |
31469 | +{ | |
31470 | + struct bug_frame f; | |
31471 | + long len; | |
31472 | + const char *prefix = ""; | |
31473 | + | |
31474 | + if (user_mode(regs)) | |
31475 | + return; | |
31476 | + if (__copy_from_user(&f, (const void __user *) regs->rip, | |
31477 | + sizeof(struct bug_frame))) | |
31478 | + return; | |
31479 | + if (f.filename >= 0 || | |
31480 | + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) | |
31481 | + return; | |
31482 | + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1; | |
31483 | + if (len < 0 || len >= PATH_MAX) | |
31484 | + f.filename = (int)(long)"unmapped filename"; | |
31485 | + else if (len > 50) { | |
31486 | + f.filename += len - 50; | |
31487 | + prefix = "..."; | |
31488 | + } | |
31489 | + printk("----------- [cut here ] --------- [please bite here ] ---------\n"); | |
31490 | + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line); | |
31491 | +} | |
31492 | + | |
31493 | +#ifdef CONFIG_BUG | |
31494 | +void out_of_line_bug(void) | |
31495 | +{ | |
31496 | + BUG(); | |
31497 | +} | |
31498 | +EXPORT_SYMBOL(out_of_line_bug); | |
31499 | +#endif | |
31500 | + | |
31501 | +static DEFINE_SPINLOCK(die_lock); | |
31502 | +static int die_owner = -1; | |
31503 | +static unsigned int die_nest_count; | |
31504 | + | |
31505 | +unsigned __kprobes long oops_begin(void) | |
31506 | +{ | |
31507 | + int cpu = safe_smp_processor_id(); | |
31508 | + unsigned long flags; | |
31509 | + | |
31510 | + /* racy, but better than risking deadlock. */ | |
31511 | + local_irq_save(flags); | |
31512 | + if (!spin_trylock(&die_lock)) { | |
31513 | + if (cpu == die_owner) | |
31514 | + /* nested oops. should stop eventually */; | |
31515 | + else | |
31516 | + spin_lock(&die_lock); | |
31517 | + } | |
31518 | + die_nest_count++; | |
31519 | + die_owner = cpu; | |
31520 | + console_verbose(); | |
31521 | + bust_spinlocks(1); | |
31522 | + return flags; | |
31523 | +} | |
31524 | + | |
31525 | +void __kprobes oops_end(unsigned long flags) | |
31526 | +{ | |
31527 | + die_owner = -1; | |
31528 | + bust_spinlocks(0); | |
31529 | + die_nest_count--; | |
31530 | + if (die_nest_count) | |
31531 | + /* We still own the lock */ | |
31532 | + local_irq_restore(flags); | |
31533 | + else | |
31534 | + /* Nest count reaches zero, release the lock. */ | |
31535 | + spin_unlock_irqrestore(&die_lock, flags); | |
31536 | + if (panic_on_oops) | |
31537 | + panic("Fatal exception"); | |
31538 | +} | |
31539 | + | |
31540 | +void __kprobes __die(const char * str, struct pt_regs * regs, long err) | |
31541 | +{ | |
31542 | + static int die_counter; | |
31543 | + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | |
31544 | +#ifdef CONFIG_PREEMPT | |
31545 | + printk("PREEMPT "); | |
31546 | +#endif | |
31547 | +#ifdef CONFIG_SMP | |
31548 | + printk("SMP "); | |
31549 | +#endif | |
31550 | +#ifdef CONFIG_DEBUG_PAGEALLOC | |
31551 | + printk("DEBUG_PAGEALLOC"); | |
31552 | +#endif | |
31553 | + printk("\n"); | |
31554 | + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); | |
31555 | + show_registers(regs); | |
31556 | + /* Executive summary in case the oops scrolled away */ | |
31557 | + printk(KERN_ALERT "RIP "); | |
31558 | + printk_address(regs->rip); | |
31559 | + printk(" RSP <%016lx>\n", regs->rsp); | |
31560 | + if (kexec_should_crash(current)) | |
31561 | + crash_kexec(regs); | |
31562 | +} | |
31563 | + | |
31564 | +void die(const char * str, struct pt_regs * regs, long err) | |
31565 | +{ | |
31566 | + unsigned long flags = oops_begin(); | |
31567 | + | |
31568 | + handle_BUG(regs); | |
31569 | + __die(str, regs, err); | |
31570 | + oops_end(flags); | |
31571 | + do_exit(SIGSEGV); | |
31572 | +} | |
31573 | + | |
31574 | +#ifdef CONFIG_X86_LOCAL_APIC | |
31575 | +void __kprobes die_nmi(char *str, struct pt_regs *regs) | |
31576 | +{ | |
31577 | + unsigned long flags = oops_begin(); | |
31578 | + | |
31579 | + /* | |
31580 | + * We are in trouble anyway, lets at least try | |
31581 | + * to get a message out. | |
31582 | + */ | |
31583 | + printk(str, safe_smp_processor_id()); | |
31584 | + show_registers(regs); | |
31585 | + if (kexec_should_crash(current)) | |
31586 | + crash_kexec(regs); | |
31587 | + if (panic_on_timeout || panic_on_oops) | |
31588 | + panic("nmi watchdog"); | |
31589 | + printk("console shuts up ...\n"); | |
31590 | + oops_end(flags); | |
31591 | + nmi_exit(); | |
31592 | + local_irq_enable(); | |
31593 | + do_exit(SIGSEGV); | |
31594 | +} | |
31595 | +#endif | |
31596 | + | |
31597 | +static void __kprobes do_trap(int trapnr, int signr, char *str, | |
31598 | + struct pt_regs * regs, long error_code, | |
31599 | + siginfo_t *info) | |
31600 | +{ | |
31601 | + struct task_struct *tsk = current; | |
31602 | + | |
31603 | + tsk->thread.error_code = error_code; | |
31604 | + tsk->thread.trap_no = trapnr; | |
31605 | + | |
31606 | + if (user_mode(regs)) { | |
31607 | + if (exception_trace && unhandled_signal(tsk, signr)) | |
31608 | + printk(KERN_INFO | |
31609 | + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | |
31610 | + tsk->comm, tsk->pid, str, | |
31611 | + regs->rip, regs->rsp, error_code); | |
31612 | + | |
31613 | + if (info) | |
31614 | + force_sig_info(signr, info, tsk); | |
31615 | + else | |
31616 | + force_sig(signr, tsk); | |
31617 | + return; | |
31618 | + } | |
31619 | + | |
31620 | + | |
31621 | + /* kernel trap */ | |
31622 | + { | |
31623 | + const struct exception_table_entry *fixup; | |
31624 | + fixup = search_exception_tables(regs->rip); | |
31625 | + if (fixup) | |
31626 | + regs->rip = fixup->fixup; | |
31627 | + else | |
31628 | + die(str, regs, error_code); | |
31629 | + return; | |
31630 | + } | |
31631 | +} | |
31632 | + | |
31633 | +#define DO_ERROR(trapnr, signr, str, name) \ | |
31634 | +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | |
31635 | +{ \ | |
31636 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
31637 | + == NOTIFY_STOP) \ | |
31638 | + return; \ | |
31639 | + conditional_sti(regs); \ | |
31640 | + do_trap(trapnr, signr, str, regs, error_code, NULL); \ | |
31641 | +} | |
31642 | + | |
31643 | +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | |
31644 | +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | |
31645 | +{ \ | |
31646 | + siginfo_t info; \ | |
31647 | + info.si_signo = signr; \ | |
31648 | + info.si_errno = 0; \ | |
31649 | + info.si_code = sicode; \ | |
31650 | + info.si_addr = (void __user *)siaddr; \ | |
31651 | + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | |
31652 | + == NOTIFY_STOP) \ | |
31653 | + return; \ | |
31654 | + conditional_sti(regs); \ | |
31655 | + do_trap(trapnr, signr, str, regs, error_code, &info); \ | |
31656 | +} | |
31657 | + | |
31658 | +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) | |
31659 | +DO_ERROR( 4, SIGSEGV, "overflow", overflow) | |
31660 | +DO_ERROR( 5, SIGSEGV, "bounds", bounds) | |
31661 | +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) | |
31662 | +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | |
31663 | +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | |
31664 | +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | |
31665 | +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | |
31666 | +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | |
31667 | +DO_ERROR(18, SIGSEGV, "reserved", reserved) | |
31668 | + | |
31669 | +/* Runs on IST stack */ | |
31670 | +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | |
31671 | +{ | |
31672 | + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | |
31673 | + 12, SIGBUS) == NOTIFY_STOP) | |
31674 | + return; | |
31675 | + preempt_conditional_sti(regs); | |
31676 | + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | |
31677 | + preempt_conditional_cli(regs); | |
31678 | +} | |
31679 | + | |
31680 | +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) | |
31681 | +{ | |
31682 | + static const char str[] = "double fault"; | |
31683 | + struct task_struct *tsk = current; | |
31684 | + | |
31685 | + /* Return not checked because double check cannot be ignored */ | |
31686 | + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | |
31687 | + | |
31688 | + tsk->thread.error_code = error_code; | |
31689 | + tsk->thread.trap_no = 8; | |
31690 | + | |
31691 | + /* This is always a kernel trap and never fixable (and thus must | |
31692 | + never return). */ | |
31693 | + for (;;) | |
31694 | + die(str, regs, error_code); | |
31695 | +} | |
31696 | + | |
31697 | +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, | |
31698 | + long error_code) | |
31699 | +{ | |
31700 | + struct task_struct *tsk = current; | |
31701 | + | |
31702 | + conditional_sti(regs); | |
31703 | + | |
31704 | + tsk->thread.error_code = error_code; | |
31705 | + tsk->thread.trap_no = 13; | |
31706 | + | |
31707 | + if (user_mode(regs)) { | |
31708 | + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) | |
31709 | + printk(KERN_INFO | |
31710 | + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | |
31711 | + tsk->comm, tsk->pid, | |
31712 | + regs->rip, regs->rsp, error_code); | |
31713 | + | |
31714 | + force_sig(SIGSEGV, tsk); | |
31715 | + return; | |
31716 | + } | |
31717 | + | |
31718 | + /* kernel gp */ | |
31719 | + { | |
31720 | + const struct exception_table_entry *fixup; | |
31721 | + fixup = search_exception_tables(regs->rip); | |
31722 | + if (fixup) { | |
31723 | + regs->rip = fixup->fixup; | |
31724 | + return; | |
31725 | + } | |
31726 | + if (notify_die(DIE_GPF, "general protection fault", regs, | |
31727 | + error_code, 13, SIGSEGV) == NOTIFY_STOP) | |
31728 | + return; | |
31729 | + die("general protection fault", regs, error_code); | |
31730 | + } | |
31731 | +} | |
31732 | + | |
31733 | +static __kprobes void | |
31734 | +mem_parity_error(unsigned char reason, struct pt_regs * regs) | |
31735 | +{ | |
31736 | + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); | |
31737 | + printk("You probably have a hardware problem with your RAM chips\n"); | |
31738 | + | |
31739 | +#if 0 /* XEN */ | |
31740 | + /* Clear and disable the memory parity error line. */ | |
31741 | + reason = (reason & 0xf) | 4; | |
31742 | + outb(reason, 0x61); | |
31743 | +#endif /* XEN */ | |
31744 | +} | |
31745 | + | |
31746 | +static __kprobes void | |
31747 | +io_check_error(unsigned char reason, struct pt_regs * regs) | |
31748 | +{ | |
31749 | + printk("NMI: IOCK error (debug interrupt?)\n"); | |
31750 | + show_registers(regs); | |
31751 | + | |
31752 | +#if 0 /* XEN */ | |
31753 | + /* Re-enable the IOCK line, wait for a few seconds */ | |
31754 | + reason = (reason & 0xf) | 8; | |
31755 | + outb(reason, 0x61); | |
31756 | + mdelay(2000); | |
31757 | + reason &= ~8; | |
31758 | + outb(reason, 0x61); | |
31759 | +#endif /* XEN */ | |
31760 | +} | |
31761 | + | |
31762 | +static __kprobes void | |
31763 | +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | |
31764 | +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); | |
31765 | + printk("Dazed and confused, but trying to continue\n"); | |
31766 | + printk("Do you have a strange power saving mode enabled?\n"); | |
31767 | +} | |
31768 | + | |
31769 | +/* Runs on IST stack. This code must keep interrupts off all the time. | |
31770 | + Nested NMIs are prevented by the CPU. */ | |
31771 | +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) | |
31772 | +{ | |
31773 | + unsigned char reason = 0; | |
31774 | + int cpu; | |
31775 | + | |
31776 | + cpu = smp_processor_id(); | |
31777 | + | |
31778 | + /* Only the BSP gets external NMIs from the system. */ | |
31779 | + if (!cpu) | |
31780 | + reason = get_nmi_reason(); | |
31781 | + | |
31782 | + if (!(reason & 0xc0)) { | |
31783 | + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | |
31784 | + == NOTIFY_STOP) | |
31785 | + return; | |
31786 | +#ifdef CONFIG_X86_LOCAL_APIC | |
31787 | + /* | |
31788 | + * Ok, so this is none of the documented NMI sources, | |
31789 | + * so it must be the NMI watchdog. | |
31790 | + */ | |
31791 | + if (nmi_watchdog > 0) { | |
31792 | + nmi_watchdog_tick(regs,reason); | |
31793 | + return; | |
31794 | + } | |
31795 | +#endif | |
31796 | + unknown_nmi_error(reason, regs); | |
31797 | + return; | |
31798 | + } | |
31799 | + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | |
31800 | + return; | |
31801 | + | |
31802 | + /* AK: following checks seem to be broken on modern chipsets. FIXME */ | |
31803 | + | |
31804 | + if (reason & 0x80) | |
31805 | + mem_parity_error(reason, regs); | |
31806 | + if (reason & 0x40) | |
31807 | + io_check_error(reason, regs); | |
31808 | +} | |
31809 | + | |
31810 | +/* runs on IST stack. */ | |
31811 | +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) | |
31812 | +{ | |
31813 | + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | |
31814 | + return; | |
31815 | + } | |
31816 | + preempt_conditional_sti(regs); | |
31817 | + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | |
31818 | + preempt_conditional_cli(regs); | |
31819 | +} | |
31820 | + | |
31821 | +/* Help handler running on IST stack to switch back to user stack | |
31822 | + for scheduling or signal handling. The actual stack switch is done in | |
31823 | + entry.S */ | |
31824 | +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | |
31825 | +{ | |
31826 | + struct pt_regs *regs = eregs; | |
31827 | + /* Did already sync */ | |
31828 | + if (eregs == (struct pt_regs *)eregs->rsp) | |
31829 | + ; | |
31830 | + /* Exception from user space */ | |
31831 | + else if (user_mode(eregs)) | |
31832 | + regs = task_pt_regs(current); | |
31833 | + /* Exception from kernel and interrupts are enabled. Move to | |
31834 | + kernel process stack. */ | |
31835 | + else if (eregs->eflags & X86_EFLAGS_IF) | |
31836 | + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); | |
31837 | + if (eregs != regs) | |
31838 | + *regs = *eregs; | |
31839 | + return regs; | |
31840 | +} | |
31841 | + | |
31842 | +/* runs on IST stack. */ | |
31843 | +asmlinkage void __kprobes do_debug(struct pt_regs * regs, | |
31844 | + unsigned long error_code) | |
31845 | +{ | |
31846 | + unsigned long condition; | |
31847 | + struct task_struct *tsk = current; | |
31848 | + siginfo_t info; | |
31849 | + | |
31850 | + get_debugreg(condition, 6); | |
31851 | + | |
31852 | + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | |
31853 | + SIGTRAP) == NOTIFY_STOP) | |
31854 | + return; | |
31855 | + | |
31856 | + preempt_conditional_sti(regs); | |
31857 | + | |
31858 | + /* Mask out spurious debug traps due to lazy DR7 setting */ | |
31859 | + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | |
31860 | + if (!tsk->thread.debugreg7) { | |
31861 | + goto clear_dr7; | |
31862 | + } | |
31863 | + } | |
31864 | + | |
31865 | + tsk->thread.debugreg6 = condition; | |
31866 | + | |
31867 | + /* Mask out spurious TF errors due to lazy TF clearing */ | |
31868 | + if (condition & DR_STEP) { | |
31869 | + /* | |
31870 | + * The TF error should be masked out only if the current | |
31871 | + * process is not traced and if the TRAP flag has been set | |
31872 | + * previously by a tracing process (condition detected by | |
31873 | + * the PT_DTRACE flag); remember that the i386 TRAP flag | |
31874 | + * can be modified by the process itself in user mode, | |
31875 | + * allowing programs to debug themselves without the ptrace() | |
31876 | + * interface. | |
31877 | + */ | |
31878 | + if (!user_mode(regs)) | |
31879 | + goto clear_TF_reenable; | |
31880 | + /* | |
31881 | + * Was the TF flag set by a debugger? If so, clear it now, | |
31882 | + * so that register information is correct. | |
31883 | + */ | |
31884 | + if (tsk->ptrace & PT_DTRACE) { | |
31885 | + regs->eflags &= ~TF_MASK; | |
31886 | + tsk->ptrace &= ~PT_DTRACE; | |
31887 | + } | |
31888 | + } | |
31889 | + | |
31890 | + /* Ok, finally something we can handle */ | |
31891 | + tsk->thread.trap_no = 1; | |
31892 | + tsk->thread.error_code = error_code; | |
31893 | + info.si_signo = SIGTRAP; | |
31894 | + info.si_errno = 0; | |
31895 | + info.si_code = TRAP_BRKPT; | |
31896 | + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; | |
31897 | + force_sig_info(SIGTRAP, &info, tsk); | |
31898 | + | |
31899 | +clear_dr7: | |
31900 | + set_debugreg(0UL, 7); | |
31901 | + preempt_conditional_cli(regs); | |
31902 | + return; | |
31903 | + | |
31904 | +clear_TF_reenable: | |
31905 | + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | |
31906 | + regs->eflags &= ~TF_MASK; | |
31907 | + preempt_conditional_cli(regs); | |
31908 | +} | |
31909 | + | |
31910 | +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | |
31911 | +{ | |
31912 | + const struct exception_table_entry *fixup; | |
31913 | + fixup = search_exception_tables(regs->rip); | |
31914 | + if (fixup) { | |
31915 | + regs->rip = fixup->fixup; | |
31916 | + return 1; | |
31917 | + } | |
31918 | + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | |
31919 | + /* Illegal floating point operation in the kernel */ | |
31920 | + current->thread.trap_no = trapnr; | |
31921 | + die(str, regs, 0); | |
31922 | + return 0; | |
31923 | +} | |
31924 | + | |
31925 | +/* | |
31926 | + * Note that we play around with the 'TS' bit in an attempt to get | |
31927 | + * the correct behaviour even in the presence of the asynchronous | |
31928 | + * IRQ13 behaviour | |
31929 | + */ | |
31930 | +asmlinkage void do_coprocessor_error(struct pt_regs *regs) | |
31931 | +{ | |
31932 | + void __user *rip = (void __user *)(regs->rip); | |
31933 | + struct task_struct * task; | |
31934 | + siginfo_t info; | |
31935 | + unsigned short cwd, swd; | |
31936 | + | |
31937 | + conditional_sti(regs); | |
31938 | + if (!user_mode(regs) && | |
31939 | + kernel_math_error(regs, "kernel x87 math error", 16)) | |
31940 | + return; | |
31941 | + | |
31942 | + /* | |
31943 | + * Save the info for the exception handler and clear the error. | |
31944 | + */ | |
31945 | + task = current; | |
31946 | + save_init_fpu(task); | |
31947 | + task->thread.trap_no = 16; | |
31948 | + task->thread.error_code = 0; | |
31949 | + info.si_signo = SIGFPE; | |
31950 | + info.si_errno = 0; | |
31951 | + info.si_code = __SI_FAULT; | |
31952 | + info.si_addr = rip; | |
31953 | + /* | |
31954 | + * (~cwd & swd) will mask out exceptions that are not set to unmasked | |
31955 | + * status. 0x3f is the exception bits in these regs, 0x200 is the | |
31956 | + * C1 reg you need in case of a stack fault, 0x040 is the stack | |
31957 | + * fault bit. We should only be taking one exception at a time, | |
31958 | + * so if this combination doesn't produce any single exception, | |
31959 | + * then we have a bad program that isn't synchronizing its FPU usage | |
31960 | + * and it will suffer the consequences since we won't be able to | |
31961 | + * fully reproduce the context of the exception | |
31962 | + */ | |
31963 | + cwd = get_fpu_cwd(task); | |
31964 | + swd = get_fpu_swd(task); | |
31965 | + switch (swd & ~cwd & 0x3f) { | |
31966 | + case 0x000: | |
31967 | + default: | |
31968 | + break; | |
31969 | + case 0x001: /* Invalid Op */ | |
31970 | + /* | |
31971 | + * swd & 0x240 == 0x040: Stack Underflow | |
31972 | + * swd & 0x240 == 0x240: Stack Overflow | |
31973 | + * User must clear the SF bit (0x40) if set | |
31974 | + */ | |
31975 | + info.si_code = FPE_FLTINV; | |
31976 | + break; | |
31977 | + case 0x002: /* Denormalize */ | |
31978 | + case 0x010: /* Underflow */ | |
31979 | + info.si_code = FPE_FLTUND; | |
31980 | + break; | |
31981 | + case 0x004: /* Zero Divide */ | |
31982 | + info.si_code = FPE_FLTDIV; | |
31983 | + break; | |
31984 | + case 0x008: /* Overflow */ | |
31985 | + info.si_code = FPE_FLTOVF; | |
31986 | + break; | |
31987 | + case 0x020: /* Precision */ | |
31988 | + info.si_code = FPE_FLTRES; | |
31989 | + break; | |
31990 | + } | |
31991 | + force_sig_info(SIGFPE, &info, task); | |
31992 | +} | |
31993 | + | |
31994 | +asmlinkage void bad_intr(void) | |
31995 | +{ | |
31996 | + printk("bad interrupt"); | |
31997 | +} | |
31998 | + | |
31999 | +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | |
32000 | +{ | |
32001 | + void __user *rip = (void __user *)(regs->rip); | |
32002 | + struct task_struct * task; | |
32003 | + siginfo_t info; | |
32004 | + unsigned short mxcsr; | |
32005 | + | |
32006 | + conditional_sti(regs); | |
32007 | + if (!user_mode(regs) && | |
32008 | + kernel_math_error(regs, "kernel simd math error", 19)) | |
32009 | + return; | |
32010 | + | |
32011 | + /* | |
32012 | + * Save the info for the exception handler and clear the error. | |
32013 | + */ | |
32014 | + task = current; | |
32015 | + save_init_fpu(task); | |
32016 | + task->thread.trap_no = 19; | |
32017 | + task->thread.error_code = 0; | |
32018 | + info.si_signo = SIGFPE; | |
32019 | + info.si_errno = 0; | |
32020 | + info.si_code = __SI_FAULT; | |
32021 | + info.si_addr = rip; | |
32022 | + /* | |
32023 | + * The SIMD FPU exceptions are handled a little differently, as there | |
32024 | + * is only a single status/control register. Thus, to determine which | |
32025 | + * unmasked exception was caught we must mask the exception mask bits | |
32026 | + * at 0x1f80, and then use these to mask the exception bits at 0x3f. | |
32027 | + */ | |
32028 | + mxcsr = get_fpu_mxcsr(task); | |
32029 | + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | |
32030 | + case 0x000: | |
32031 | + default: | |
32032 | + break; | |
32033 | + case 0x001: /* Invalid Op */ | |
32034 | + info.si_code = FPE_FLTINV; | |
32035 | + break; | |
32036 | + case 0x002: /* Denormalize */ | |
32037 | + case 0x010: /* Underflow */ | |
32038 | + info.si_code = FPE_FLTUND; | |
32039 | + break; | |
32040 | + case 0x004: /* Zero Divide */ | |
32041 | + info.si_code = FPE_FLTDIV; | |
32042 | + break; | |
32043 | + case 0x008: /* Overflow */ | |
32044 | + info.si_code = FPE_FLTOVF; | |
32045 | + break; | |
32046 | + case 0x020: /* Precision */ | |
32047 | + info.si_code = FPE_FLTRES; | |
32048 | + break; | |
32049 | + } | |
32050 | + force_sig_info(SIGFPE, &info, task); | |
32051 | +} | |
32052 | + | |
32053 | +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | |
32054 | +{ | |
32055 | +} | |
32056 | + | |
32057 | +#if 0 | |
32058 | +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | |
32059 | +{ | |
32060 | +} | |
32061 | +#endif | |
32062 | + | |
32063 | +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | |
32064 | +{ | |
32065 | +} | |
32066 | + | |
32067 | +/* | |
32068 | + * 'math_state_restore()' saves the current math information in the | |
32069 | + * old math state array, and gets the new ones from the current task | |
32070 | + * | |
32071 | + * Careful.. There are problems with IBM-designed IRQ13 behaviour. | |
32072 | + * Don't touch unless you *really* know how it works. | |
32073 | + */ | |
32074 | +asmlinkage void math_state_restore(void) | |
32075 | +{ | |
32076 | + struct task_struct *me = current; | |
32077 | + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */ | |
32078 | + | |
32079 | + if (!used_math()) | |
32080 | + init_fpu(me); | |
32081 | + restore_fpu_checking(&me->thread.i387.fxsave); | |
32082 | + task_thread_info(me)->status |= TS_USEDFPU; | |
32083 | +} | |
32084 | + | |
32085 | + | |
32086 | +/* | |
32087 | + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we | |
32088 | + * specify <dpl>|4 in the second field. | |
32089 | + */ | |
32090 | +static trap_info_t __cpuinitdata trap_table[] = { | |
32091 | + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error }, | |
32092 | + { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, | |
32093 | + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, | |
32094 | + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow }, | |
32095 | + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds }, | |
32096 | + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op }, | |
32097 | + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, | |
32098 | + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun}, | |
32099 | + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS }, | |
32100 | + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present }, | |
32101 | + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment }, | |
32102 | + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection }, | |
32103 | + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, | |
32104 | + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, | |
32105 | + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error }, | |
32106 | + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check }, | |
32107 | +#ifdef CONFIG_X86_MCE | |
32108 | + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check }, | |
32109 | +#endif | |
32110 | + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, | |
32111 | +#ifdef CONFIG_IA32_EMULATION | |
32112 | + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall}, | |
32113 | +#endif | |
32114 | + { 0, 0, 0, 0 } | |
32115 | +}; | |
32116 | + | |
32117 | +void __init trap_init(void) | |
32118 | +{ | |
32119 | + int ret; | |
32120 | + | |
32121 | + ret = HYPERVISOR_set_trap_table(trap_table); | |
32122 | + if (ret) | |
32123 | + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); | |
32124 | + | |
32125 | + /* | |
32126 | + * Should be a barrier for any external CPU state. | |
32127 | + */ | |
32128 | + cpu_init(); | |
32129 | +} | |
32130 | + | |
32131 | +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) | |
32132 | +{ | |
32133 | + const trap_info_t *t = trap_table; | |
32134 | + | |
32135 | + for (t = trap_table; t->address; t++) { | |
32136 | + trap_ctxt[t->vector].flags = t->flags; | |
32137 | + trap_ctxt[t->vector].cs = t->cs; | |
32138 | + trap_ctxt[t->vector].address = t->address; | |
32139 | + } | |
32140 | +} | |
32141 | + | |
32142 | + | |
32143 | +/* Actual parsing is done early in setup.c. */ | |
32144 | +static int __init oops_dummy(char *s) | |
32145 | +{ | |
32146 | + panic_on_oops = 1; | |
32147 | + return 1; | |
32148 | +} | |
32149 | +__setup("oops=", oops_dummy); | |
32150 | + | |
32151 | +static int __init kstack_setup(char *s) | |
32152 | +{ | |
32153 | + kstack_depth_to_print = simple_strtoul(s,NULL,0); | |
32154 | + return 1; | |
32155 | +} | |
32156 | +__setup("kstack=", kstack_setup); | |
32157 | + | |
32158 | +#ifdef CONFIG_STACK_UNWIND | |
32159 | +static int __init call_trace_setup(char *s) | |
32160 | +{ | |
32161 | + if (strcmp(s, "old") == 0) | |
32162 | + call_trace = -1; | |
32163 | + else if (strcmp(s, "both") == 0) | |
32164 | + call_trace = 0; | |
32165 | + else if (strcmp(s, "newfallback") == 0) | |
32166 | + call_trace = 1; | |
32167 | + else if (strcmp(s, "new") == 0) | |
32168 | + call_trace = 2; | |
32169 | + return 1; | |
32170 | +} | |
32171 | +__setup("call_trace=", call_trace_setup); | |
32172 | +#endif | |
32173 | Index: head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c | |
32174 | =================================================================== | |
32175 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
32176 | +++ head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200 | |
32177 | @@ -0,0 +1,227 @@ | |
32178 | +/* | |
32179 | + * linux/arch/x86_64/kernel/vsyscall.c | |
32180 | + * | |
32181 | + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | |
32182 | + * Copyright 2003 Andi Kleen, SuSE Labs. | |
32183 | + * | |
32184 | + * Thanks to hpa@transmeta.com for some useful hint. | |
32185 | + * Special thanks to Ingo Molnar for his early experience with | |
32186 | + * a different vsyscall implementation for Linux/IA32 and for the name. | |
32187 | + * | |
32188 | + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | |
32189 | + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 | |
32190 | + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | |
32191 | + * jumping out of line if necessary. We cannot add more with this | |
32192 | + * mechanism because older kernels won't return -ENOSYS. | |
32193 | + * If we want more than four we need a vDSO. | |
32194 | + * | |
32195 | + * Note: the concept clashes with user mode linux. If you use UML and | |
32196 | + * want per guest time just set the kernel.vsyscall64 sysctl to 0. | |
32197 | + */ | |
32198 | + | |
32199 | +#include <linux/time.h> | |
32200 | +#include <linux/init.h> | |
32201 | +#include <linux/kernel.h> | |
32202 | +#include <linux/timer.h> | |
32203 | +#include <linux/seqlock.h> | |
32204 | +#include <linux/jiffies.h> | |
32205 | +#include <linux/sysctl.h> | |
32206 | + | |
32207 | +#include <asm/vsyscall.h> | |
32208 | +#include <asm/pgtable.h> | |
32209 | +#include <asm/page.h> | |
32210 | +#include <asm/fixmap.h> | |
32211 | +#include <asm/errno.h> | |
32212 | +#include <asm/io.h> | |
32213 | + | |
32214 | +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | |
32215 | + | |
32216 | +int __sysctl_vsyscall __section_sysctl_vsyscall = 1; | |
32217 | +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; | |
32218 | + | |
32219 | +#include <asm/unistd.h> | |
32220 | + | |
32221 | +static __always_inline void timeval_normalize(struct timeval * tv) | |
32222 | +{ | |
32223 | + time_t __sec; | |
32224 | + | |
32225 | + __sec = tv->tv_usec / 1000000; | |
32226 | + if (__sec) { | |
32227 | + tv->tv_usec %= 1000000; | |
32228 | + tv->tv_sec += __sec; | |
32229 | + } | |
32230 | +} | |
32231 | + | |
32232 | +static __always_inline void do_vgettimeofday(struct timeval * tv) | |
32233 | +{ | |
32234 | + long sequence, t; | |
32235 | + unsigned long sec, usec; | |
32236 | + | |
32237 | + do { | |
32238 | + sequence = read_seqbegin(&__xtime_lock); | |
32239 | + | |
32240 | + sec = __xtime.tv_sec; | |
32241 | + usec = (__xtime.tv_nsec / 1000) + | |
32242 | + (__jiffies - __wall_jiffies) * (1000000 / HZ); | |
32243 | + | |
32244 | + if (__vxtime.mode != VXTIME_HPET) { | |
32245 | + t = get_cycles_sync(); | |
32246 | + if (t < __vxtime.last_tsc) | |
32247 | + t = __vxtime.last_tsc; | |
32248 | + usec += ((t - __vxtime.last_tsc) * | |
32249 | + __vxtime.tsc_quot) >> 32; | |
32250 | + /* See comment in x86_64 do_gettimeofday. */ | |
32251 | + } else { | |
32252 | + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - | |
32253 | + __vxtime.last) * __vxtime.quot) >> 32; | |
32254 | + } | |
32255 | + } while (read_seqretry(&__xtime_lock, sequence)); | |
32256 | + | |
32257 | + tv->tv_sec = sec + usec / 1000000; | |
32258 | + tv->tv_usec = usec % 1000000; | |
32259 | +} | |
32260 | + | |
32261 | +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ | |
32262 | +static __always_inline void do_get_tz(struct timezone * tz) | |
32263 | +{ | |
32264 | + *tz = __sys_tz; | |
32265 | +} | |
32266 | + | |
32267 | +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | |
32268 | +{ | |
32269 | + int ret; | |
32270 | + asm volatile("vsysc2: syscall" | |
32271 | + : "=a" (ret) | |
32272 | + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); | |
32273 | + return ret; | |
32274 | +} | |
32275 | + | |
32276 | +static __always_inline long time_syscall(long *t) | |
32277 | +{ | |
32278 | + long secs; | |
32279 | + asm volatile("vsysc1: syscall" | |
32280 | + : "=a" (secs) | |
32281 | + : "0" (__NR_time),"D" (t) : __syscall_clobber); | |
32282 | + return secs; | |
32283 | +} | |
32284 | + | |
32285 | +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | |
32286 | +{ | |
32287 | + if (!__sysctl_vsyscall) | |
32288 | + return gettimeofday(tv,tz); | |
32289 | + if (tv) | |
32290 | + do_vgettimeofday(tv); | |
32291 | + if (tz) | |
32292 | + do_get_tz(tz); | |
32293 | + return 0; | |
32294 | +} | |
32295 | + | |
32296 | +/* This will break when the xtime seconds get inaccurate, but that is | |
32297 | + * unlikely */ | |
32298 | +time_t __vsyscall(1) vtime(time_t *t) | |
32299 | +{ | |
32300 | + if (!__sysctl_vsyscall) | |
32301 | + return time_syscall(t); | |
32302 | + else if (t) | |
32303 | + *t = __xtime.tv_sec; | |
32304 | + return __xtime.tv_sec; | |
32305 | +} | |
32306 | + | |
32307 | +long __vsyscall(2) venosys_0(void) | |
32308 | +{ | |
32309 | + return -ENOSYS; | |
32310 | +} | |
32311 | + | |
32312 | +long __vsyscall(3) venosys_1(void) | |
32313 | +{ | |
32314 | + return -ENOSYS; | |
32315 | +} | |
32316 | + | |
32317 | +#ifdef CONFIG_SYSCTL | |
32318 | + | |
32319 | +#define SYSCALL 0x050f | |
32320 | +#define NOP2 0x9090 | |
32321 | + | |
32322 | +/* | |
32323 | + * NOP out syscall in vsyscall page when not needed. | |
32324 | + */ | |
32325 | +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | |
32326 | + void __user *buffer, size_t *lenp, loff_t *ppos) | |
32327 | +{ | |
32328 | + extern u16 vsysc1, vsysc2; | |
32329 | + u16 *map1, *map2; | |
32330 | + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | |
32331 | + if (!write) | |
32332 | + return ret; | |
32333 | + /* gcc has some trouble with __va(__pa()), so just do it this | |
32334 | + way. */ | |
32335 | + map1 = ioremap(__pa_symbol(&vsysc1), 2); | |
32336 | + if (!map1) | |
32337 | + return -ENOMEM; | |
32338 | + map2 = ioremap(__pa_symbol(&vsysc2), 2); | |
32339 | + if (!map2) { | |
32340 | + ret = -ENOMEM; | |
32341 | + goto out; | |
32342 | + } | |
32343 | + if (!sysctl_vsyscall) { | |
32344 | + *map1 = SYSCALL; | |
32345 | + *map2 = SYSCALL; | |
32346 | + } else { | |
32347 | + *map1 = NOP2; | |
32348 | + *map2 = NOP2; | |
32349 | + } | |
32350 | + iounmap(map2); | |
32351 | +out: | |
32352 | + iounmap(map1); | |
32353 | + return ret; | |
32354 | +} | |
32355 | + | |
32356 | +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, | |
32357 | + void __user *oldval, size_t __user *oldlenp, | |
32358 | + void __user *newval, size_t newlen, | |
32359 | + void **context) | |
32360 | +{ | |
32361 | + return -ENOSYS; | |
32362 | +} | |
32363 | + | |
32364 | +static ctl_table kernel_table2[] = { | |
32365 | + { .ctl_name = 99, .procname = "vsyscall64", | |
32366 | + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, | |
32367 | + .strategy = vsyscall_sysctl_nostrat, | |
32368 | + .proc_handler = vsyscall_sysctl_change }, | |
32369 | + { 0, } | |
32370 | +}; | |
32371 | + | |
32372 | +static ctl_table kernel_root_table2[] = { | |
32373 | + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, | |
32374 | + .child = kernel_table2 }, | |
32375 | + { 0 }, | |
32376 | +}; | |
32377 | + | |
32378 | +#endif | |
32379 | + | |
32380 | +static void __init map_vsyscall(void) | |
32381 | +{ | |
32382 | + extern char __vsyscall_0; | |
32383 | + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | |
32384 | + | |
32385 | + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | |
32386 | +} | |
32387 | + | |
32388 | +static int __init vsyscall_init(void) | |
32389 | +{ | |
32390 | + BUG_ON(((unsigned long) &vgettimeofday != | |
32391 | + VSYSCALL_ADDR(__NR_vgettimeofday))); | |
32392 | + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | |
32393 | + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | |
32394 | + map_vsyscall(); | |
32395 | +#ifdef CONFIG_XEN | |
32396 | + sysctl_vsyscall = 0; /* disable vgettimeofday() */ | |
32397 | +#endif | |
32398 | +#ifdef CONFIG_SYSCTL | |
32399 | + register_sysctl_table(kernel_root_table2, 0); | |
32400 | +#endif | |
32401 | + return 0; | |
32402 | +} | |
32403 | + | |
32404 | +__initcall(vsyscall_init); | |
32405 | Index: head-2008-11-25/arch/x86/kernel/xen_entry_64.S | |
32406 | =================================================================== | |
32407 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
32408 | +++ head-2008-11-25/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200 | |
32409 | @@ -0,0 +1,36 @@ | |
32410 | +/* | |
32411 | + * Copied from arch/xen/i386/kernel/entry.S | |
32412 | + */ | |
32413 | +/* Offsets into shared_info_t. */ | |
32414 | +#define evtchn_upcall_pending /* 0 */ | |
32415 | +#define evtchn_upcall_mask 1 | |
32416 | + | |
32417 | +#define sizeof_vcpu_shift 6 | |
32418 | + | |
32419 | +#ifdef CONFIG_SMP | |
32420 | +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) | |
32421 | +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) | |
32422 | +#define preempt_disable(reg) | |
32423 | +#define preempt_enable(reg) | |
32424 | +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ | |
32425 | + movq %gs:pda_cpunumber,reg ; \ | |
32426 | + shl $32, reg ; \ | |
32427 | + shr $32-sizeof_vcpu_shift,reg ; \ | |
32428 | + addq HYPERVISOR_shared_info,reg | |
32429 | +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ | |
32430 | +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff | |
32431 | +#else | |
32432 | +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg | |
32433 | +#define XEN_PUT_VCPU_INFO(reg) | |
32434 | +#define XEN_PUT_VCPU_INFO_fixup | |
32435 | +#endif | |
32436 | + | |
32437 | +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) | |
32438 | +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) | |
32439 | +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ | |
32440 | + XEN_LOCKED_BLOCK_EVENTS(reg) ; \ | |
32441 | + XEN_PUT_VCPU_INFO(reg) | |
32442 | +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ | |
32443 | + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ | |
32444 | + XEN_PUT_VCPU_INFO(reg) | |
32445 | +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) | |
32446 | Index: head-2008-11-25/arch/x86/mm/fault_64-xen.c | |
32447 | =================================================================== | |
32448 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
32449 | +++ head-2008-11-25/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100 | |
32450 | @@ -0,0 +1,724 @@ | |
32451 | +/* | |
32452 | + * linux/arch/x86-64/mm/fault.c | |
32453 | + * | |
32454 | + * Copyright (C) 1995 Linus Torvalds | |
32455 | + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | |
32456 | + */ | |
32457 | + | |
32458 | +#include <linux/signal.h> | |
32459 | +#include <linux/sched.h> | |
32460 | +#include <linux/kernel.h> | |
32461 | +#include <linux/errno.h> | |
32462 | +#include <linux/string.h> | |
32463 | +#include <linux/types.h> | |
32464 | +#include <linux/ptrace.h> | |
32465 | +#include <linux/mman.h> | |
32466 | +#include <linux/mm.h> | |
32467 | +#include <linux/smp.h> | |
32468 | +#include <linux/smp_lock.h> | |
32469 | +#include <linux/interrupt.h> | |
32470 | +#include <linux/init.h> | |
32471 | +#include <linux/tty.h> | |
32472 | +#include <linux/vt_kern.h> /* For unblank_screen() */ | |
32473 | +#include <linux/compiler.h> | |
32474 | +#include <linux/module.h> | |
32475 | +#include <linux/kprobes.h> | |
32476 | + | |
32477 | +#include <asm/system.h> | |
32478 | +#include <asm/uaccess.h> | |
32479 | +#include <asm/pgalloc.h> | |
32480 | +#include <asm/smp.h> | |
32481 | +#include <asm/tlbflush.h> | |
32482 | +#include <asm/proto.h> | |
32483 | +#include <asm/kdebug.h> | |
32484 | +#include <asm-generic/sections.h> | |
32485 | + | |
32486 | +/* Page fault error code bits */ | |
32487 | +#define PF_PROT (1<<0) /* or no page found */ | |
32488 | +#define PF_WRITE (1<<1) | |
32489 | +#define PF_USER (1<<2) | |
32490 | +#define PF_RSVD (1<<3) | |
32491 | +#define PF_INSTR (1<<4) | |
32492 | + | |
32493 | +#ifdef CONFIG_KPROBES | |
32494 | +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); | |
32495 | + | |
32496 | +/* Hook to register for page fault notifications */ | |
32497 | +int register_page_fault_notifier(struct notifier_block *nb) | |
32498 | +{ | |
32499 | + vmalloc_sync_all(); | |
32500 | + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); | |
32501 | +} | |
32502 | + | |
32503 | +int unregister_page_fault_notifier(struct notifier_block *nb) | |
32504 | +{ | |
32505 | + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); | |
32506 | +} | |
32507 | + | |
32508 | +static inline int notify_page_fault(enum die_val val, const char *str, | |
32509 | + struct pt_regs *regs, long err, int trap, int sig) | |
32510 | +{ | |
32511 | + struct die_args args = { | |
32512 | + .regs = regs, | |
32513 | + .str = str, | |
32514 | + .err = err, | |
32515 | + .trapnr = trap, | |
32516 | + .signr = sig | |
32517 | + }; | |
32518 | + return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); | |
32519 | +} | |
32520 | +#else | |
32521 | +static inline int notify_page_fault(enum die_val val, const char *str, | |
32522 | + struct pt_regs *regs, long err, int trap, int sig) | |
32523 | +{ | |
32524 | + return NOTIFY_DONE; | |
32525 | +} | |
32526 | +#endif | |
32527 | + | |
32528 | +void bust_spinlocks(int yes) | |
32529 | +{ | |
32530 | + int loglevel_save = console_loglevel; | |
32531 | + if (yes) { | |
32532 | + oops_in_progress = 1; | |
32533 | + } else { | |
32534 | +#ifdef CONFIG_VT | |
32535 | + unblank_screen(); | |
32536 | +#endif | |
32537 | + oops_in_progress = 0; | |
32538 | + /* | |
32539 | + * OK, the message is on the console. Now we call printk() | |
32540 | + * without oops_in_progress set so that printk will give klogd | |
32541 | + * a poke. Hold onto your hats... | |
32542 | + */ | |
32543 | + console_loglevel = 15; /* NMI oopser may have shut the console up */ | |
32544 | + printk(" "); | |
32545 | + console_loglevel = loglevel_save; | |
32546 | + } | |
32547 | +} | |
32548 | + | |
32549 | +/* Sometimes the CPU reports invalid exceptions on prefetch. | |
32550 | + Check that here and ignore. | |
32551 | + Opcode checker based on code by Richard Brunner */ | |
32552 | +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | |
32553 | + unsigned long error_code) | |
32554 | +{ | |
32555 | + unsigned char *instr; | |
32556 | + int scan_more = 1; | |
32557 | + int prefetch = 0; | |
32558 | + unsigned char *max_instr; | |
32559 | + | |
32560 | + /* If it was an exec fault ignore */ | |
32561 | + if (error_code & PF_INSTR) | |
32562 | + return 0; | |
32563 | + | |
32564 | + instr = (unsigned char *)convert_rip_to_linear(current, regs); | |
32565 | + max_instr = instr + 15; | |
32566 | + | |
32567 | + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | |
32568 | + return 0; | |
32569 | + | |
32570 | + while (scan_more && instr < max_instr) { | |
32571 | + unsigned char opcode; | |
32572 | + unsigned char instr_hi; | |
32573 | + unsigned char instr_lo; | |
32574 | + | |
32575 | + if (__get_user(opcode, instr)) | |
32576 | + break; | |
32577 | + | |
32578 | + instr_hi = opcode & 0xf0; | |
32579 | + instr_lo = opcode & 0x0f; | |
32580 | + instr++; | |
32581 | + | |
32582 | + switch (instr_hi) { | |
32583 | + case 0x20: | |
32584 | + case 0x30: | |
32585 | + /* Values 0x26,0x2E,0x36,0x3E are valid x86 | |
32586 | + prefixes. In long mode, the CPU will signal | |
32587 | + invalid opcode if some of these prefixes are | |
32588 | + present so we will never get here anyway */ | |
32589 | + scan_more = ((instr_lo & 7) == 0x6); | |
32590 | + break; | |
32591 | + | |
32592 | + case 0x40: | |
32593 | + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | |
32594 | + Need to figure out under what instruction mode the | |
32595 | + instruction was issued ... */ | |
32596 | + /* Could check the LDT for lm, but for now it's good | |
32597 | + enough to assume that long mode only uses well known | |
32598 | + segments or kernel. */ | |
32599 | + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); | |
32600 | + break; | |
32601 | + | |
32602 | + case 0x60: | |
32603 | + /* 0x64 thru 0x67 are valid prefixes in all modes. */ | |
32604 | + scan_more = (instr_lo & 0xC) == 0x4; | |
32605 | + break; | |
32606 | + case 0xF0: | |
32607 | + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | |
32608 | + scan_more = !instr_lo || (instr_lo>>1) == 1; | |
32609 | + break; | |
32610 | + case 0x00: | |
32611 | + /* Prefetch instruction is 0x0F0D or 0x0F18 */ | |
32612 | + scan_more = 0; | |
32613 | + if (__get_user(opcode, instr)) | |
32614 | + break; | |
32615 | + prefetch = (instr_lo == 0xF) && | |
32616 | + (opcode == 0x0D || opcode == 0x18); | |
32617 | + break; | |
32618 | + default: | |
32619 | + scan_more = 0; | |
32620 | + break; | |
32621 | + } | |
32622 | + } | |
32623 | + return prefetch; | |
32624 | +} | |
32625 | + | |
32626 | +static int bad_address(void *p) | |
32627 | +{ | |
32628 | + unsigned long dummy; | |
32629 | + return __get_user(dummy, (unsigned long *)p); | |
32630 | +} | |
32631 | + | |
32632 | +void dump_pagetable(unsigned long address) | |
32633 | +{ | |
32634 | + pgd_t *pgd; | |
32635 | + pud_t *pud; | |
32636 | + pmd_t *pmd; | |
32637 | + pte_t *pte; | |
32638 | + | |
32639 | + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); | |
32640 | + pgd += pgd_index(address); | |
32641 | + if (bad_address(pgd)) goto bad; | |
32642 | + printk("PGD %lx ", pgd_val(*pgd)); | |
32643 | + if (!pgd_present(*pgd)) goto ret; | |
32644 | + | |
32645 | + pud = pud_offset(pgd, address); | |
32646 | + if (bad_address(pud)) goto bad; | |
32647 | + printk("PUD %lx ", pud_val(*pud)); | |
32648 | + if (!pud_present(*pud)) goto ret; | |
32649 | + | |
32650 | + pmd = pmd_offset(pud, address); | |
32651 | + if (bad_address(pmd)) goto bad; | |
32652 | + printk("PMD %lx ", pmd_val(*pmd)); | |
32653 | + if (!pmd_present(*pmd)) goto ret; | |
32654 | + | |
32655 | + pte = pte_offset_kernel(pmd, address); | |
32656 | + if (bad_address(pte)) goto bad; | |
32657 | + printk("PTE %lx", pte_val(*pte)); | |
32658 | +ret: | |
32659 | + printk("\n"); | |
32660 | + return; | |
32661 | +bad: | |
32662 | + printk("BAD\n"); | |
32663 | +} | |
32664 | + | |
32665 | +static const char errata93_warning[] = | |
32666 | +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | |
32667 | +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | |
32668 | +KERN_ERR "******* Please consider a BIOS update.\n" | |
32669 | +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | |
32670 | + | |
32671 | +/* Workaround for K8 erratum #93 & buggy BIOS. | |
32672 | + BIOS SMM functions are required to use a specific workaround | |
32673 | + to avoid corruption of the 64bit RIP register on C stepping K8. | |
32674 | + A lot of BIOS that didn't get tested properly miss this. | |
32675 | + The OS sees this as a page fault with the upper 32bits of RIP cleared. | |
32676 | + Try to work around it here. | |
32677 | + Note we only handle faults in kernel here. */ | |
32678 | + | |
32679 | +static int is_errata93(struct pt_regs *regs, unsigned long address) | |
32680 | +{ | |
32681 | + static int warned; | |
32682 | + if (address != regs->rip) | |
32683 | + return 0; | |
32684 | + if ((address >> 32) != 0) | |
32685 | + return 0; | |
32686 | + address |= 0xffffffffUL << 32; | |
32687 | + if ((address >= (u64)_stext && address <= (u64)_etext) || | |
32688 | + (address >= MODULES_VADDR && address <= MODULES_END)) { | |
32689 | + if (!warned) { | |
32690 | + printk(errata93_warning); | |
32691 | + warned = 1; | |
32692 | + } | |
32693 | + regs->rip = address; | |
32694 | + return 1; | |
32695 | + } | |
32696 | + return 0; | |
32697 | +} | |
32698 | + | |
32699 | +int unhandled_signal(struct task_struct *tsk, int sig) | |
32700 | +{ | |
32701 | + if (tsk->pid == 1) | |
32702 | + return 1; | |
32703 | + if (tsk->ptrace & PT_PTRACED) | |
32704 | + return 0; | |
32705 | + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || | |
32706 | + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); | |
32707 | +} | |
32708 | + | |
32709 | +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | |
32710 | + unsigned long error_code) | |
32711 | +{ | |
32712 | + unsigned long flags = oops_begin(); | |
32713 | + struct task_struct *tsk; | |
32714 | + | |
32715 | + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | |
32716 | + current->comm, address); | |
32717 | + dump_pagetable(address); | |
32718 | + tsk = current; | |
32719 | + tsk->thread.cr2 = address; | |
32720 | + tsk->thread.trap_no = 14; | |
32721 | + tsk->thread.error_code = error_code; | |
32722 | + __die("Bad pagetable", regs, error_code); | |
32723 | + oops_end(flags); | |
32724 | + do_exit(SIGKILL); | |
32725 | +} | |
32726 | + | |
32727 | +/* | |
32728 | + * Handle a fault on the vmalloc area | |
32729 | + * | |
32730 | + * This assumes no large pages in there. | |
32731 | + */ | |
32732 | +static int vmalloc_fault(unsigned long address) | |
32733 | +{ | |
32734 | + pgd_t *pgd, *pgd_ref; | |
32735 | + pud_t *pud, *pud_ref; | |
32736 | + pmd_t *pmd, *pmd_ref; | |
32737 | + pte_t *pte, *pte_ref; | |
32738 | + | |
32739 | + /* Copy kernel mappings over when needed. This can also | |
32740 | + happen within a race in page table update. In the latter | |
32741 | + case just flush. */ | |
32742 | + | |
32743 | + /* On Xen the line below does not always work. Needs investigating! */ | |
32744 | + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ | |
32745 | + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); | |
32746 | + pgd += pgd_index(address); | |
32747 | + pgd_ref = pgd_offset_k(address); | |
32748 | + if (pgd_none(*pgd_ref)) | |
32749 | + return -1; | |
32750 | + if (pgd_none(*pgd)) | |
32751 | + set_pgd(pgd, *pgd_ref); | |
32752 | + else | |
32753 | + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); | |
32754 | + | |
32755 | + /* Below here mismatches are bugs because these lower tables | |
32756 | + are shared */ | |
32757 | + | |
32758 | + pud = pud_offset(pgd, address); | |
32759 | + pud_ref = pud_offset(pgd_ref, address); | |
32760 | + if (pud_none(*pud_ref)) | |
32761 | + return -1; | |
32762 | + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) | |
32763 | + BUG(); | |
32764 | + pmd = pmd_offset(pud, address); | |
32765 | + pmd_ref = pmd_offset(pud_ref, address); | |
32766 | + if (pmd_none(*pmd_ref)) | |
32767 | + return -1; | |
32768 | + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | |
32769 | + BUG(); | |
32770 | + pte_ref = pte_offset_kernel(pmd_ref, address); | |
32771 | + if (!pte_present(*pte_ref)) | |
32772 | + return -1; | |
32773 | + pte = pte_offset_kernel(pmd, address); | |
32774 | + /* Don't use pte_page here, because the mappings can point | |
32775 | + outside mem_map, and the NUMA hash lookup cannot handle | |
32776 | + that. */ | |
32777 | + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | |
32778 | + BUG(); | |
32779 | + return 0; | |
32780 | +} | |
32781 | + | |
32782 | +int page_fault_trace = 0; | |
32783 | +int exception_trace = 1; | |
32784 | + | |
32785 | + | |
32786 | +#define MEM_VERBOSE 1 | |
32787 | + | |
32788 | +#ifdef MEM_VERBOSE | |
32789 | +#define MEM_LOG(_f, _a...) \ | |
32790 | + printk("fault.c:[%d]-> " _f "\n", \ | |
32791 | + __LINE__ , ## _a ) | |
32792 | +#else | |
32793 | +#define MEM_LOG(_f, _a...) ((void)0) | |
32794 | +#endif | |
32795 | + | |
32796 | +static int spurious_fault(struct pt_regs *regs, | |
32797 | + unsigned long address, | |
32798 | + unsigned long error_code) | |
32799 | +{ | |
32800 | + pgd_t *pgd; | |
32801 | + pud_t *pud; | |
32802 | + pmd_t *pmd; | |
32803 | + pte_t *pte; | |
32804 | + | |
32805 | +#ifdef CONFIG_XEN | |
32806 | + /* Faults in hypervisor area are never spurious. */ | |
32807 | + if ((address >= HYPERVISOR_VIRT_START) && | |
32808 | + (address < HYPERVISOR_VIRT_END)) | |
32809 | + return 0; | |
32810 | +#endif | |
32811 | + | |
32812 | + /* Reserved-bit violation or user access to kernel space? */ | |
32813 | + if (error_code & (PF_RSVD|PF_USER)) | |
32814 | + return 0; | |
32815 | + | |
32816 | + pgd = init_mm.pgd + pgd_index(address); | |
32817 | + if (!pgd_present(*pgd)) | |
32818 | + return 0; | |
32819 | + | |
32820 | + pud = pud_offset(pgd, address); | |
32821 | + if (!pud_present(*pud)) | |
32822 | + return 0; | |
32823 | + | |
32824 | + pmd = pmd_offset(pud, address); | |
32825 | + if (!pmd_present(*pmd)) | |
32826 | + return 0; | |
32827 | + | |
32828 | + pte = pte_offset_kernel(pmd, address); | |
32829 | + if (!pte_present(*pte)) | |
32830 | + return 0; | |
32831 | + if ((error_code & PF_WRITE) && !pte_write(*pte)) | |
32832 | + return 0; | |
32833 | + if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX)) | |
32834 | + return 0; | |
32835 | + | |
32836 | + return 1; | |
32837 | +} | |
32838 | + | |
32839 | +/* | |
32840 | + * This routine handles page faults. It determines the address, | |
32841 | + * and the problem, and then passes it off to one of the appropriate | |
32842 | + * routines. | |
32843 | + */ | |
32844 | +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |
32845 | + unsigned long error_code) | |
32846 | +{ | |
32847 | + struct task_struct *tsk; | |
32848 | + struct mm_struct *mm; | |
32849 | + struct vm_area_struct * vma; | |
32850 | + unsigned long address; | |
32851 | + const struct exception_table_entry *fixup; | |
32852 | + int write; | |
32853 | + unsigned long flags; | |
32854 | + siginfo_t info; | |
32855 | + | |
32856 | + if (!user_mode(regs)) | |
32857 | + error_code &= ~PF_USER; /* means kernel */ | |
32858 | + | |
32859 | + tsk = current; | |
32860 | + mm = tsk->mm; | |
32861 | + prefetchw(&mm->mmap_sem); | |
32862 | + | |
32863 | + /* get the address */ | |
32864 | + address = current_vcpu_info()->arch.cr2; | |
32865 | + | |
32866 | + info.si_code = SEGV_MAPERR; | |
32867 | + | |
32868 | + | |
32869 | + /* | |
32870 | + * We fault-in kernel-space virtual memory on-demand. The | |
32871 | + * 'reference' page table is init_mm.pgd. | |
32872 | + * | |
32873 | + * NOTE! We MUST NOT take any locks for this case. We may | |
32874 | + * be in an interrupt or a critical region, and should | |
32875 | + * only copy the information from the master page table, | |
32876 | + * nothing more. | |
32877 | + * | |
32878 | + * This verifies that the fault happens in kernel space | |
32879 | + * (error_code & 4) == 0, and that the fault was not a | |
32880 | + * protection error (error_code & 9) == 0. | |
32881 | + */ | |
32882 | + if (unlikely(address >= TASK_SIZE64)) { | |
32883 | + /* | |
32884 | + * Don't check for the module range here: its PML4 | |
32885 | + * is always initialized because it's shared with the main | |
32886 | + * kernel text. Only vmalloc may need PML4 syncups. | |
32887 | + */ | |
32888 | + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | |
32889 | + ((address >= VMALLOC_START && address < VMALLOC_END))) { | |
32890 | + if (vmalloc_fault(address) >= 0) | |
32891 | + return; | |
32892 | + } | |
32893 | + /* Can take a spurious fault if mapping changes R/O -> R/W. */ | |
32894 | + if (spurious_fault(regs, address, error_code)) | |
32895 | + return; | |
32896 | + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | |
32897 | + SIGSEGV) == NOTIFY_STOP) | |
32898 | + return; | |
32899 | + /* | |
32900 | + * Don't take the mm semaphore here. If we fixup a prefetch | |
32901 | + * fault we could otherwise deadlock. | |
32902 | + */ | |
32903 | + goto bad_area_nosemaphore; | |
32904 | + } | |
32905 | + | |
32906 | + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | |
32907 | + SIGSEGV) == NOTIFY_STOP) | |
32908 | + return; | |
32909 | + | |
32910 | + if (likely(regs->eflags & X86_EFLAGS_IF)) | |
32911 | + local_irq_enable(); | |
32912 | + | |
32913 | + if (unlikely(page_fault_trace)) | |
32914 | + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", | |
32915 | + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); | |
32916 | + | |
32917 | + if (unlikely(error_code & PF_RSVD)) | |
32918 | + pgtable_bad(address, regs, error_code); | |
32919 | + | |
32920 | + /* | |
32921 | + * If we're in an interrupt or have no user | |
32922 | + * context, we must not take the fault.. | |
32923 | + */ | |
32924 | + if (unlikely(in_atomic() || !mm)) | |
32925 | + goto bad_area_nosemaphore; | |
32926 | + | |
32927 | + again: | |
32928 | + /* When running in the kernel we expect faults to occur only to | |
32929 | + * addresses in user space. All other faults represent errors in the | |
32930 | + * kernel and should generate an OOPS. Unfortunately, in the case of an | |
32931 | + * erroneous fault occurring in a code path which already holds mmap_sem | |
32932 | + * we will deadlock attempting to validate the fault against the | |
32933 | + * address space. Luckily the kernel only validly references user | |
32934 | + * space from well defined areas of code, which are listed in the | |
32935 | + * exceptions table. | |
32936 | + * | |
32937 | + * As the vast majority of faults will be valid we will only perform | |
32938 | + * the source reference check when there is a possibility of a deadlock. | |
32939 | + * Attempt to lock the address space, if we cannot we then validate the | |
32940 | + * source. If this is invalid we can skip the address space check, | |
32941 | + * thus avoiding the deadlock. | |
32942 | + */ | |
32943 | + if (!down_read_trylock(&mm->mmap_sem)) { | |
32944 | + if ((error_code & PF_USER) == 0 && | |
32945 | + !search_exception_tables(regs->rip)) | |
32946 | + goto bad_area_nosemaphore; | |
32947 | + down_read(&mm->mmap_sem); | |
32948 | + } | |
32949 | + | |
32950 | + vma = find_vma(mm, address); | |
32951 | + if (!vma) | |
32952 | + goto bad_area; | |
32953 | + if (likely(vma->vm_start <= address)) | |
32954 | + goto good_area; | |
32955 | + if (!(vma->vm_flags & VM_GROWSDOWN)) | |
32956 | + goto bad_area; | |
32957 | + if (error_code & 4) { | |
32958 | + /* Allow userspace just enough access below the stack pointer | |
32959 | + * to let the 'enter' instruction work. | |
32960 | + */ | |
32961 | + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) | |
32962 | + goto bad_area; | |
32963 | + } | |
32964 | + if (expand_stack(vma, address)) | |
32965 | + goto bad_area; | |
32966 | +/* | |
32967 | + * Ok, we have a good vm_area for this memory access, so | |
32968 | + * we can handle it.. | |
32969 | + */ | |
32970 | +good_area: | |
32971 | + info.si_code = SEGV_ACCERR; | |
32972 | + write = 0; | |
32973 | + switch (error_code & (PF_PROT|PF_WRITE)) { | |
32974 | + default: /* 3: write, present */ | |
32975 | + /* fall through */ | |
32976 | + case PF_WRITE: /* write, not present */ | |
32977 | + if (!(vma->vm_flags & VM_WRITE)) | |
32978 | + goto bad_area; | |
32979 | + write++; | |
32980 | + break; | |
32981 | + case PF_PROT: /* read, present */ | |
32982 | + goto bad_area; | |
32983 | + case 0: /* read, not present */ | |
32984 | + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | |
32985 | + goto bad_area; | |
32986 | + } | |
32987 | + | |
32988 | + /* | |
32989 | + * If for any reason at all we couldn't handle the fault, | |
32990 | + * make sure we exit gracefully rather than endlessly redo | |
32991 | + * the fault. | |
32992 | + */ | |
32993 | + switch (handle_mm_fault(mm, vma, address, write)) { | |
32994 | + case VM_FAULT_MINOR: | |
32995 | + tsk->min_flt++; | |
32996 | + break; | |
32997 | + case VM_FAULT_MAJOR: | |
32998 | + tsk->maj_flt++; | |
32999 | + break; | |
33000 | + case VM_FAULT_SIGBUS: | |
33001 | + goto do_sigbus; | |
33002 | + default: | |
33003 | + goto out_of_memory; | |
33004 | + } | |
33005 | + | |
33006 | + up_read(&mm->mmap_sem); | |
33007 | + return; | |
33008 | + | |
33009 | +/* | |
33010 | + * Something tried to access memory that isn't in our memory map.. | |
33011 | + * Fix it, but check if it's kernel or user first.. | |
33012 | + */ | |
33013 | +bad_area: | |
33014 | + up_read(&mm->mmap_sem); | |
33015 | + | |
33016 | +bad_area_nosemaphore: | |
33017 | + /* User mode accesses just cause a SIGSEGV */ | |
33018 | + if (error_code & PF_USER) { | |
33019 | + if (is_prefetch(regs, address, error_code)) | |
33020 | + return; | |
33021 | + | |
33022 | + /* Work around K8 erratum #100 K8 in compat mode | |
33023 | + occasionally jumps to illegal addresses >4GB. We | |
33024 | + catch this here in the page fault handler because | |
33025 | + these addresses are not reachable. Just detect this | |
33026 | + case and return. Any code segment in LDT is | |
33027 | + compatibility mode. */ | |
33028 | + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | |
33029 | + (address >> 32)) | |
33030 | + return; | |
33031 | + | |
33032 | + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { | |
33033 | + printk( | |
33034 | + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", | |
33035 | + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | |
33036 | + tsk->comm, tsk->pid, address, regs->rip, | |
33037 | + regs->rsp, error_code); | |
33038 | + } | |
33039 | + | |
33040 | + tsk->thread.cr2 = address; | |
33041 | + /* Kernel addresses are always protection faults */ | |
33042 | + tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
33043 | + tsk->thread.trap_no = 14; | |
33044 | + info.si_signo = SIGSEGV; | |
33045 | + info.si_errno = 0; | |
33046 | + /* info.si_code has been set above */ | |
33047 | + info.si_addr = (void __user *)address; | |
33048 | + force_sig_info(SIGSEGV, &info, tsk); | |
33049 | + return; | |
33050 | + } | |
33051 | + | |
33052 | +no_context: | |
33053 | + | |
33054 | + /* Are we prepared to handle this kernel fault? */ | |
33055 | + fixup = search_exception_tables(regs->rip); | |
33056 | + if (fixup) { | |
33057 | + regs->rip = fixup->fixup; | |
33058 | + return; | |
33059 | + } | |
33060 | + | |
33061 | + /* | |
33062 | + * Hall of shame of CPU/BIOS bugs. | |
33063 | + */ | |
33064 | + | |
33065 | + if (is_prefetch(regs, address, error_code)) | |
33066 | + return; | |
33067 | + | |
33068 | + if (is_errata93(regs, address)) | |
33069 | + return; | |
33070 | + | |
33071 | +/* | |
33072 | + * Oops. The kernel tried to access some bad page. We'll have to | |
33073 | + * terminate things with extreme prejudice. | |
33074 | + */ | |
33075 | + | |
33076 | + flags = oops_begin(); | |
33077 | + | |
33078 | + if (address < PAGE_SIZE) | |
33079 | + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | |
33080 | + else | |
33081 | + printk(KERN_ALERT "Unable to handle kernel paging request"); | |
33082 | + printk(" at %016lx RIP: \n" KERN_ALERT,address); | |
33083 | + printk_address(regs->rip); | |
33084 | + dump_pagetable(address); | |
33085 | + tsk->thread.cr2 = address; | |
33086 | + tsk->thread.trap_no = 14; | |
33087 | + tsk->thread.error_code = error_code; | |
33088 | + __die("Oops", regs, error_code); | |
33089 | + /* Executive summary in case the body of the oops scrolled away */ | |
33090 | + printk(KERN_EMERG "CR2: %016lx\n", address); | |
33091 | + oops_end(flags); | |
33092 | + do_exit(SIGKILL); | |
33093 | + | |
33094 | +/* | |
33095 | + * We ran out of memory, or some other thing happened to us that made | |
33096 | + * us unable to handle the page fault gracefully. | |
33097 | + */ | |
33098 | +out_of_memory: | |
33099 | + up_read(&mm->mmap_sem); | |
33100 | + if (current->pid == 1) { | |
33101 | + yield(); | |
33102 | + goto again; | |
33103 | + } | |
33104 | + printk("VM: killing process %s\n", tsk->comm); | |
33105 | + if (error_code & 4) | |
33106 | + do_exit(SIGKILL); | |
33107 | + goto no_context; | |
33108 | + | |
33109 | +do_sigbus: | |
33110 | + up_read(&mm->mmap_sem); | |
33111 | + | |
33112 | + /* Kernel mode? Handle exceptions or die */ | |
33113 | + if (!(error_code & PF_USER)) | |
33114 | + goto no_context; | |
33115 | + | |
33116 | + tsk->thread.cr2 = address; | |
33117 | + tsk->thread.error_code = error_code; | |
33118 | + tsk->thread.trap_no = 14; | |
33119 | + info.si_signo = SIGBUS; | |
33120 | + info.si_errno = 0; | |
33121 | + info.si_code = BUS_ADRERR; | |
33122 | + info.si_addr = (void __user *)address; | |
33123 | + force_sig_info(SIGBUS, &info, tsk); | |
33124 | + return; | |
33125 | +} | |
33126 | + | |
33127 | +DEFINE_SPINLOCK(pgd_lock); | |
33128 | +struct page *pgd_list; | |
33129 | + | |
33130 | +void vmalloc_sync_all(void) | |
33131 | +{ | |
33132 | + /* Note that races in the updates of insync and start aren't | |
33133 | + problematic: | |
33134 | + insync can only get set bits added, and updates to start are only | |
33135 | + improving performance (without affecting correctness if undone). */ | |
33136 | + static DECLARE_BITMAP(insync, PTRS_PER_PGD); | |
33137 | + static unsigned long start = VMALLOC_START & PGDIR_MASK; | |
33138 | + unsigned long address; | |
33139 | + | |
33140 | + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | |
33141 | + if (!test_bit(pgd_index(address), insync)) { | |
33142 | + const pgd_t *pgd_ref = pgd_offset_k(address); | |
33143 | + struct page *page; | |
33144 | + | |
33145 | + if (pgd_none(*pgd_ref)) | |
33146 | + continue; | |
33147 | + spin_lock(&pgd_lock); | |
33148 | + for (page = pgd_list; page; | |
33149 | + page = (struct page *)page->index) { | |
33150 | + pgd_t *pgd; | |
33151 | + pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
33152 | + if (pgd_none(*pgd)) | |
33153 | + set_pgd(pgd, *pgd_ref); | |
33154 | + else | |
33155 | + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); | |
33156 | + } | |
33157 | + spin_unlock(&pgd_lock); | |
33158 | + set_bit(pgd_index(address), insync); | |
33159 | + } | |
33160 | + if (address == start) | |
33161 | + start = address + PGDIR_SIZE; | |
33162 | + } | |
33163 | + /* Check that there is no need to do the same for the modules area. */ | |
33164 | + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); | |
33165 | + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == | |
33166 | + (__START_KERNEL & PGDIR_MASK))); | |
33167 | +} | |
33168 | + | |
33169 | +static int __init enable_pagefaulttrace(char *str) | |
33170 | +{ | |
33171 | + page_fault_trace = 1; | |
33172 | + return 1; | |
33173 | +} | |
33174 | +__setup("pagefaulttrace", enable_pagefaulttrace); | |
33175 | Index: head-2008-11-25/arch/x86/mm/init_64-xen.c | |
33176 | =================================================================== | |
33177 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
33178 | +++ head-2008-11-25/arch/x86/mm/init_64-xen.c 2008-10-29 09:55:56.000000000 +0100 | |
33179 | @@ -0,0 +1,1206 @@ | |
33180 | +/* | |
33181 | + * linux/arch/x86_64/mm/init.c | |
33182 | + * | |
33183 | + * Copyright (C) 1995 Linus Torvalds | |
33184 | + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | |
33185 | + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | |
33186 | + * | |
33187 | + * Jun Nakajima <jun.nakajima@intel.com> | |
33188 | + * Modified for Xen. | |
33189 | + */ | |
33190 | + | |
33191 | +#include <linux/signal.h> | |
33192 | +#include <linux/sched.h> | |
33193 | +#include <linux/kernel.h> | |
33194 | +#include <linux/errno.h> | |
33195 | +#include <linux/string.h> | |
33196 | +#include <linux/types.h> | |
33197 | +#include <linux/ptrace.h> | |
33198 | +#include <linux/mman.h> | |
33199 | +#include <linux/mm.h> | |
33200 | +#include <linux/swap.h> | |
33201 | +#include <linux/smp.h> | |
33202 | +#include <linux/init.h> | |
33203 | +#include <linux/pagemap.h> | |
33204 | +#include <linux/bootmem.h> | |
33205 | +#include <linux/proc_fs.h> | |
33206 | +#include <linux/pci.h> | |
33207 | +#include <linux/poison.h> | |
33208 | +#include <linux/dma-mapping.h> | |
33209 | +#include <linux/module.h> | |
33210 | +#include <linux/memory_hotplug.h> | |
33211 | + | |
33212 | +#include <asm/processor.h> | |
33213 | +#include <asm/system.h> | |
33214 | +#include <asm/uaccess.h> | |
33215 | +#include <asm/pgtable.h> | |
33216 | +#include <asm/pgalloc.h> | |
33217 | +#include <asm/dma.h> | |
33218 | +#include <asm/fixmap.h> | |
33219 | +#include <asm/e820.h> | |
33220 | +#include <asm/apic.h> | |
33221 | +#include <asm/tlb.h> | |
33222 | +#include <asm/mmu_context.h> | |
33223 | +#include <asm/proto.h> | |
33224 | +#include <asm/smp.h> | |
33225 | +#include <asm/sections.h> | |
33226 | + | |
33227 | +#include <xen/features.h> | |
33228 | + | |
33229 | +#ifndef Dprintk | |
33230 | +#define Dprintk(x...) | |
33231 | +#endif | |
33232 | + | |
33233 | +struct dma_mapping_ops* dma_ops; | |
33234 | +EXPORT_SYMBOL(dma_ops); | |
33235 | + | |
33236 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
33237 | +unsigned int __kernel_page_user; | |
33238 | +EXPORT_SYMBOL(__kernel_page_user); | |
33239 | +#endif | |
33240 | + | |
33241 | +int after_bootmem; | |
33242 | + | |
33243 | +static unsigned long dma_reserve __initdata; | |
33244 | + | |
33245 | +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | |
33246 | +extern unsigned long start_pfn; | |
33247 | + | |
33248 | +/* | |
33249 | + * Use this until direct mapping is established, i.e. before __va() is | |
33250 | + * available in init_memory_mapping(). | |
33251 | + */ | |
33252 | + | |
33253 | +#define addr_to_page(addr, page) \ | |
33254 | + (addr) &= PHYSICAL_PAGE_MASK; \ | |
33255 | + (page) = ((unsigned long *) ((unsigned long) \ | |
33256 | + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \ | |
33257 | + __START_KERNEL_map))) | |
33258 | + | |
33259 | +static void __meminit early_make_page_readonly(void *va, unsigned int feature) | |
33260 | +{ | |
33261 | + unsigned long addr, _va = (unsigned long)va; | |
33262 | + pte_t pte, *ptep; | |
33263 | + unsigned long *page = (unsigned long *) init_level4_pgt; | |
33264 | + | |
33265 | + BUG_ON(after_bootmem); | |
33266 | + | |
33267 | + if (xen_feature(feature)) | |
33268 | + return; | |
33269 | + | |
33270 | + addr = (unsigned long) page[pgd_index(_va)]; | |
33271 | + addr_to_page(addr, page); | |
33272 | + | |
33273 | + addr = page[pud_index(_va)]; | |
33274 | + addr_to_page(addr, page); | |
33275 | + | |
33276 | + addr = page[pmd_index(_va)]; | |
33277 | + addr_to_page(addr, page); | |
33278 | + | |
33279 | + ptep = (pte_t *) &page[pte_index(_va)]; | |
33280 | + | |
33281 | + pte.pte = ptep->pte & ~_PAGE_RW; | |
33282 | + if (HYPERVISOR_update_va_mapping(_va, pte, 0)) | |
33283 | + BUG(); | |
33284 | +} | |
33285 | + | |
33286 | +static void __make_page_readonly(void *va) | |
33287 | +{ | |
33288 | + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; | |
33289 | + unsigned long addr = (unsigned long) va; | |
33290 | + | |
33291 | + pgd = pgd_offset_k(addr); | |
33292 | + pud = pud_offset(pgd, addr); | |
33293 | + pmd = pmd_offset(pud, addr); | |
33294 | + ptep = pte_offset_kernel(pmd, addr); | |
33295 | + | |
33296 | + pte.pte = ptep->pte & ~_PAGE_RW; | |
33297 | + if (HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
33298 | + xen_l1_entry_update(ptep, pte); /* fallback */ | |
33299 | + | |
33300 | + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) | |
33301 | + __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT)); | |
33302 | +} | |
33303 | + | |
33304 | +static void __make_page_writable(void *va) | |
33305 | +{ | |
33306 | + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; | |
33307 | + unsigned long addr = (unsigned long) va; | |
33308 | + | |
33309 | + pgd = pgd_offset_k(addr); | |
33310 | + pud = pud_offset(pgd, addr); | |
33311 | + pmd = pmd_offset(pud, addr); | |
33312 | + ptep = pte_offset_kernel(pmd, addr); | |
33313 | + | |
33314 | + pte.pte = ptep->pte | _PAGE_RW; | |
33315 | + if (HYPERVISOR_update_va_mapping(addr, pte, 0)) | |
33316 | + xen_l1_entry_update(ptep, pte); /* fallback */ | |
33317 | + | |
33318 | + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) | |
33319 | + __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT)); | |
33320 | +} | |
33321 | + | |
33322 | +void make_page_readonly(void *va, unsigned int feature) | |
33323 | +{ | |
33324 | + if (!xen_feature(feature)) | |
33325 | + __make_page_readonly(va); | |
33326 | +} | |
33327 | + | |
33328 | +void make_page_writable(void *va, unsigned int feature) | |
33329 | +{ | |
33330 | + if (!xen_feature(feature)) | |
33331 | + __make_page_writable(va); | |
33332 | +} | |
33333 | + | |
33334 | +void make_pages_readonly(void *va, unsigned nr, unsigned int feature) | |
33335 | +{ | |
33336 | + if (xen_feature(feature)) | |
33337 | + return; | |
33338 | + | |
33339 | + while (nr-- != 0) { | |
33340 | + __make_page_readonly(va); | |
33341 | + va = (void*)((unsigned long)va + PAGE_SIZE); | |
33342 | + } | |
33343 | +} | |
33344 | + | |
33345 | +void make_pages_writable(void *va, unsigned nr, unsigned int feature) | |
33346 | +{ | |
33347 | + if (xen_feature(feature)) | |
33348 | + return; | |
33349 | + | |
33350 | + while (nr-- != 0) { | |
33351 | + __make_page_writable(va); | |
33352 | + va = (void*)((unsigned long)va + PAGE_SIZE); | |
33353 | + } | |
33354 | +} | |
33355 | + | |
33356 | +/* | |
33357 | + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | |
33358 | + * physical space so we can cache the place of the first one and move | |
33359 | + * around without checking the pgd every time. | |
33360 | + */ | |
33361 | + | |
33362 | +void show_mem(void) | |
33363 | +{ | |
33364 | + long i, total = 0, reserved = 0; | |
33365 | + long shared = 0, cached = 0; | |
33366 | + pg_data_t *pgdat; | |
33367 | + struct page *page; | |
33368 | + | |
33369 | + printk(KERN_INFO "Mem-info:\n"); | |
33370 | + show_free_areas(); | |
33371 | + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | |
33372 | + | |
33373 | + for_each_online_pgdat(pgdat) { | |
33374 | + for (i = 0; i < pgdat->node_spanned_pages; ++i) { | |
33375 | + page = pfn_to_page(pgdat->node_start_pfn + i); | |
33376 | + total++; | |
33377 | + if (PageReserved(page)) | |
33378 | + reserved++; | |
33379 | + else if (PageSwapCache(page)) | |
33380 | + cached++; | |
33381 | + else if (page_count(page)) | |
33382 | + shared += page_count(page) - 1; | |
33383 | + } | |
33384 | + } | |
33385 | + printk(KERN_INFO "%lu pages of RAM\n", total); | |
33386 | + printk(KERN_INFO "%lu reserved pages\n",reserved); | |
33387 | + printk(KERN_INFO "%lu pages shared\n",shared); | |
33388 | + printk(KERN_INFO "%lu pages swap cached\n",cached); | |
33389 | +} | |
33390 | + | |
33391 | + | |
33392 | +static __init void *spp_getpage(void) | |
33393 | +{ | |
33394 | + void *ptr; | |
33395 | + if (after_bootmem) | |
33396 | + ptr = (void *) get_zeroed_page(GFP_ATOMIC); | |
33397 | + else if (start_pfn < table_end) { | |
33398 | + ptr = __va(start_pfn << PAGE_SHIFT); | |
33399 | + start_pfn++; | |
33400 | + memset(ptr, 0, PAGE_SIZE); | |
33401 | + } else | |
33402 | + ptr = alloc_bootmem_pages(PAGE_SIZE); | |
33403 | + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | |
33404 | + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | |
33405 | + | |
33406 | + Dprintk("spp_getpage %p\n", ptr); | |
33407 | + return ptr; | |
33408 | +} | |
33409 | + | |
33410 | +#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address)) | |
33411 | +#define pud_offset_u(address) (level3_user_pgt + pud_index(address)) | |
33412 | + | |
33413 | +static __init void set_pte_phys(unsigned long vaddr, | |
33414 | + unsigned long phys, pgprot_t prot, int user_mode) | |
33415 | +{ | |
33416 | + pgd_t *pgd; | |
33417 | + pud_t *pud; | |
33418 | + pmd_t *pmd; | |
33419 | + pte_t *pte, new_pte; | |
33420 | + | |
33421 | + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | |
33422 | + | |
33423 | + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr)); | |
33424 | + if (pgd_none(*pgd)) { | |
33425 | + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
33426 | + return; | |
33427 | + } | |
33428 | + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr)); | |
33429 | + if (pud_none(*pud)) { | |
33430 | + pmd = (pmd_t *) spp_getpage(); | |
33431 | + make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
33432 | + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | |
33433 | + if (pmd != pmd_offset(pud, 0)) { | |
33434 | + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | |
33435 | + return; | |
33436 | + } | |
33437 | + } | |
33438 | + pmd = pmd_offset(pud, vaddr); | |
33439 | + if (pmd_none(*pmd)) { | |
33440 | + pte = (pte_t *) spp_getpage(); | |
33441 | + make_page_readonly(pte, XENFEAT_writable_page_tables); | |
33442 | + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | |
33443 | + if (pte != pte_offset_kernel(pmd, 0)) { | |
33444 | + printk("PAGETABLE BUG #02!\n"); | |
33445 | + return; | |
33446 | + } | |
33447 | + } | |
33448 | + if (pgprot_val(prot)) | |
33449 | + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | |
33450 | + else | |
33451 | + new_pte = __pte(0); | |
33452 | + | |
33453 | + pte = pte_offset_kernel(pmd, vaddr); | |
33454 | + if (!pte_none(*pte) && __pte_val(new_pte) && | |
33455 | + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask)) | |
33456 | + pte_ERROR(*pte); | |
33457 | + set_pte(pte, new_pte); | |
33458 | + | |
33459 | + /* | |
33460 | + * It's enough to flush this one mapping. | |
33461 | + * (PGE mappings get flushed as well) | |
33462 | + */ | |
33463 | + __flush_tlb_one(vaddr); | |
33464 | +} | |
33465 | + | |
33466 | +static __init void set_pte_phys_ma(unsigned long vaddr, | |
33467 | + unsigned long phys, pgprot_t prot) | |
33468 | +{ | |
33469 | + pgd_t *pgd; | |
33470 | + pud_t *pud; | |
33471 | + pmd_t *pmd; | |
33472 | + pte_t *pte, new_pte; | |
33473 | + | |
33474 | + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | |
33475 | + | |
33476 | + pgd = pgd_offset_k(vaddr); | |
33477 | + if (pgd_none(*pgd)) { | |
33478 | + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | |
33479 | + return; | |
33480 | + } | |
33481 | + pud = pud_offset(pgd, vaddr); | |
33482 | + if (pud_none(*pud)) { | |
33483 | + | |
33484 | + pmd = (pmd_t *) spp_getpage(); | |
33485 | + make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
33486 | + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | |
33487 | + if (pmd != pmd_offset(pud, 0)) { | |
33488 | + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | |
33489 | + return; | |
33490 | + } | |
33491 | + } | |
33492 | + pmd = pmd_offset(pud, vaddr); | |
33493 | + if (pmd_none(*pmd)) { | |
33494 | + pte = (pte_t *) spp_getpage(); | |
33495 | + make_page_readonly(pte, XENFEAT_writable_page_tables); | |
33496 | + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | |
33497 | + if (pte != pte_offset_kernel(pmd, 0)) { | |
33498 | + printk("PAGETABLE BUG #02!\n"); | |
33499 | + return; | |
33500 | + } | |
33501 | + } | |
33502 | + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot); | |
33503 | + | |
33504 | + pte = pte_offset_kernel(pmd, vaddr); | |
33505 | + if (!pte_none(*pte) && __pte_val(new_pte) && | |
33506 | +#ifdef CONFIG_ACPI | |
33507 | + /* __acpi_map_table() fails to properly call clear_fixmap() */ | |
33508 | + (vaddr < __fix_to_virt(FIX_ACPI_END) || | |
33509 | + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) && | |
33510 | +#endif | |
33511 | + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask)) | |
33512 | + pte_ERROR(*pte); | |
33513 | + set_pte(pte, new_pte); | |
33514 | + | |
33515 | + /* | |
33516 | + * It's enough to flush this one mapping. | |
33517 | + * (PGE mappings get flushed as well) | |
33518 | + */ | |
33519 | + __flush_tlb_one(vaddr); | |
33520 | +} | |
33521 | + | |
33522 | +/* NOTE: this is meant to be run only at boot */ | |
33523 | +void __init | |
33524 | +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | |
33525 | +{ | |
33526 | + unsigned long address = __fix_to_virt(idx); | |
33527 | + | |
33528 | + if (idx >= __end_of_fixed_addresses) { | |
33529 | + printk("Invalid __set_fixmap\n"); | |
33530 | + return; | |
33531 | + } | |
33532 | + switch (idx) { | |
33533 | + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: | |
33534 | + set_pte_phys(address, phys, prot, 0); | |
33535 | + set_pte_phys(address, phys, prot, 1); | |
33536 | + break; | |
33537 | + default: | |
33538 | + set_pte_phys_ma(address, phys, prot); | |
33539 | + break; | |
33540 | + } | |
33541 | +} | |
33542 | + | |
33543 | +unsigned long __initdata table_start, table_end; | |
33544 | + | |
33545 | +static __meminit void *alloc_static_page(unsigned long *phys) | |
33546 | +{ | |
33547 | + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map; | |
33548 | + | |
33549 | + if (after_bootmem) { | |
33550 | + void *adr = (void *)get_zeroed_page(GFP_ATOMIC); | |
33551 | + | |
33552 | + *phys = __pa(adr); | |
33553 | + return adr; | |
33554 | + } | |
33555 | + | |
33556 | + *phys = start_pfn << PAGE_SHIFT; | |
33557 | + start_pfn++; | |
33558 | + memset((void *)va, 0, PAGE_SIZE); | |
33559 | + return (void *)va; | |
33560 | +} | |
33561 | + | |
33562 | +#define PTE_SIZE PAGE_SIZE | |
33563 | + | |
33564 | +static inline int make_readonly(unsigned long paddr) | |
33565 | +{ | |
33566 | + extern char __vsyscall_0; | |
33567 | + int readonly = 0; | |
33568 | + | |
33569 | + /* Make new page tables read-only. */ | |
33570 | + if (!xen_feature(XENFEAT_writable_page_tables) | |
33571 | + && (paddr >= (table_start << PAGE_SHIFT)) | |
33572 | + && (paddr < (table_end << PAGE_SHIFT))) | |
33573 | + readonly = 1; | |
33574 | + /* Make old page tables read-only. */ | |
33575 | + if (!xen_feature(XENFEAT_writable_page_tables) | |
33576 | + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) | |
33577 | + && (paddr < (start_pfn << PAGE_SHIFT))) | |
33578 | + readonly = 1; | |
33579 | + | |
33580 | + /* | |
33581 | + * No need for writable mapping of kernel image. This also ensures that | |
33582 | + * page and descriptor tables embedded inside don't have writable | |
33583 | + * mappings. Exclude the vsyscall area here, allowing alternative | |
33584 | + * instruction patching to work. | |
33585 | + */ | |
33586 | + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)) | |
33587 | + && !(paddr >= __pa_symbol(&__vsyscall_0) | |
33588 | + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE)) | |
33589 | + readonly = 1; | |
33590 | + | |
33591 | + return readonly; | |
33592 | +} | |
33593 | + | |
33594 | +#ifndef CONFIG_XEN | |
33595 | +/* Must run before zap_low_mappings */ | |
33596 | +__init void *early_ioremap(unsigned long addr, unsigned long size) | |
33597 | +{ | |
33598 | + unsigned long map = round_down(addr, LARGE_PAGE_SIZE); | |
33599 | + | |
33600 | + /* actually usually some more */ | |
33601 | + if (size >= LARGE_PAGE_SIZE) { | |
33602 | + printk("SMBIOS area too long %lu\n", size); | |
33603 | + return NULL; | |
33604 | + } | |
33605 | + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); | |
33606 | + map += LARGE_PAGE_SIZE; | |
33607 | + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); | |
33608 | + __flush_tlb(); | |
33609 | + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); | |
33610 | +} | |
33611 | + | |
33612 | +/* To avoid virtual aliases later */ | |
33613 | +__init void early_iounmap(void *addr, unsigned long size) | |
33614 | +{ | |
33615 | + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) | |
33616 | + printk("early_iounmap: bad address %p\n", addr); | |
33617 | + set_pmd(temp_mappings[0].pmd, __pmd(0)); | |
33618 | + set_pmd(temp_mappings[1].pmd, __pmd(0)); | |
33619 | + __flush_tlb(); | |
33620 | +} | |
33621 | +#endif | |
33622 | + | |
33623 | +static void __meminit | |
33624 | +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) | |
33625 | +{ | |
33626 | + int i, k; | |
33627 | + | |
33628 | + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) { | |
33629 | + unsigned long pte_phys; | |
33630 | + pte_t *pte, *pte_save; | |
33631 | + | |
33632 | + if (address >= end) | |
33633 | + break; | |
33634 | + pte = alloc_static_page(&pte_phys); | |
33635 | + pte_save = pte; | |
33636 | + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) { | |
33637 | + unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE; | |
33638 | + | |
33639 | + if (address >= (after_bootmem | |
33640 | + ? end | |
33641 | + : xen_start_info->nr_pages << PAGE_SHIFT)) | |
33642 | + pteval = 0; | |
33643 | + else if (make_readonly(address)) | |
33644 | + pteval &= ~_PAGE_RW; | |
33645 | + set_pte(pte, __pte(pteval & __supported_pte_mask)); | |
33646 | + } | |
33647 | + if (!after_bootmem) { | |
33648 | + early_make_page_readonly(pte_save, XENFEAT_writable_page_tables); | |
33649 | + *pmd = __pmd(pte_phys | _KERNPG_TABLE); | |
33650 | + } else { | |
33651 | + make_page_readonly(pte_save, XENFEAT_writable_page_tables); | |
33652 | + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE)); | |
33653 | + } | |
33654 | + } | |
33655 | +} | |
33656 | + | |
33657 | +static void __meminit | |
33658 | +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) | |
33659 | +{ | |
33660 | + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); | |
33661 | + | |
33662 | + if (pmd_none(*pmd)) { | |
33663 | + spin_lock(&init_mm.page_table_lock); | |
33664 | + phys_pmd_init(pmd, address, end); | |
33665 | + spin_unlock(&init_mm.page_table_lock); | |
33666 | + __flush_tlb_all(); | |
33667 | + } | |
33668 | +} | |
33669 | + | |
33670 | +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) | |
33671 | +{ | |
33672 | + long i = pud_index(address); | |
33673 | + | |
33674 | + pud = pud + i; | |
33675 | + | |
33676 | + if (after_bootmem && pud_val(*pud)) { | |
33677 | + phys_pmd_update(pud, address, end); | |
33678 | + return; | |
33679 | + } | |
33680 | + | |
33681 | + for (; i < PTRS_PER_PUD; pud++, i++) { | |
33682 | + unsigned long paddr, pmd_phys; | |
33683 | + pmd_t *pmd; | |
33684 | + | |
33685 | + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; | |
33686 | + if (paddr >= end) | |
33687 | + break; | |
33688 | + | |
33689 | + pmd = alloc_static_page(&pmd_phys); | |
33690 | + | |
33691 | + spin_lock(&init_mm.page_table_lock); | |
33692 | + *pud = __pud(pmd_phys | _KERNPG_TABLE); | |
33693 | + phys_pmd_init(pmd, paddr, end); | |
33694 | + spin_unlock(&init_mm.page_table_lock); | |
33695 | + | |
33696 | + early_make_page_readonly(pmd, XENFEAT_writable_page_tables); | |
33697 | + } | |
33698 | + __flush_tlb(); | |
33699 | +} | |
33700 | + | |
33701 | +void __init xen_init_pt(void) | |
33702 | +{ | |
33703 | + unsigned long addr, *page; | |
33704 | + | |
33705 | + /* Find the initial pte page that was built for us. */ | |
33706 | + page = (unsigned long *)xen_start_info->pt_base; | |
33707 | + addr = page[pgd_index(__START_KERNEL_map)]; | |
33708 | + addr_to_page(addr, page); | |
33709 | + addr = page[pud_index(__START_KERNEL_map)]; | |
33710 | + addr_to_page(addr, page); | |
33711 | + | |
33712 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
33713 | + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER | |
33714 | + in kernel PTEs. We check that here. */ | |
33715 | + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) { | |
33716 | + unsigned long *pg; | |
33717 | + pte_t pte; | |
33718 | + | |
33719 | + /* Mess with the initial mapping of page 0. It's not needed. */ | |
33720 | + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map); | |
33721 | + addr = page[pmd_index(__START_KERNEL_map)]; | |
33722 | + addr_to_page(addr, pg); | |
33723 | + pte.pte = pg[pte_index(__START_KERNEL_map)]; | |
33724 | + BUG_ON(!(pte.pte & _PAGE_PRESENT)); | |
33725 | + | |
33726 | + /* If _PAGE_USER isn't set, we obviously do not need it. */ | |
33727 | + if (pte.pte & _PAGE_USER) { | |
33728 | + /* _PAGE_USER is needed, but is it set implicitly? */ | |
33729 | + pte.pte &= ~_PAGE_USER; | |
33730 | + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map, | |
33731 | + pte, 0) != 0) || | |
33732 | + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER)) | |
33733 | + /* We need to explicitly specify _PAGE_USER. */ | |
33734 | + __kernel_page_user = _PAGE_USER; | |
33735 | + } | |
33736 | + } | |
33737 | +#endif | |
33738 | + | |
33739 | + /* Construct mapping of initial pte page in our own directories. */ | |
33740 | + init_level4_pgt[pgd_index(__START_KERNEL_map)] = | |
33741 | + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE); | |
33742 | + level3_kernel_pgt[pud_index(__START_KERNEL_map)] = | |
33743 | + __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE); | |
33744 | + memcpy(level2_kernel_pgt, page, PAGE_SIZE); | |
33745 | + | |
33746 | + __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] = | |
33747 | + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); | |
33748 | + | |
33749 | + early_make_page_readonly(init_level4_pgt, | |
33750 | + XENFEAT_writable_page_tables); | |
33751 | + early_make_page_readonly(__user_pgd(init_level4_pgt), | |
33752 | + XENFEAT_writable_page_tables); | |
33753 | + early_make_page_readonly(level3_kernel_pgt, | |
33754 | + XENFEAT_writable_page_tables); | |
33755 | + early_make_page_readonly(level3_user_pgt, | |
33756 | + XENFEAT_writable_page_tables); | |
33757 | + early_make_page_readonly(level2_kernel_pgt, | |
33758 | + XENFEAT_writable_page_tables); | |
33759 | + | |
33760 | + if (!xen_feature(XENFEAT_writable_page_tables)) { | |
33761 | + xen_pgd_pin(__pa_symbol(init_level4_pgt)); | |
33762 | + xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt))); | |
33763 | + } | |
33764 | +} | |
33765 | + | |
33766 | +static void __init extend_init_mapping(unsigned long tables_space) | |
33767 | +{ | |
33768 | + unsigned long va = __START_KERNEL_map; | |
33769 | + unsigned long phys, addr, *pte_page; | |
33770 | + pmd_t *pmd; | |
33771 | + pte_t *pte, new_pte; | |
33772 | + unsigned long *page = (unsigned long *)init_level4_pgt; | |
33773 | + | |
33774 | + addr = page[pgd_index(va)]; | |
33775 | + addr_to_page(addr, page); | |
33776 | + addr = page[pud_index(va)]; | |
33777 | + addr_to_page(addr, page); | |
33778 | + | |
33779 | + /* Kill mapping of low 1MB. */ | |
33780 | + while (va < (unsigned long)&_text) { | |
33781 | + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) | |
33782 | + BUG(); | |
33783 | + va += PAGE_SIZE; | |
33784 | + } | |
33785 | + | |
33786 | + /* Ensure init mappings cover kernel text/data and initial tables. */ | |
33787 | + while (va < (__START_KERNEL_map | |
33788 | + + (start_pfn << PAGE_SHIFT) | |
33789 | + + tables_space)) { | |
33790 | + pmd = (pmd_t *)&page[pmd_index(va)]; | |
33791 | + if (pmd_none(*pmd)) { | |
33792 | + pte_page = alloc_static_page(&phys); | |
33793 | + early_make_page_readonly( | |
33794 | + pte_page, XENFEAT_writable_page_tables); | |
33795 | + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE)); | |
33796 | + } else { | |
33797 | + addr = page[pmd_index(va)]; | |
33798 | + addr_to_page(addr, pte_page); | |
33799 | + } | |
33800 | + pte = (pte_t *)&pte_page[pte_index(va)]; | |
33801 | + if (pte_none(*pte)) { | |
33802 | + new_pte = pfn_pte( | |
33803 | + (va - __START_KERNEL_map) >> PAGE_SHIFT, | |
33804 | + __pgprot(_KERNPG_TABLE)); | |
33805 | + xen_l1_entry_update(pte, new_pte); | |
33806 | + } | |
33807 | + va += PAGE_SIZE; | |
33808 | + } | |
33809 | + | |
33810 | + /* Finally, blow away any spurious initial mappings. */ | |
33811 | + while (1) { | |
33812 | + pmd = (pmd_t *)&page[pmd_index(va)]; | |
33813 | + if (pmd_none(*pmd)) | |
33814 | + break; | |
33815 | + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) | |
33816 | + BUG(); | |
33817 | + va += PAGE_SIZE; | |
33818 | + } | |
33819 | +} | |
33820 | + | |
33821 | +static void __init find_early_table_space(unsigned long end) | |
33822 | +{ | |
33823 | + unsigned long puds, pmds, ptes, tables; | |
33824 | + | |
33825 | + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | |
33826 | + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | |
33827 | + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT; | |
33828 | + | |
33829 | + tables = round_up(puds * 8, PAGE_SIZE) + | |
33830 | + round_up(pmds * 8, PAGE_SIZE) + | |
33831 | + round_up(ptes * 8, PAGE_SIZE); | |
33832 | + | |
33833 | + extend_init_mapping(tables); | |
33834 | + | |
33835 | + table_start = start_pfn; | |
33836 | + table_end = table_start + (tables>>PAGE_SHIFT); | |
33837 | + | |
33838 | + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", | |
33839 | + end, table_start << PAGE_SHIFT, | |
33840 | + (table_start << PAGE_SHIFT) + tables); | |
33841 | +} | |
33842 | + | |
33843 | +static void xen_finish_init_mapping(void) | |
33844 | +{ | |
33845 | + unsigned long i, start, end; | |
33846 | + | |
33847 | + /* Re-vector virtual addresses pointing into the initial | |
33848 | + mapping to the just-established permanent ones. */ | |
33849 | + xen_start_info = __va(__pa(xen_start_info)); | |
33850 | + xen_start_info->pt_base = (unsigned long) | |
33851 | + __va(__pa(xen_start_info->pt_base)); | |
33852 | + if (!xen_feature(XENFEAT_auto_translated_physmap)) { | |
33853 | + phys_to_machine_mapping = | |
33854 | + __va(__pa(xen_start_info->mfn_list)); | |
33855 | + xen_start_info->mfn_list = (unsigned long) | |
33856 | + phys_to_machine_mapping; | |
33857 | + } | |
33858 | + if (xen_start_info->mod_start) | |
33859 | + xen_start_info->mod_start = (unsigned long) | |
33860 | + __va(__pa(xen_start_info->mod_start)); | |
33861 | + | |
33862 | + /* Destroy the Xen-created mappings beyond the kernel image as | |
33863 | + * well as the temporary mappings created above. Prevents | |
33864 | + * overlap with modules area (if init mapping is very big). | |
33865 | + */ | |
33866 | + start = PAGE_ALIGN((unsigned long)_end); | |
33867 | + end = __START_KERNEL_map + (table_end << PAGE_SHIFT); | |
33868 | + for (; start < end; start += PAGE_SIZE) | |
33869 | + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0)) | |
33870 | + BUG(); | |
33871 | + | |
33872 | + /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */ | |
33873 | + table_end = ~0UL; | |
33874 | + | |
33875 | + /* | |
33876 | + * Prefetch pte's for the bt_ioremap() area. It gets used before the | |
33877 | + * boot-time allocator is online, so allocate-on-demand would fail. | |
33878 | + */ | |
33879 | + for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++) | |
33880 | + __set_fixmap(i, 0, __pgprot(0)); | |
33881 | + | |
33882 | + /* Switch to the real shared_info page, and clear the dummy page. */ | |
33883 | + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); | |
33884 | + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); | |
33885 | + memset(empty_zero_page, 0, sizeof(empty_zero_page)); | |
33886 | + | |
33887 | + /* Set up mapping of lowest 1MB of physical memory. */ | |
33888 | + for (i = 0; i < NR_FIX_ISAMAPS; i++) | |
33889 | + if (is_initial_xendomain()) | |
33890 | + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); | |
33891 | + else | |
33892 | + __set_fixmap(FIX_ISAMAP_BEGIN - i, | |
33893 | + virt_to_mfn(empty_zero_page) | |
33894 | + << PAGE_SHIFT, | |
33895 | + PAGE_KERNEL_RO); | |
33896 | + | |
33897 | + /* Disable the 'start_pfn' allocator. */ | |
33898 | + table_end = start_pfn; | |
33899 | +} | |
33900 | + | |
33901 | +/* Setup the direct mapping of the physical memory at PAGE_OFFSET. | |
33902 | + This runs before bootmem is initialized and gets pages directly from the | |
33903 | + physical memory. To access them they are temporarily mapped. */ | |
33904 | +void __meminit init_memory_mapping(unsigned long start, unsigned long end) | |
33905 | +{ | |
33906 | + unsigned long next; | |
33907 | + | |
33908 | + Dprintk("init_memory_mapping\n"); | |
33909 | + | |
33910 | + /* | |
33911 | + * Find space for the kernel direct mapping tables. | |
33912 | + * Later we should allocate these tables in the local node of the memory | |
33913 | + * mapped. Unfortunately this is done currently before the nodes are | |
33914 | + * discovered. | |
33915 | + */ | |
33916 | + if (!after_bootmem) | |
33917 | + find_early_table_space(end); | |
33918 | + | |
33919 | + start = (unsigned long)__va(start); | |
33920 | + end = (unsigned long)__va(end); | |
33921 | + | |
33922 | + for (; start < end; start = next) { | |
33923 | + unsigned long pud_phys; | |
33924 | + pgd_t *pgd = pgd_offset_k(start); | |
33925 | + pud_t *pud; | |
33926 | + | |
33927 | + if (after_bootmem) | |
33928 | + pud = pud_offset(pgd, start & PGDIR_MASK); | |
33929 | + else | |
33930 | + pud = alloc_static_page(&pud_phys); | |
33931 | + next = start + PGDIR_SIZE; | |
33932 | + if (next > end) | |
33933 | + next = end; | |
33934 | + phys_pud_init(pud, __pa(start), __pa(next)); | |
33935 | + if (!after_bootmem) { | |
33936 | + early_make_page_readonly(pud, XENFEAT_writable_page_tables); | |
33937 | + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | |
33938 | + } | |
33939 | + } | |
33940 | + | |
33941 | + if (!after_bootmem) { | |
33942 | + BUG_ON(start_pfn != table_end); | |
33943 | + xen_finish_init_mapping(); | |
33944 | + } | |
33945 | + | |
33946 | + __flush_tlb_all(); | |
33947 | +} | |
33948 | + | |
33949 | +void __cpuinit zap_low_mappings(int cpu) | |
33950 | +{ | |
33951 | + /* this is not required for Xen */ | |
33952 | +#if 0 | |
33953 | + swap_low_mappings(); | |
33954 | +#endif | |
33955 | +} | |
33956 | + | |
33957 | +/* Compute zone sizes for the DMA and DMA32 zones in a node. */ | |
33958 | +__init void | |
33959 | +size_zones(unsigned long *z, unsigned long *h, | |
33960 | + unsigned long start_pfn, unsigned long end_pfn) | |
33961 | +{ | |
33962 | + int i; | |
33963 | + unsigned long w; | |
33964 | + | |
33965 | + for (i = 0; i < MAX_NR_ZONES; i++) | |
33966 | + z[i] = 0; | |
33967 | + | |
33968 | + if (start_pfn < MAX_DMA_PFN) | |
33969 | + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; | |
33970 | + if (start_pfn < MAX_DMA32_PFN) { | |
33971 | + unsigned long dma32_pfn = MAX_DMA32_PFN; | |
33972 | + if (dma32_pfn > end_pfn) | |
33973 | + dma32_pfn = end_pfn; | |
33974 | + z[ZONE_DMA32] = dma32_pfn - start_pfn; | |
33975 | + } | |
33976 | + z[ZONE_NORMAL] = end_pfn - start_pfn; | |
33977 | + | |
33978 | + /* Remove lower zones from higher ones. */ | |
33979 | + w = 0; | |
33980 | + for (i = 0; i < MAX_NR_ZONES; i++) { | |
33981 | + if (z[i]) | |
33982 | + z[i] -= w; | |
33983 | + w += z[i]; | |
33984 | + } | |
33985 | + | |
33986 | + /* Compute holes */ | |
33987 | + w = start_pfn; | |
33988 | + for (i = 0; i < MAX_NR_ZONES; i++) { | |
33989 | + unsigned long s = w; | |
33990 | + w += z[i]; | |
33991 | + h[i] = e820_hole_size(s, w); | |
33992 | + } | |
33993 | + | |
33994 | + /* Add the space pace needed for mem_map to the holes too. */ | |
33995 | + for (i = 0; i < MAX_NR_ZONES; i++) | |
33996 | + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; | |
33997 | + | |
33998 | + /* The 16MB DMA zone has the kernel and other misc mappings. | |
33999 | + Account them too */ | |
34000 | + if (h[ZONE_DMA]) { | |
34001 | + h[ZONE_DMA] += dma_reserve; | |
34002 | + if (h[ZONE_DMA] >= z[ZONE_DMA]) { | |
34003 | + printk(KERN_WARNING | |
34004 | + "Kernel too large and filling up ZONE_DMA?\n"); | |
34005 | + h[ZONE_DMA] = z[ZONE_DMA]; | |
34006 | + } | |
34007 | + } | |
34008 | +} | |
34009 | + | |
34010 | +#ifndef CONFIG_NUMA | |
34011 | +void __init paging_init(void) | |
34012 | +{ | |
34013 | + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; | |
34014 | + | |
34015 | + memory_present(0, 0, end_pfn); | |
34016 | + sparse_init(); | |
34017 | + size_zones(zones, holes, 0, end_pfn); | |
34018 | + free_area_init_node(0, NODE_DATA(0), zones, | |
34019 | + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); | |
34020 | + | |
34021 | + init_mm.context.pinned = 1; | |
34022 | +} | |
34023 | +#endif | |
34024 | + | |
34025 | +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | |
34026 | + from the CPU leading to inconsistent cache lines. address and size | |
34027 | + must be aligned to 2MB boundaries. | |
34028 | + Does nothing when the mapping doesn't exist. */ | |
34029 | +void __init clear_kernel_mapping(unsigned long address, unsigned long size) | |
34030 | +{ | |
34031 | + unsigned long end = address + size; | |
34032 | + | |
34033 | + BUG_ON(address & ~LARGE_PAGE_MASK); | |
34034 | + BUG_ON(size & ~LARGE_PAGE_MASK); | |
34035 | + | |
34036 | + for (; address < end; address += LARGE_PAGE_SIZE) { | |
34037 | + pgd_t *pgd = pgd_offset_k(address); | |
34038 | + pud_t *pud; | |
34039 | + pmd_t *pmd; | |
34040 | + if (pgd_none(*pgd)) | |
34041 | + continue; | |
34042 | + pud = pud_offset(pgd, address); | |
34043 | + if (pud_none(*pud)) | |
34044 | + continue; | |
34045 | + pmd = pmd_offset(pud, address); | |
34046 | + if (!pmd || pmd_none(*pmd)) | |
34047 | + continue; | |
34048 | + if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) { | |
34049 | + /* Could handle this, but it should not happen currently. */ | |
34050 | + printk(KERN_ERR | |
34051 | + "clear_kernel_mapping: mapping has been split. will leak memory\n"); | |
34052 | + pmd_ERROR(*pmd); | |
34053 | + } | |
34054 | + set_pmd(pmd, __pmd(0)); | |
34055 | + } | |
34056 | + __flush_tlb_all(); | |
34057 | +} | |
34058 | + | |
34059 | +/* | |
34060 | + * Memory hotplug specific functions | |
34061 | + */ | |
34062 | +void online_page(struct page *page) | |
34063 | +{ | |
34064 | + ClearPageReserved(page); | |
34065 | + init_page_count(page); | |
34066 | + __free_page(page); | |
34067 | + totalram_pages++; | |
34068 | + num_physpages++; | |
34069 | +} | |
34070 | + | |
34071 | +#ifdef CONFIG_MEMORY_HOTPLUG | |
34072 | +/* | |
34073 | + * XXX: memory_add_physaddr_to_nid() is to find node id from physical address | |
34074 | + * via probe interface of sysfs. If acpi notifies hot-add event, then it | |
34075 | + * can tell node id by searching dsdt. But, probe interface doesn't have | |
34076 | + * node id. So, return 0 as node id at this time. | |
34077 | + */ | |
34078 | +#ifdef CONFIG_NUMA | |
34079 | +int memory_add_physaddr_to_nid(u64 start) | |
34080 | +{ | |
34081 | + return 0; | |
34082 | +} | |
34083 | +#endif | |
34084 | + | |
34085 | +/* | |
34086 | + * Memory is added always to NORMAL zone. This means you will never get | |
34087 | + * additional DMA/DMA32 memory. | |
34088 | + */ | |
34089 | +int arch_add_memory(int nid, u64 start, u64 size) | |
34090 | +{ | |
34091 | + struct pglist_data *pgdat = NODE_DATA(nid); | |
34092 | + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; | |
34093 | + unsigned long start_pfn = start >> PAGE_SHIFT; | |
34094 | + unsigned long nr_pages = size >> PAGE_SHIFT; | |
34095 | + int ret; | |
34096 | + | |
34097 | + ret = __add_pages(zone, start_pfn, nr_pages); | |
34098 | + if (ret) | |
34099 | + goto error; | |
34100 | + | |
34101 | + init_memory_mapping(start, (start + size -1)); | |
34102 | + | |
34103 | + return ret; | |
34104 | +error: | |
34105 | + printk("%s: Problem encountered in __add_pages!\n", __func__); | |
34106 | + return ret; | |
34107 | +} | |
34108 | +EXPORT_SYMBOL_GPL(arch_add_memory); | |
34109 | + | |
34110 | +int remove_memory(u64 start, u64 size) | |
34111 | +{ | |
34112 | + return -EINVAL; | |
34113 | +} | |
34114 | +EXPORT_SYMBOL_GPL(remove_memory); | |
34115 | + | |
34116 | +#else /* CONFIG_MEMORY_HOTPLUG */ | |
34117 | +/* | |
34118 | + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, | |
34119 | + * just online the pages. | |
34120 | + */ | |
34121 | +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) | |
34122 | +{ | |
34123 | + int err = -EIO; | |
34124 | + unsigned long pfn; | |
34125 | + unsigned long total = 0, mem = 0; | |
34126 | + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { | |
34127 | + if (pfn_valid(pfn)) { | |
34128 | + online_page(pfn_to_page(pfn)); | |
34129 | + err = 0; | |
34130 | + mem++; | |
34131 | + } | |
34132 | + total++; | |
34133 | + } | |
34134 | + if (!err) { | |
34135 | + z->spanned_pages += total; | |
34136 | + z->present_pages += mem; | |
34137 | + z->zone_pgdat->node_spanned_pages += total; | |
34138 | + z->zone_pgdat->node_present_pages += mem; | |
34139 | + } | |
34140 | + return err; | |
34141 | +} | |
34142 | +#endif /* CONFIG_MEMORY_HOTPLUG */ | |
34143 | + | |
34144 | +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | |
34145 | + kcore_vsyscall; | |
34146 | + | |
34147 | +void __init mem_init(void) | |
34148 | +{ | |
34149 | + long codesize, reservedpages, datasize, initsize; | |
34150 | + unsigned long pfn; | |
34151 | + | |
34152 | + pci_iommu_alloc(); | |
34153 | + | |
34154 | + /* How many end-of-memory variables you have, grandma! */ | |
34155 | + max_low_pfn = end_pfn; | |
34156 | + max_pfn = end_pfn; | |
34157 | + num_physpages = end_pfn; | |
34158 | + high_memory = (void *) __va(end_pfn * PAGE_SIZE); | |
34159 | + | |
34160 | + /* clear the zero-page */ | |
34161 | + memset(empty_zero_page, 0, PAGE_SIZE); | |
34162 | + | |
34163 | + reservedpages = 0; | |
34164 | + | |
34165 | + /* this will put all low memory onto the freelists */ | |
34166 | +#ifdef CONFIG_NUMA | |
34167 | + totalram_pages = numa_free_all_bootmem(); | |
34168 | +#else | |
34169 | + totalram_pages = free_all_bootmem(); | |
34170 | +#endif | |
34171 | + /* XEN: init and count pages outside initial allocation. */ | |
34172 | + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { | |
34173 | + ClearPageReserved(pfn_to_page(pfn)); | |
34174 | + init_page_count(pfn_to_page(pfn)); | |
34175 | + totalram_pages++; | |
34176 | + } | |
34177 | + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); | |
34178 | + | |
34179 | + after_bootmem = 1; | |
34180 | + | |
34181 | + codesize = (unsigned long) &_etext - (unsigned long) &_text; | |
34182 | + datasize = (unsigned long) &_edata - (unsigned long) &_etext; | |
34183 | + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | |
34184 | + | |
34185 | + /* Register memory areas for /proc/kcore */ | |
34186 | + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | |
34187 | + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | |
34188 | + VMALLOC_END-VMALLOC_START); | |
34189 | + kclist_add(&kcore_kernel, &_stext, _end - _stext); | |
34190 | + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | |
34191 | + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | |
34192 | + VSYSCALL_END - VSYSCALL_START); | |
34193 | + | |
34194 | + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", | |
34195 | + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | |
34196 | + end_pfn << (PAGE_SHIFT-10), | |
34197 | + codesize >> 10, | |
34198 | + reservedpages << (PAGE_SHIFT-10), | |
34199 | + datasize >> 10, | |
34200 | + initsize >> 10); | |
34201 | + | |
34202 | +#ifndef CONFIG_XEN | |
34203 | +#ifdef CONFIG_SMP | |
34204 | + /* | |
34205 | + * Sync boot_level4_pgt mappings with the init_level4_pgt | |
34206 | + * except for the low identity mappings which are already zapped | |
34207 | + * in init_level4_pgt. This sync-up is essential for AP's bringup | |
34208 | + */ | |
34209 | + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); | |
34210 | +#endif | |
34211 | +#endif | |
34212 | +} | |
34213 | + | |
34214 | +void free_init_pages(char *what, unsigned long begin, unsigned long end) | |
34215 | +{ | |
34216 | + unsigned long addr; | |
34217 | + | |
34218 | + if (begin >= end) | |
34219 | + return; | |
34220 | + | |
34221 | + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); | |
34222 | + for (addr = begin; addr < end; addr += PAGE_SIZE) { | |
34223 | + ClearPageReserved(virt_to_page(addr)); | |
34224 | + init_page_count(virt_to_page(addr)); | |
34225 | + memset((void *)(addr & ~(PAGE_SIZE-1)), | |
34226 | + POISON_FREE_INITMEM, PAGE_SIZE); | |
34227 | + if (addr >= __START_KERNEL_map) { | |
34228 | + /* make_readonly() reports all kernel addresses. */ | |
34229 | + __make_page_writable(__va(__pa(addr))); | |
34230 | + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { | |
34231 | + pgd_t *pgd = pgd_offset_k(addr); | |
34232 | + pud_t *pud = pud_offset(pgd, addr); | |
34233 | + pmd_t *pmd = pmd_offset(pud, addr); | |
34234 | + pte_t *pte = pte_offset_kernel(pmd, addr); | |
34235 | + | |
34236 | + xen_l1_entry_update(pte, __pte(0)); /* fallback */ | |
34237 | + } | |
34238 | + } | |
34239 | + free_page(addr); | |
34240 | + totalram_pages++; | |
34241 | + } | |
34242 | +} | |
34243 | + | |
34244 | +void free_initmem(void) | |
34245 | +{ | |
34246 | + memset(__initdata_begin, POISON_FREE_INITDATA, | |
34247 | + __initdata_end - __initdata_begin); | |
34248 | + free_init_pages("unused kernel memory", | |
34249 | + (unsigned long)(&__init_begin), | |
34250 | + (unsigned long)(&__init_end)); | |
34251 | +} | |
34252 | + | |
34253 | +#ifdef CONFIG_DEBUG_RODATA | |
34254 | + | |
34255 | +void mark_rodata_ro(void) | |
34256 | +{ | |
34257 | + unsigned long addr = (unsigned long)__start_rodata; | |
34258 | + | |
34259 | + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) | |
34260 | + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); | |
34261 | + | |
34262 | + printk ("Write protecting the kernel read-only data: %luk\n", | |
34263 | + (__end_rodata - __start_rodata) >> 10); | |
34264 | + | |
34265 | + /* | |
34266 | + * change_page_attr_addr() requires a global_flush_tlb() call after it. | |
34267 | + * We do this after the printk so that if something went wrong in the | |
34268 | + * change, the printk gets out at least to give a better debug hint | |
34269 | + * of who is the culprit. | |
34270 | + */ | |
34271 | + global_flush_tlb(); | |
34272 | +} | |
34273 | +#endif | |
34274 | + | |
34275 | +#ifdef CONFIG_BLK_DEV_INITRD | |
34276 | +void free_initrd_mem(unsigned long start, unsigned long end) | |
34277 | +{ | |
34278 | + free_init_pages("initrd memory", start, end); | |
34279 | +} | |
34280 | +#endif | |
34281 | + | |
34282 | +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | |
34283 | +{ | |
34284 | + /* Should check here against the e820 map to avoid double free */ | |
34285 | +#ifdef CONFIG_NUMA | |
34286 | + int nid = phys_to_nid(phys); | |
34287 | + reserve_bootmem_node(NODE_DATA(nid), phys, len); | |
34288 | +#else | |
34289 | + reserve_bootmem(phys, len); | |
34290 | +#endif | |
34291 | + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) | |
34292 | + dma_reserve += len / PAGE_SIZE; | |
34293 | +} | |
34294 | + | |
34295 | +int kern_addr_valid(unsigned long addr) | |
34296 | +{ | |
34297 | + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | |
34298 | + pgd_t *pgd; | |
34299 | + pud_t *pud; | |
34300 | + pmd_t *pmd; | |
34301 | + pte_t *pte; | |
34302 | + | |
34303 | + if (above != 0 && above != -1UL) | |
34304 | + return 0; | |
34305 | + | |
34306 | + pgd = pgd_offset_k(addr); | |
34307 | + if (pgd_none(*pgd)) | |
34308 | + return 0; | |
34309 | + | |
34310 | + pud = pud_offset(pgd, addr); | |
34311 | + if (pud_none(*pud)) | |
34312 | + return 0; | |
34313 | + | |
34314 | + pmd = pmd_offset(pud, addr); | |
34315 | + if (pmd_none(*pmd)) | |
34316 | + return 0; | |
34317 | + if (pmd_large(*pmd)) | |
34318 | + return pfn_valid(pmd_pfn(*pmd)); | |
34319 | + | |
34320 | + pte = pte_offset_kernel(pmd, addr); | |
34321 | + if (pte_none(*pte)) | |
34322 | + return 0; | |
34323 | + return pfn_valid(pte_pfn(*pte)); | |
34324 | +} | |
34325 | + | |
34326 | +#ifdef CONFIG_SYSCTL | |
34327 | +#include <linux/sysctl.h> | |
34328 | + | |
34329 | +extern int exception_trace, page_fault_trace; | |
34330 | + | |
34331 | +static ctl_table debug_table2[] = { | |
34332 | + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, | |
34333 | + proc_dointvec }, | |
34334 | + { 0, } | |
34335 | +}; | |
34336 | + | |
34337 | +static ctl_table debug_root_table2[] = { | |
34338 | + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, | |
34339 | + .child = debug_table2 }, | |
34340 | + { 0 }, | |
34341 | +}; | |
34342 | + | |
34343 | +static __init int x8664_sysctl_init(void) | |
34344 | +{ | |
34345 | + register_sysctl_table(debug_root_table2, 1); | |
34346 | + return 0; | |
34347 | +} | |
34348 | +__initcall(x8664_sysctl_init); | |
34349 | +#endif | |
34350 | + | |
34351 | +/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only | |
34352 | + covers the 64bit vsyscall page now. 32bit has a real VMA now and does | |
34353 | + not need special handling anymore. */ | |
34354 | + | |
34355 | +static struct vm_area_struct gate_vma = { | |
34356 | + .vm_start = VSYSCALL_START, | |
34357 | + .vm_end = VSYSCALL_END, | |
34358 | + .vm_page_prot = PAGE_READONLY | |
34359 | +}; | |
34360 | + | |
34361 | +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |
34362 | +{ | |
34363 | +#ifdef CONFIG_IA32_EMULATION | |
34364 | + if (test_tsk_thread_flag(tsk, TIF_IA32)) | |
34365 | + return NULL; | |
34366 | +#endif | |
34367 | + return &gate_vma; | |
34368 | +} | |
34369 | + | |
34370 | +int in_gate_area(struct task_struct *task, unsigned long addr) | |
34371 | +{ | |
34372 | + struct vm_area_struct *vma = get_gate_vma(task); | |
34373 | + if (!vma) | |
34374 | + return 0; | |
34375 | + return (addr >= vma->vm_start) && (addr < vma->vm_end); | |
34376 | +} | |
34377 | + | |
34378 | +/* Use this when you have no reliable task/vma, typically from interrupt | |
34379 | + * context. It is less reliable than using the task's vma and may give | |
34380 | + * false positives. | |
34381 | + */ | |
34382 | +int in_gate_area_no_task(unsigned long addr) | |
34383 | +{ | |
34384 | + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); | |
34385 | +} | |
34386 | Index: head-2008-11-25/arch/x86/mm/pageattr_64-xen.c | |
34387 | =================================================================== | |
34388 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
34389 | +++ head-2008-11-25/arch/x86/mm/pageattr_64-xen.c 2008-07-21 11:00:32.000000000 +0200 | |
34390 | @@ -0,0 +1,502 @@ | |
34391 | +/* | |
34392 | + * Copyright 2002 Andi Kleen, SuSE Labs. | |
34393 | + * Thanks to Ben LaHaise for precious feedback. | |
34394 | + */ | |
34395 | + | |
34396 | +#include <linux/mm.h> | |
34397 | +#include <linux/sched.h> | |
34398 | +#include <linux/highmem.h> | |
34399 | +#include <linux/module.h> | |
34400 | +#include <linux/slab.h> | |
34401 | +#include <asm/uaccess.h> | |
34402 | +#include <asm/processor.h> | |
34403 | +#include <asm/tlbflush.h> | |
34404 | +#include <asm/io.h> | |
34405 | + | |
34406 | +#ifdef CONFIG_XEN | |
34407 | +#include <asm/pgalloc.h> | |
34408 | +#include <asm/mmu_context.h> | |
34409 | + | |
34410 | +LIST_HEAD(mm_unpinned); | |
34411 | +DEFINE_SPINLOCK(mm_unpinned_lock); | |
34412 | + | |
34413 | +static void _pin_lock(struct mm_struct *mm, int lock) { | |
34414 | + if (lock) | |
34415 | + spin_lock(&mm->page_table_lock); | |
34416 | +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | |
34417 | + /* While mm->page_table_lock protects us against insertions and | |
34418 | + * removals of higher level page table pages, it doesn't protect | |
34419 | + * against updates of pte-s. Such updates, however, require the | |
34420 | + * pte pages to be in consistent state (unpinned+writable or | |
34421 | + * pinned+readonly). The pinning and attribute changes, however | |
34422 | + * cannot be done atomically, which is why such updates must be | |
34423 | + * prevented from happening concurrently. | |
34424 | + * Note that no pte lock can ever elsewhere be acquired nesting | |
34425 | + * with an already acquired one in the same mm, or with the mm's | |
34426 | + * page_table_lock already acquired, as that would break in the | |
34427 | + * non-split case (where all these are actually resolving to the | |
34428 | + * one page_table_lock). Thus acquiring all of them here is not | |
34429 | + * going to result in dead locks, and the order of acquires | |
34430 | + * doesn't matter. | |
34431 | + */ | |
34432 | + { | |
34433 | + pgd_t *pgd = mm->pgd; | |
34434 | + unsigned g; | |
34435 | + | |
34436 | + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
34437 | + pud_t *pud; | |
34438 | + unsigned u; | |
34439 | + | |
34440 | + if (pgd_none(*pgd)) | |
34441 | + continue; | |
34442 | + pud = pud_offset(pgd, 0); | |
34443 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
34444 | + pmd_t *pmd; | |
34445 | + unsigned m; | |
34446 | + | |
34447 | + if (pud_none(*pud)) | |
34448 | + continue; | |
34449 | + pmd = pmd_offset(pud, 0); | |
34450 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
34451 | + spinlock_t *ptl; | |
34452 | + | |
34453 | + if (pmd_none(*pmd)) | |
34454 | + continue; | |
34455 | + ptl = pte_lockptr(0, pmd); | |
34456 | + if (lock) | |
34457 | + spin_lock(ptl); | |
34458 | + else | |
34459 | + spin_unlock(ptl); | |
34460 | + } | |
34461 | + } | |
34462 | + } | |
34463 | + } | |
34464 | +#endif | |
34465 | + if (!lock) | |
34466 | + spin_unlock(&mm->page_table_lock); | |
34467 | +} | |
34468 | +#define pin_lock(mm) _pin_lock(mm, 1) | |
34469 | +#define pin_unlock(mm) _pin_lock(mm, 0) | |
34470 | + | |
34471 | +#define PIN_BATCH 8 | |
34472 | +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); | |
34473 | + | |
34474 | +static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags, | |
34475 | + unsigned int cpu, unsigned int seq) | |
34476 | +{ | |
34477 | + struct page *page = virt_to_page(pt); | |
34478 | + unsigned long pfn = page_to_pfn(page); | |
34479 | + | |
34480 | + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, | |
34481 | + (unsigned long)__va(pfn << PAGE_SHIFT), | |
34482 | + pfn_pte(pfn, flags), 0); | |
34483 | + if (unlikely(++seq == PIN_BATCH)) { | |
34484 | + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), | |
34485 | + PIN_BATCH, NULL))) | |
34486 | + BUG(); | |
34487 | + seq = 0; | |
34488 | + } | |
34489 | + | |
34490 | + return seq; | |
34491 | +} | |
34492 | + | |
34493 | +static void mm_walk(struct mm_struct *mm, pgprot_t flags) | |
34494 | +{ | |
34495 | + pgd_t *pgd; | |
34496 | + pud_t *pud; | |
34497 | + pmd_t *pmd; | |
34498 | + pte_t *pte; | |
34499 | + int g,u,m; | |
34500 | + unsigned int cpu, seq; | |
34501 | + multicall_entry_t *mcl; | |
34502 | + | |
34503 | + pgd = mm->pgd; | |
34504 | + cpu = get_cpu(); | |
34505 | + | |
34506 | + /* | |
34507 | + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not | |
34508 | + * be the 'current' task's pagetables (e.g., current may be 32-bit, | |
34509 | + * but the pagetables may be for a 64-bit task). | |
34510 | + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct | |
34511 | + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. | |
34512 | + */ | |
34513 | + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { | |
34514 | + if (pgd_none(*pgd)) | |
34515 | + continue; | |
34516 | + pud = pud_offset(pgd, 0); | |
34517 | + if (PTRS_PER_PUD > 1) /* not folded */ | |
34518 | + seq = mm_walk_set_prot(pud,flags,cpu,seq); | |
34519 | + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | |
34520 | + if (pud_none(*pud)) | |
34521 | + continue; | |
34522 | + pmd = pmd_offset(pud, 0); | |
34523 | + if (PTRS_PER_PMD > 1) /* not folded */ | |
34524 | + seq = mm_walk_set_prot(pmd,flags,cpu,seq); | |
34525 | + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | |
34526 | + if (pmd_none(*pmd)) | |
34527 | + continue; | |
34528 | + pte = pte_offset_kernel(pmd,0); | |
34529 | + seq = mm_walk_set_prot(pte,flags,cpu,seq); | |
34530 | + } | |
34531 | + } | |
34532 | + } | |
34533 | + | |
34534 | + mcl = per_cpu(pb_mcl, cpu); | |
34535 | + if (unlikely(seq > PIN_BATCH - 2)) { | |
34536 | + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) | |
34537 | + BUG(); | |
34538 | + seq = 0; | |
34539 | + } | |
34540 | + MULTI_update_va_mapping(mcl + seq, | |
34541 | + (unsigned long)__user_pgd(mm->pgd), | |
34542 | + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags), | |
34543 | + 0); | |
34544 | + MULTI_update_va_mapping(mcl + seq + 1, | |
34545 | + (unsigned long)mm->pgd, | |
34546 | + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags), | |
34547 | + UVMF_TLB_FLUSH); | |
34548 | + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) | |
34549 | + BUG(); | |
34550 | + | |
34551 | + put_cpu(); | |
34552 | +} | |
34553 | + | |
34554 | +void mm_pin(struct mm_struct *mm) | |
34555 | +{ | |
34556 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
34557 | + return; | |
34558 | + | |
34559 | + pin_lock(mm); | |
34560 | + | |
34561 | + mm_walk(mm, PAGE_KERNEL_RO); | |
34562 | + xen_pgd_pin(__pa(mm->pgd)); /* kernel */ | |
34563 | + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */ | |
34564 | + mm->context.pinned = 1; | |
34565 | + spin_lock(&mm_unpinned_lock); | |
34566 | + list_del(&mm->context.unpinned); | |
34567 | + spin_unlock(&mm_unpinned_lock); | |
34568 | + | |
34569 | + pin_unlock(mm); | |
34570 | +} | |
34571 | + | |
34572 | +void mm_unpin(struct mm_struct *mm) | |
34573 | +{ | |
34574 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
34575 | + return; | |
34576 | + | |
34577 | + pin_lock(mm); | |
34578 | + | |
34579 | + xen_pgd_unpin(__pa(mm->pgd)); | |
34580 | + xen_pgd_unpin(__pa(__user_pgd(mm->pgd))); | |
34581 | + mm_walk(mm, PAGE_KERNEL); | |
34582 | + mm->context.pinned = 0; | |
34583 | + spin_lock(&mm_unpinned_lock); | |
34584 | + list_add(&mm->context.unpinned, &mm_unpinned); | |
34585 | + spin_unlock(&mm_unpinned_lock); | |
34586 | + | |
34587 | + pin_unlock(mm); | |
34588 | +} | |
34589 | + | |
34590 | +void mm_pin_all(void) | |
34591 | +{ | |
34592 | + if (xen_feature(XENFEAT_writable_page_tables)) | |
34593 | + return; | |
34594 | + | |
34595 | + /* | |
34596 | + * Allow uninterrupted access to the mm_unpinned list. We don't | |
34597 | + * actually take the mm_unpinned_lock as it is taken inside mm_pin(). | |
34598 | + * All other CPUs must be at a safe point (e.g., in stop_machine | |
34599 | + * or offlined entirely). | |
34600 | + */ | |
34601 | + preempt_disable(); | |
34602 | + while (!list_empty(&mm_unpinned)) | |
34603 | + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, | |
34604 | + context.unpinned)); | |
34605 | + preempt_enable(); | |
34606 | +} | |
34607 | + | |
34608 | +void _arch_dup_mmap(struct mm_struct *mm) | |
34609 | +{ | |
34610 | + if (!mm->context.pinned) | |
34611 | + mm_pin(mm); | |
34612 | +} | |
34613 | + | |
34614 | +void _arch_exit_mmap(struct mm_struct *mm) | |
34615 | +{ | |
34616 | + struct task_struct *tsk = current; | |
34617 | + | |
34618 | + task_lock(tsk); | |
34619 | + | |
34620 | + /* | |
34621 | + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() | |
34622 | + * *much* faster this way, as no tlb flushes means bigger wrpt batches. | |
34623 | + */ | |
34624 | + if (tsk->active_mm == mm) { | |
34625 | + tsk->active_mm = &init_mm; | |
34626 | + atomic_inc(&init_mm.mm_count); | |
34627 | + | |
34628 | + switch_mm(mm, &init_mm, tsk); | |
34629 | + | |
34630 | + atomic_dec(&mm->mm_count); | |
34631 | + BUG_ON(atomic_read(&mm->mm_count) == 0); | |
34632 | + } | |
34633 | + | |
34634 | + task_unlock(tsk); | |
34635 | + | |
34636 | + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) && | |
34637 | + !mm->context.has_foreign_mappings ) | |
34638 | + mm_unpin(mm); | |
34639 | +} | |
34640 | + | |
34641 | +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | |
34642 | +{ | |
34643 | + struct page *pte; | |
34644 | + | |
34645 | + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | |
34646 | + if (pte) { | |
34647 | + SetPageForeign(pte, pte_free); | |
34648 | + init_page_count(pte); | |
34649 | + } | |
34650 | + return pte; | |
34651 | +} | |
34652 | + | |
34653 | +void pte_free(struct page *pte) | |
34654 | +{ | |
34655 | + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); | |
34656 | + | |
34657 | + if (!pte_write(*virt_to_ptep(va))) | |
34658 | + if (HYPERVISOR_update_va_mapping( | |
34659 | + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)) | |
34660 | + BUG(); | |
34661 | + | |
34662 | + ClearPageForeign(pte); | |
34663 | + init_page_count(pte); | |
34664 | + | |
34665 | + __free_page(pte); | |
34666 | +} | |
34667 | +#endif /* CONFIG_XEN */ | |
34668 | + | |
34669 | +pte_t *lookup_address(unsigned long address) | |
34670 | +{ | |
34671 | + pgd_t *pgd = pgd_offset_k(address); | |
34672 | + pud_t *pud; | |
34673 | + pmd_t *pmd; | |
34674 | + pte_t *pte; | |
34675 | + if (pgd_none(*pgd)) | |
34676 | + return NULL; | |
34677 | + pud = pud_offset(pgd, address); | |
34678 | + if (!pud_present(*pud)) | |
34679 | + return NULL; | |
34680 | + pmd = pmd_offset(pud, address); | |
34681 | + if (!pmd_present(*pmd)) | |
34682 | + return NULL; | |
34683 | + if (pmd_large(*pmd)) | |
34684 | + return (pte_t *)pmd; | |
34685 | + pte = pte_offset_kernel(pmd, address); | |
34686 | + if (pte && !pte_present(*pte)) | |
34687 | + pte = NULL; | |
34688 | + return pte; | |
34689 | +} | |
34690 | + | |
34691 | +static struct page *split_large_page(unsigned long address, pgprot_t prot, | |
34692 | + pgprot_t ref_prot) | |
34693 | +{ | |
34694 | + int i; | |
34695 | + unsigned long addr; | |
34696 | + struct page *base = alloc_pages(GFP_KERNEL, 0); | |
34697 | + pte_t *pbase; | |
34698 | + if (!base) | |
34699 | + return NULL; | |
34700 | + /* | |
34701 | + * page_private is used to track the number of entries in | |
34702 | + * the page table page have non standard attributes. | |
34703 | + */ | |
34704 | + SetPagePrivate(base); | |
34705 | + page_private(base) = 0; | |
34706 | + | |
34707 | + address = __pa(address); | |
34708 | + addr = address & LARGE_PAGE_MASK; | |
34709 | + pbase = (pte_t *)page_address(base); | |
34710 | + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | |
34711 | + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | |
34712 | + addr == address ? prot : ref_prot); | |
34713 | + } | |
34714 | + return base; | |
34715 | +} | |
34716 | + | |
34717 | + | |
34718 | +static void flush_kernel_map(void *address) | |
34719 | +{ | |
34720 | + if (0 && address && cpu_has_clflush) { | |
34721 | + /* is this worth it? */ | |
34722 | + int i; | |
34723 | + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | |
34724 | + asm volatile("clflush (%0)" :: "r" (address + i)); | |
34725 | + } else | |
34726 | + asm volatile("wbinvd":::"memory"); | |
34727 | + if (address) | |
34728 | + __flush_tlb_one(address); | |
34729 | + else | |
34730 | + __flush_tlb_all(); | |
34731 | +} | |
34732 | + | |
34733 | + | |
34734 | +static inline void flush_map(unsigned long address) | |
34735 | +{ | |
34736 | + on_each_cpu(flush_kernel_map, (void *)address, 1, 1); | |
34737 | +} | |
34738 | + | |
34739 | +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ | |
34740 | + | |
34741 | +static inline void save_page(struct page *fpage) | |
34742 | +{ | |
34743 | + fpage->lru.next = (struct list_head *)deferred_pages; | |
34744 | + deferred_pages = fpage; | |
34745 | +} | |
34746 | + | |
34747 | +/* | |
34748 | + * No more special protections in this 2/4MB area - revert to a | |
34749 | + * large page again. | |
34750 | + */ | |
34751 | +static void revert_page(unsigned long address, pgprot_t ref_prot) | |
34752 | +{ | |
34753 | + pgd_t *pgd; | |
34754 | + pud_t *pud; | |
34755 | + pmd_t *pmd; | |
34756 | + pte_t large_pte; | |
34757 | + | |
34758 | + pgd = pgd_offset_k(address); | |
34759 | + BUG_ON(pgd_none(*pgd)); | |
34760 | + pud = pud_offset(pgd,address); | |
34761 | + BUG_ON(pud_none(*pud)); | |
34762 | + pmd = pmd_offset(pud, address); | |
34763 | + BUG_ON(__pmd_val(*pmd) & _PAGE_PSE); | |
34764 | + pgprot_val(ref_prot) |= _PAGE_PSE; | |
34765 | + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); | |
34766 | + set_pte((pte_t *)pmd, large_pte); | |
34767 | +} | |
34768 | + | |
34769 | +static int | |
34770 | +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, | |
34771 | + pgprot_t ref_prot) | |
34772 | +{ | |
34773 | + pte_t *kpte; | |
34774 | + struct page *kpte_page; | |
34775 | + unsigned kpte_flags; | |
34776 | + pgprot_t ref_prot2; | |
34777 | + kpte = lookup_address(address); | |
34778 | + if (!kpte) return 0; | |
34779 | + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); | |
34780 | + kpte_flags = pte_val(*kpte); | |
34781 | + if (pgprot_val(prot) != pgprot_val(ref_prot)) { | |
34782 | + if ((kpte_flags & _PAGE_PSE) == 0) { | |
34783 | + set_pte(kpte, pfn_pte(pfn, prot)); | |
34784 | + } else { | |
34785 | + /* | |
34786 | + * split_large_page will take the reference for this | |
34787 | + * change_page_attr on the split page. | |
34788 | + */ | |
34789 | + | |
34790 | + struct page *split; | |
34791 | + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); | |
34792 | + | |
34793 | + split = split_large_page(address, prot, ref_prot2); | |
34794 | + if (!split) | |
34795 | + return -ENOMEM; | |
34796 | + set_pte(kpte,mk_pte(split, ref_prot2)); | |
34797 | + kpte_page = split; | |
34798 | + } | |
34799 | + page_private(kpte_page)++; | |
34800 | + } else if ((kpte_flags & _PAGE_PSE) == 0) { | |
34801 | + set_pte(kpte, pfn_pte(pfn, ref_prot)); | |
34802 | + BUG_ON(page_private(kpte_page) == 0); | |
34803 | + page_private(kpte_page)--; | |
34804 | + } else | |
34805 | + BUG(); | |
34806 | + | |
34807 | + /* on x86-64 the direct mapping set at boot is not using 4k pages */ | |
34808 | + /* | |
34809 | + * ..., but the XEN guest kernels (currently) do: | |
34810 | + * If the pte was reserved, it means it was created at boot | |
34811 | + * time (not via split_large_page) and in turn we must not | |
34812 | + * replace it with a large page. | |
34813 | + */ | |
34814 | +#ifndef CONFIG_XEN | |
34815 | + BUG_ON(PageReserved(kpte_page)); | |
34816 | +#else | |
34817 | + if (PageReserved(kpte_page)) | |
34818 | + return 0; | |
34819 | +#endif | |
34820 | + | |
34821 | + if (page_private(kpte_page) == 0) { | |
34822 | + save_page(kpte_page); | |
34823 | + revert_page(address, ref_prot); | |
34824 | + } | |
34825 | + return 0; | |
34826 | +} | |
34827 | + | |
34828 | +/* | |
34829 | + * Change the page attributes of an page in the linear mapping. | |
34830 | + * | |
34831 | + * This should be used when a page is mapped with a different caching policy | |
34832 | + * than write-back somewhere - some CPUs do not like it when mappings with | |
34833 | + * different caching policies exist. This changes the page attributes of the | |
34834 | + * in kernel linear mapping too. | |
34835 | + * | |
34836 | + * The caller needs to ensure that there are no conflicting mappings elsewhere. | |
34837 | + * This function only deals with the kernel linear map. | |
34838 | + * | |
34839 | + * Caller must call global_flush_tlb() after this. | |
34840 | + */ | |
34841 | +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | |
34842 | +{ | |
34843 | + int err = 0; | |
34844 | + int i; | |
34845 | + | |
34846 | + down_write(&init_mm.mmap_sem); | |
34847 | + for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | |
34848 | + unsigned long pfn = __pa(address) >> PAGE_SHIFT; | |
34849 | + | |
34850 | + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | |
34851 | + if (err) | |
34852 | + break; | |
34853 | + /* Handle kernel mapping too which aliases part of the | |
34854 | + * lowmem */ | |
34855 | + if (__pa(address) < KERNEL_TEXT_SIZE) { | |
34856 | + unsigned long addr2; | |
34857 | + pgprot_t prot2 = prot; | |
34858 | + addr2 = __START_KERNEL_map + __pa(address); | |
34859 | + pgprot_val(prot2) &= ~_PAGE_NX; | |
34860 | + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); | |
34861 | + } | |
34862 | + } | |
34863 | + up_write(&init_mm.mmap_sem); | |
34864 | + return err; | |
34865 | +} | |
34866 | + | |
34867 | +/* Don't call this for MMIO areas that may not have a mem_map entry */ | |
34868 | +int change_page_attr(struct page *page, int numpages, pgprot_t prot) | |
34869 | +{ | |
34870 | + unsigned long addr = (unsigned long)page_address(page); | |
34871 | + return change_page_attr_addr(addr, numpages, prot); | |
34872 | +} | |
34873 | + | |
34874 | +void global_flush_tlb(void) | |
34875 | +{ | |
34876 | + struct page *dpage; | |
34877 | + | |
34878 | + down_read(&init_mm.mmap_sem); | |
34879 | + dpage = xchg(&deferred_pages, NULL); | |
34880 | + up_read(&init_mm.mmap_sem); | |
34881 | + | |
34882 | + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); | |
34883 | + while (dpage) { | |
34884 | + struct page *tmp = dpage; | |
34885 | + dpage = (struct page *)dpage->lru.next; | |
34886 | + ClearPagePrivate(tmp); | |
34887 | + __free_page(tmp); | |
34888 | + } | |
34889 | +} | |
34890 | + | |
34891 | +EXPORT_SYMBOL(change_page_attr); | |
34892 | +EXPORT_SYMBOL(global_flush_tlb); | |
34893 | Index: head-2008-11-25/drivers/pci/msi-xen.c | |
34894 | =================================================================== | |
34895 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
34896 | +++ head-2008-11-25/drivers/pci/msi-xen.c 2008-10-13 13:43:45.000000000 +0200 | |
34897 | @@ -0,0 +1,809 @@ | |
34898 | +/* | |
34899 | + * File: msi.c | |
34900 | + * Purpose: PCI Message Signaled Interrupt (MSI) | |
34901 | + * | |
34902 | + * Copyright (C) 2003-2004 Intel | |
34903 | + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) | |
34904 | + */ | |
34905 | + | |
34906 | +#include <linux/mm.h> | |
34907 | +#include <linux/irq.h> | |
34908 | +#include <linux/interrupt.h> | |
34909 | +#include <linux/init.h> | |
34910 | +#include <linux/ioport.h> | |
34911 | +#include <linux/smp_lock.h> | |
34912 | +#include <linux/pci.h> | |
34913 | +#include <linux/proc_fs.h> | |
34914 | + | |
34915 | +#include <xen/evtchn.h> | |
34916 | + | |
34917 | +#include <asm/errno.h> | |
34918 | +#include <asm/io.h> | |
34919 | +#include <asm/smp.h> | |
34920 | + | |
34921 | +#include "pci.h" | |
34922 | +#include "msi.h" | |
34923 | + | |
34924 | +static int pci_msi_enable = 1; | |
34925 | + | |
34926 | +static struct msi_ops *msi_ops; | |
34927 | + | |
34928 | +int msi_register(struct msi_ops *ops) | |
34929 | +{ | |
34930 | + msi_ops = ops; | |
34931 | + return 0; | |
34932 | +} | |
34933 | + | |
34934 | +static LIST_HEAD(msi_dev_head); | |
34935 | +DEFINE_SPINLOCK(msi_dev_lock); | |
34936 | + | |
34937 | +struct msi_dev_list { | |
34938 | + struct pci_dev *dev; | |
34939 | + struct list_head list; | |
34940 | + spinlock_t pirq_list_lock; | |
34941 | + struct list_head pirq_list_head; | |
34942 | +}; | |
34943 | + | |
34944 | +struct msi_pirq_entry { | |
34945 | + struct list_head list; | |
34946 | + int pirq; | |
34947 | + int entry_nr; | |
34948 | +}; | |
34949 | + | |
34950 | +static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev) | |
34951 | +{ | |
34952 | + struct msi_dev_list *msi_dev_list, *ret = NULL; | |
34953 | + unsigned long flags; | |
34954 | + | |
34955 | + spin_lock_irqsave(&msi_dev_lock, flags); | |
34956 | + | |
34957 | + list_for_each_entry(msi_dev_list, &msi_dev_head, list) | |
34958 | + if ( msi_dev_list->dev == dev ) | |
34959 | + ret = msi_dev_list; | |
34960 | + | |
34961 | + if ( ret ) { | |
34962 | + spin_unlock_irqrestore(&msi_dev_lock, flags); | |
34963 | + return ret; | |
34964 | + } | |
34965 | + | |
34966 | + /* Has not allocate msi_dev until now. */ | |
34967 | + ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC); | |
34968 | + | |
34969 | + /* Failed to allocate msi_dev structure */ | |
34970 | + if ( !ret ) { | |
34971 | + spin_unlock_irqrestore(&msi_dev_lock, flags); | |
34972 | + return NULL; | |
34973 | + } | |
34974 | + | |
34975 | + ret->dev = dev; | |
34976 | + spin_lock_init(&ret->pirq_list_lock); | |
34977 | + INIT_LIST_HEAD(&ret->pirq_list_head); | |
34978 | + list_add_tail(&ret->list, &msi_dev_head); | |
34979 | + spin_unlock_irqrestore(&msi_dev_lock, flags); | |
34980 | + return ret; | |
34981 | +} | |
34982 | + | |
34983 | +static int attach_pirq_entry(int pirq, int entry_nr, | |
34984 | + struct msi_dev_list *msi_dev_entry) | |
34985 | +{ | |
34986 | + struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); | |
34987 | + unsigned long flags; | |
34988 | + | |
34989 | + if (!entry) | |
34990 | + return -ENOMEM; | |
34991 | + entry->pirq = pirq; | |
34992 | + entry->entry_nr = entry_nr; | |
34993 | + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); | |
34994 | + list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head); | |
34995 | + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); | |
34996 | + return 0; | |
34997 | +} | |
34998 | + | |
34999 | +static void detach_pirq_entry(int entry_nr, | |
35000 | + struct msi_dev_list *msi_dev_entry) | |
35001 | +{ | |
35002 | + unsigned long flags; | |
35003 | + struct msi_pirq_entry *pirq_entry; | |
35004 | + | |
35005 | + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { | |
35006 | + if (pirq_entry->entry_nr == entry_nr) { | |
35007 | + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); | |
35008 | + list_del(&pirq_entry->list); | |
35009 | + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); | |
35010 | + kfree(pirq_entry); | |
35011 | + return; | |
35012 | + } | |
35013 | + } | |
35014 | +} | |
35015 | + | |
35016 | +/* | |
35017 | + * pciback will provide device's owner | |
35018 | + */ | |
35019 | +static int (*get_owner)(struct pci_dev *dev); | |
35020 | + | |
35021 | +int register_msi_get_owner(int (*func)(struct pci_dev *dev)) | |
35022 | +{ | |
35023 | + if (get_owner) { | |
35024 | + printk(KERN_WARNING "register msi_get_owner again\n"); | |
35025 | + return -EEXIST; | |
35026 | + } | |
35027 | + get_owner = func; | |
35028 | + return 0; | |
35029 | +} | |
35030 | + | |
35031 | +int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)) | |
35032 | +{ | |
35033 | + if (get_owner != func) | |
35034 | + return -EINVAL; | |
35035 | + get_owner = NULL; | |
35036 | + return 0; | |
35037 | +} | |
35038 | + | |
35039 | +static int msi_get_dev_owner(struct pci_dev *dev) | |
35040 | +{ | |
35041 | + int owner; | |
35042 | + | |
35043 | + BUG_ON(!is_initial_xendomain()); | |
35044 | + if (get_owner && (owner = get_owner(dev)) >= 0) { | |
35045 | + printk(KERN_INFO "get owner for dev %x get %x \n", | |
35046 | + dev->devfn, owner); | |
35047 | + return owner; | |
35048 | + } | |
35049 | + | |
35050 | + return DOMID_SELF; | |
35051 | +} | |
35052 | + | |
35053 | +static int msi_unmap_pirq(struct pci_dev *dev, int pirq) | |
35054 | +{ | |
35055 | + struct physdev_unmap_pirq unmap; | |
35056 | + int rc; | |
35057 | + | |
35058 | + unmap.domid = msi_get_dev_owner(dev); | |
35059 | + /* See comments in msi_map_pirq_to_vector, input parameter pirq | |
35060 | + * mean irq number only if the device belongs to dom0 itself. | |
35061 | + */ | |
35062 | + unmap.pirq = (unmap.domid != DOMID_SELF) | |
35063 | + ? pirq : evtchn_get_xen_pirq(pirq); | |
35064 | + | |
35065 | + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap))) | |
35066 | + printk(KERN_WARNING "unmap irq %x failed\n", pirq); | |
35067 | + | |
35068 | + if (rc < 0) | |
35069 | + return rc; | |
35070 | + | |
35071 | + if (unmap.domid == DOMID_SELF) | |
35072 | + evtchn_map_pirq(pirq, 0); | |
35073 | + | |
35074 | + return 0; | |
35075 | +} | |
35076 | + | |
35077 | +static u64 find_table_base(struct pci_dev *dev, int pos) | |
35078 | +{ | |
35079 | + u8 bar; | |
35080 | + u32 reg; | |
35081 | + unsigned long flags; | |
35082 | + | |
35083 | + pci_read_config_dword(dev, msix_table_offset_reg(pos), ®); | |
35084 | + bar = reg & PCI_MSIX_FLAGS_BIRMASK; | |
35085 | + | |
35086 | + flags = pci_resource_flags(dev, bar); | |
35087 | + if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY)) | |
35088 | + return 0; | |
35089 | + | |
35090 | + return pci_resource_start(dev, bar); | |
35091 | +} | |
35092 | + | |
35093 | +/* | |
35094 | + * Protected by msi_lock | |
35095 | + */ | |
35096 | +static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq, | |
35097 | + int entry_nr, u64 table_base) | |
35098 | +{ | |
35099 | + struct physdev_map_pirq map_irq; | |
35100 | + int rc; | |
35101 | + domid_t domid = DOMID_SELF; | |
35102 | + | |
35103 | + domid = msi_get_dev_owner(dev); | |
35104 | + | |
35105 | + map_irq.domid = domid; | |
35106 | + map_irq.type = MAP_PIRQ_TYPE_MSI; | |
35107 | + map_irq.index = -1; | |
35108 | + map_irq.pirq = pirq < 0 ? -1 : evtchn_get_xen_pirq(pirq); | |
35109 | + map_irq.bus = dev->bus->number; | |
35110 | + map_irq.devfn = dev->devfn; | |
35111 | + map_irq.entry_nr = entry_nr; | |
35112 | + map_irq.table_base = table_base; | |
35113 | + | |
35114 | + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq))) | |
35115 | + printk(KERN_WARNING "map irq failed\n"); | |
35116 | + | |
35117 | + if (rc < 0) | |
35118 | + return rc; | |
35119 | + /* This happens when MSI support is not enabled in Xen. */ | |
35120 | + if (rc == 0 && map_irq.pirq < 0) | |
35121 | + return -ENOSYS; | |
35122 | + | |
35123 | + BUG_ON(map_irq.pirq <= 0); | |
35124 | + | |
35125 | + /* If mapping of this particular MSI is on behalf of another domain, | |
35126 | + * we do not need to get an irq in dom0. This also implies: | |
35127 | + * dev->irq in dom0 will be 'Xen pirq' if this device belongs to | |
35128 | + * to another domain, and will be 'Linux irq' if it belongs to dom0. | |
35129 | + */ | |
35130 | + return ((domid != DOMID_SELF) ? | |
35131 | + map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq)); | |
35132 | +} | |
35133 | + | |
35134 | +static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base) | |
35135 | +{ | |
35136 | + return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base); | |
35137 | +} | |
35138 | + | |
35139 | +static int msi_init(void) | |
35140 | +{ | |
35141 | + static int status = 0; | |
35142 | + | |
35143 | + if (pci_msi_quirk) { | |
35144 | + pci_msi_enable = 0; | |
35145 | + printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n"); | |
35146 | + status = -EINVAL; | |
35147 | + } | |
35148 | + | |
35149 | + return status; | |
35150 | +} | |
35151 | + | |
35152 | +void pci_scan_msi_device(struct pci_dev *dev) { } | |
35153 | + | |
35154 | +void disable_msi_mode(struct pci_dev *dev, int pos, int type) | |
35155 | +{ | |
35156 | + u16 control; | |
35157 | + | |
35158 | + pci_read_config_word(dev, msi_control_reg(pos), &control); | |
35159 | + if (type == PCI_CAP_ID_MSI) { | |
35160 | + /* Set enabled bits to single MSI & enable MSI_enable bit */ | |
35161 | + msi_disable(control); | |
35162 | + pci_write_config_word(dev, msi_control_reg(pos), control); | |
35163 | + dev->msi_enabled = 0; | |
35164 | + } else { | |
35165 | + msix_disable(control); | |
35166 | + pci_write_config_word(dev, msi_control_reg(pos), control); | |
35167 | + dev->msix_enabled = 0; | |
35168 | + } | |
35169 | + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { | |
35170 | + /* PCI Express Endpoint device detected */ | |
35171 | + pci_intx(dev, 1); /* enable intx */ | |
35172 | + } | |
35173 | +} | |
35174 | + | |
35175 | +static void enable_msi_mode(struct pci_dev *dev, int pos, int type) | |
35176 | +{ | |
35177 | + u16 control; | |
35178 | + | |
35179 | + pci_read_config_word(dev, msi_control_reg(pos), &control); | |
35180 | + if (type == PCI_CAP_ID_MSI) { | |
35181 | + /* Set enabled bits to single MSI & enable MSI_enable bit */ | |
35182 | + msi_enable(control, 1); | |
35183 | + pci_write_config_word(dev, msi_control_reg(pos), control); | |
35184 | + dev->msi_enabled = 1; | |
35185 | + } else { | |
35186 | + msix_enable(control); | |
35187 | + pci_write_config_word(dev, msi_control_reg(pos), control); | |
35188 | + dev->msix_enabled = 1; | |
35189 | + } | |
35190 | + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { | |
35191 | + /* PCI Express Endpoint device detected */ | |
35192 | + pci_intx(dev, 0); /* disable intx */ | |
35193 | + } | |
35194 | +} | |
35195 | + | |
35196 | +#ifdef CONFIG_PM | |
35197 | +int pci_save_msi_state(struct pci_dev *dev) | |
35198 | +{ | |
35199 | + int pos; | |
35200 | + | |
35201 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); | |
35202 | + if (pos <= 0 || dev->no_msi) | |
35203 | + return 0; | |
35204 | + | |
35205 | + if (!dev->msi_enabled) | |
35206 | + return 0; | |
35207 | + | |
35208 | + /* Restore dev->irq to its default pin-assertion vector */ | |
35209 | + msi_unmap_pirq(dev, dev->irq); | |
35210 | + /* Disable MSI mode */ | |
35211 | + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); | |
35212 | + /* Set the flags for use of restore */ | |
35213 | + dev->msi_enabled = 1; | |
35214 | + return 0; | |
35215 | +} | |
35216 | + | |
35217 | +void pci_restore_msi_state(struct pci_dev *dev) | |
35218 | +{ | |
35219 | + int pos, pirq; | |
35220 | + | |
35221 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); | |
35222 | + if (pos <= 0) | |
35223 | + return; | |
35224 | + | |
35225 | + if (!dev->msi_enabled) | |
35226 | + return; | |
35227 | + | |
35228 | + pirq = msi_map_pirq_to_vector(dev, dev->irq, 0, 0); | |
35229 | + if (pirq < 0) | |
35230 | + return; | |
35231 | + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); | |
35232 | +} | |
35233 | + | |
35234 | +int pci_save_msix_state(struct pci_dev *dev) | |
35235 | +{ | |
35236 | + int pos; | |
35237 | + unsigned long flags; | |
35238 | + struct msi_dev_list *msi_dev_entry; | |
35239 | + struct msi_pirq_entry *pirq_entry, *tmp; | |
35240 | + | |
35241 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); | |
35242 | + if (pos <= 0 || dev->no_msi) | |
35243 | + return 0; | |
35244 | + | |
35245 | + /* save the capability */ | |
35246 | + if (!dev->msix_enabled) | |
35247 | + return 0; | |
35248 | + | |
35249 | + msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35250 | + | |
35251 | + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); | |
35252 | + list_for_each_entry_safe(pirq_entry, tmp, | |
35253 | + &msi_dev_entry->pirq_list_head, list) | |
35254 | + msi_unmap_pirq(dev, pirq_entry->pirq); | |
35255 | + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); | |
35256 | + | |
35257 | + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); | |
35258 | + /* Set the flags for use of restore */ | |
35259 | + dev->msix_enabled = 1; | |
35260 | + | |
35261 | + return 0; | |
35262 | +} | |
35263 | + | |
35264 | +void pci_restore_msix_state(struct pci_dev *dev) | |
35265 | +{ | |
35266 | + int pos; | |
35267 | + unsigned long flags; | |
35268 | + u64 table_base; | |
35269 | + struct msi_dev_list *msi_dev_entry; | |
35270 | + struct msi_pirq_entry *pirq_entry, *tmp; | |
35271 | + | |
35272 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); | |
35273 | + if (pos <= 0) | |
35274 | + return; | |
35275 | + | |
35276 | + if (!dev->msix_enabled) | |
35277 | + return; | |
35278 | + | |
35279 | + msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35280 | + table_base = find_table_base(dev, pos); | |
35281 | + if (!table_base) | |
35282 | + return; | |
35283 | + | |
35284 | + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); | |
35285 | + list_for_each_entry_safe(pirq_entry, tmp, | |
35286 | + &msi_dev_entry->pirq_list_head, list) { | |
35287 | + int rc = msi_map_pirq_to_vector(dev, pirq_entry->pirq, | |
35288 | + pirq_entry->entry_nr, table_base); | |
35289 | + if (rc < 0) | |
35290 | + printk(KERN_WARNING | |
35291 | + "%s: re-mapping irq #%d (pirq%d) failed: %d\n", | |
35292 | + pci_name(dev), pirq_entry->entry_nr, | |
35293 | + pirq_entry->pirq, rc); | |
35294 | + } | |
35295 | + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); | |
35296 | + | |
35297 | + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); | |
35298 | +} | |
35299 | +#endif | |
35300 | + | |
35301 | +/** | |
35302 | + * msi_capability_init - configure device's MSI capability structure | |
35303 | + * @dev: pointer to the pci_dev data structure of MSI device function | |
35304 | + * | |
35305 | + * Setup the MSI capability structure of device function with a single | |
35306 | + * MSI vector, regardless of device function is capable of handling | |
35307 | + * multiple messages. A return of zero indicates the successful setup | |
35308 | + * of an entry zero with the new MSI vector or non-zero for otherwise. | |
35309 | + **/ | |
35310 | +static int msi_capability_init(struct pci_dev *dev) | |
35311 | +{ | |
35312 | + int pos, pirq; | |
35313 | + u16 control; | |
35314 | + | |
35315 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); | |
35316 | + pci_read_config_word(dev, msi_control_reg(pos), &control); | |
35317 | + | |
35318 | + pirq = msi_map_vector(dev, 0, 0); | |
35319 | + if (pirq < 0) | |
35320 | + return -EBUSY; | |
35321 | + | |
35322 | + dev->irq = pirq; | |
35323 | + /* Set MSI enabled bits */ | |
35324 | + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); | |
35325 | + dev->msi_enabled = 1; | |
35326 | + | |
35327 | + return 0; | |
35328 | +} | |
35329 | + | |
35330 | +/** | |
35331 | + * msix_capability_init - configure device's MSI-X capability | |
35332 | + * @dev: pointer to the pci_dev data structure of MSI-X device function | |
35333 | + * @entries: pointer to an array of struct msix_entry entries | |
35334 | + * @nvec: number of @entries | |
35335 | + * | |
35336 | + * Setup the MSI-X capability structure of device function with a | |
35337 | + * single MSI-X vector. A return of zero indicates the successful setup of | |
35338 | + * requested MSI-X entries with allocated vectors or non-zero for otherwise. | |
35339 | + **/ | |
35340 | +static int msix_capability_init(struct pci_dev *dev, | |
35341 | + struct msix_entry *entries, int nvec) | |
35342 | +{ | |
35343 | + u64 table_base; | |
35344 | + int pirq, i, j, mapped, pos; | |
35345 | + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35346 | + struct msi_pirq_entry *pirq_entry; | |
35347 | + | |
35348 | + if (!msi_dev_entry) | |
35349 | + return -ENOMEM; | |
35350 | + | |
35351 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); | |
35352 | + table_base = find_table_base(dev, pos); | |
35353 | + if (!table_base) | |
35354 | + return -ENODEV; | |
35355 | + | |
35356 | + /* MSI-X Table Initialization */ | |
35357 | + for (i = 0; i < nvec; i++) { | |
35358 | + mapped = 0; | |
35359 | + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { | |
35360 | + if (pirq_entry->entry_nr == entries[i].entry) { | |
35361 | + printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x are \ | |
35362 | + not freed before acquire again.\n", entries[i].entry, | |
35363 | + dev->bus->number, PCI_SLOT(dev->devfn), | |
35364 | + PCI_FUNC(dev->devfn)); | |
35365 | + (entries + i)->vector = pirq_entry->pirq; | |
35366 | + mapped = 1; | |
35367 | + break; | |
35368 | + } | |
35369 | + } | |
35370 | + if (mapped) | |
35371 | + continue; | |
35372 | + pirq = msi_map_vector(dev, entries[i].entry, table_base); | |
35373 | + if (pirq < 0) | |
35374 | + break; | |
35375 | + attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry); | |
35376 | + (entries + i)->vector = pirq; | |
35377 | + } | |
35378 | + | |
35379 | + if (i != nvec) { | |
35380 | + for (j = --i; j >= 0; j--) { | |
35381 | + msi_unmap_pirq(dev, entries[j].vector); | |
35382 | + detach_pirq_entry(entries[j].entry, msi_dev_entry); | |
35383 | + entries[j].vector = 0; | |
35384 | + } | |
35385 | + return -EBUSY; | |
35386 | + } | |
35387 | + | |
35388 | + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); | |
35389 | + dev->msix_enabled = 1; | |
35390 | + | |
35391 | + return 0; | |
35392 | +} | |
35393 | + | |
35394 | +/** | |
35395 | + * pci_enable_msi - configure device's MSI capability structure | |
35396 | + * @dev: pointer to the pci_dev data structure of MSI device function | |
35397 | + * | |
35398 | + * Setup the MSI capability structure of device function with | |
35399 | + * a single MSI vector upon its software driver call to request for | |
35400 | + * MSI mode enabled on its hardware device function. A return of zero | |
35401 | + * indicates the successful setup of an entry zero with the new MSI | |
35402 | + * vector or non-zero for otherwise. | |
35403 | + **/ | |
35404 | +extern int pci_frontend_enable_msi(struct pci_dev *dev); | |
35405 | +int pci_enable_msi(struct pci_dev* dev) | |
35406 | +{ | |
35407 | + struct pci_bus *bus; | |
35408 | + int pos, temp, status = -EINVAL; | |
35409 | + | |
35410 | + if (!pci_msi_enable || !dev) | |
35411 | + return status; | |
35412 | + | |
35413 | + if (dev->no_msi) | |
35414 | + return status; | |
35415 | + | |
35416 | + for (bus = dev->bus; bus; bus = bus->parent) | |
35417 | + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) | |
35418 | + return -EINVAL; | |
35419 | + | |
35420 | + status = msi_init(); | |
35421 | + if (status < 0) | |
35422 | + return status; | |
35423 | + | |
35424 | +#ifdef CONFIG_XEN_PCIDEV_FRONTEND | |
35425 | + if (!is_initial_xendomain()) | |
35426 | + { | |
35427 | + int ret; | |
35428 | + | |
35429 | + temp = dev->irq; | |
35430 | + ret = pci_frontend_enable_msi(dev); | |
35431 | + if (ret) | |
35432 | + return ret; | |
35433 | + | |
35434 | + dev->irq = evtchn_map_pirq(-1, dev->irq); | |
35435 | + dev->irq_old = temp; | |
35436 | + | |
35437 | + return ret; | |
35438 | + } | |
35439 | +#endif | |
35440 | + | |
35441 | + temp = dev->irq; | |
35442 | + | |
35443 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); | |
35444 | + if (!pos) | |
35445 | + return -EINVAL; | |
35446 | + | |
35447 | + /* Check whether driver already requested for MSI-X vectors */ | |
35448 | + if (dev->msix_enabled) { | |
35449 | + printk(KERN_INFO "PCI: %s: Can't enable MSI. " | |
35450 | + "Device already has MSI-X vectors assigned\n", | |
35451 | + pci_name(dev)); | |
35452 | + dev->irq = temp; | |
35453 | + return -EINVAL; | |
35454 | + } | |
35455 | + | |
35456 | + status = msi_capability_init(dev); | |
35457 | + if ( !status ) | |
35458 | + dev->irq_old = temp; | |
35459 | + else | |
35460 | + dev->irq = temp; | |
35461 | + | |
35462 | + return status; | |
35463 | +} | |
35464 | + | |
35465 | +extern void pci_frontend_disable_msi(struct pci_dev* dev); | |
35466 | +void pci_disable_msi(struct pci_dev* dev) | |
35467 | +{ | |
35468 | + int pos; | |
35469 | + int pirq; | |
35470 | + | |
35471 | + if (!pci_msi_enable) | |
35472 | + return; | |
35473 | + if (!dev) | |
35474 | + return; | |
35475 | + | |
35476 | +#ifdef CONFIG_XEN_PCIDEV_FRONTEND | |
35477 | + if (!is_initial_xendomain()) { | |
35478 | + evtchn_map_pirq(dev->irq, 0); | |
35479 | + pci_frontend_disable_msi(dev); | |
35480 | + dev->irq = dev->irq_old; | |
35481 | + return; | |
35482 | + } | |
35483 | +#endif | |
35484 | + | |
35485 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); | |
35486 | + if (!pos) | |
35487 | + return; | |
35488 | + | |
35489 | + pirq = dev->irq; | |
35490 | + /* Restore dev->irq to its default pin-assertion vector */ | |
35491 | + dev->irq = dev->irq_old; | |
35492 | + msi_unmap_pirq(dev, pirq); | |
35493 | + | |
35494 | + /* Disable MSI mode */ | |
35495 | + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); | |
35496 | +} | |
35497 | + | |
35498 | +/** | |
35499 | + * pci_enable_msix - configure device's MSI-X capability structure | |
35500 | + * @dev: pointer to the pci_dev data structure of MSI-X device function | |
35501 | + * @entries: pointer to an array of MSI-X entries | |
35502 | + * @nvec: number of MSI-X vectors requested for allocation by device driver | |
35503 | + * | |
35504 | + * Setup the MSI-X capability structure of device function with the number | |
35505 | + * of requested vectors upon its software driver call to request for | |
35506 | + * MSI-X mode enabled on its hardware device function. A return of zero | |
35507 | + * indicates the successful configuration of MSI-X capability structure | |
35508 | + * with new allocated MSI-X vectors. A return of < 0 indicates a failure. | |
35509 | + * Or a return of > 0 indicates that driver request is exceeding the number | |
35510 | + * of vectors available. Driver should use the returned value to re-send | |
35511 | + * its request. | |
35512 | + **/ | |
35513 | +extern int pci_frontend_enable_msix(struct pci_dev *dev, | |
35514 | + struct msix_entry *entries, int nvec); | |
35515 | +int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) | |
35516 | +{ | |
35517 | + struct pci_bus *bus; | |
35518 | + int status, pos, nr_entries; | |
35519 | + int i, j, temp; | |
35520 | + u16 control; | |
35521 | + | |
35522 | + if (!pci_msi_enable || !dev || !entries) | |
35523 | + return -EINVAL; | |
35524 | + | |
35525 | + if (dev->no_msi) | |
35526 | + return -EINVAL; | |
35527 | + | |
35528 | + for (bus = dev->bus; bus; bus = bus->parent) | |
35529 | + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) | |
35530 | + return -EINVAL; | |
35531 | + | |
35532 | +#ifdef CONFIG_XEN_PCIDEV_FRONTEND | |
35533 | + if (!is_initial_xendomain()) { | |
35534 | + struct msi_dev_list *msi_dev_entry; | |
35535 | + struct msi_pirq_entry *pirq_entry; | |
35536 | + int ret, irq; | |
35537 | + | |
35538 | + ret = pci_frontend_enable_msix(dev, entries, nvec); | |
35539 | + if (ret) { | |
35540 | + printk("get %x from pci_frontend_enable_msix\n", ret); | |
35541 | + return ret; | |
35542 | + } | |
35543 | + | |
35544 | + msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35545 | + for (i = 0; i < nvec; i++) { | |
35546 | + int mapped = 0; | |
35547 | + | |
35548 | + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { | |
35549 | + if (pirq_entry->entry_nr == entries[i].entry) { | |
35550 | + irq = pirq_entry->pirq; | |
35551 | + BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq)); | |
35552 | + entries[i].vector = irq; | |
35553 | + mapped = 1; | |
35554 | + break; | |
35555 | + } | |
35556 | + } | |
35557 | + if (mapped) | |
35558 | + continue; | |
35559 | + irq = evtchn_map_pirq(-1, entries[i].vector); | |
35560 | + attach_pirq_entry(irq, entries[i].entry, msi_dev_entry); | |
35561 | + entries[i].vector = irq; | |
35562 | + } | |
35563 | + return 0; | |
35564 | + } | |
35565 | +#endif | |
35566 | + | |
35567 | + status = msi_init(); | |
35568 | + if (status < 0) | |
35569 | + return status; | |
35570 | + | |
35571 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); | |
35572 | + if (!pos) | |
35573 | + return -EINVAL; | |
35574 | + | |
35575 | + pci_read_config_word(dev, msi_control_reg(pos), &control); | |
35576 | + nr_entries = multi_msix_capable(control); | |
35577 | + if (nvec > nr_entries) | |
35578 | + return -EINVAL; | |
35579 | + | |
35580 | + /* Check for any invalid entries */ | |
35581 | + for (i = 0; i < nvec; i++) { | |
35582 | + if (entries[i].entry >= nr_entries) | |
35583 | + return -EINVAL; /* invalid entry */ | |
35584 | + for (j = i + 1; j < nvec; j++) { | |
35585 | + if (entries[i].entry == entries[j].entry) | |
35586 | + return -EINVAL; /* duplicate entry */ | |
35587 | + } | |
35588 | + } | |
35589 | + | |
35590 | + temp = dev->irq; | |
35591 | + /* Check whether driver already requested for MSI vector */ | |
35592 | + if (dev->msi_enabled) { | |
35593 | + printk(KERN_INFO "PCI: %s: Can't enable MSI-X. " | |
35594 | + "Device already has an MSI vector assigned\n", | |
35595 | + pci_name(dev)); | |
35596 | + dev->irq = temp; | |
35597 | + return -EINVAL; | |
35598 | + } | |
35599 | + | |
35600 | + status = msix_capability_init(dev, entries, nvec); | |
35601 | + | |
35602 | + if ( !status ) | |
35603 | + dev->irq_old = temp; | |
35604 | + else | |
35605 | + dev->irq = temp; | |
35606 | + | |
35607 | + return status; | |
35608 | +} | |
35609 | + | |
35610 | +extern void pci_frontend_disable_msix(struct pci_dev* dev); | |
35611 | +void pci_disable_msix(struct pci_dev* dev) | |
35612 | +{ | |
35613 | + int pos; | |
35614 | + u16 control; | |
35615 | + | |
35616 | + | |
35617 | + if (!pci_msi_enable) | |
35618 | + return; | |
35619 | + if (!dev) | |
35620 | + return; | |
35621 | + | |
35622 | +#ifdef CONFIG_XEN_PCIDEV_FRONTEND | |
35623 | + if (!is_initial_xendomain()) { | |
35624 | + struct msi_dev_list *msi_dev_entry; | |
35625 | + struct msi_pirq_entry *pirq_entry, *tmp; | |
35626 | + | |
35627 | + pci_frontend_disable_msix(dev); | |
35628 | + | |
35629 | + msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35630 | + list_for_each_entry_safe(pirq_entry, tmp, | |
35631 | + &msi_dev_entry->pirq_list_head, list) { | |
35632 | + evtchn_map_pirq(pirq_entry->pirq, 0); | |
35633 | + list_del(&pirq_entry->list); | |
35634 | + kfree(pirq_entry); | |
35635 | + } | |
35636 | + | |
35637 | + dev->irq = dev->irq_old; | |
35638 | + return; | |
35639 | + } | |
35640 | +#endif | |
35641 | + | |
35642 | + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); | |
35643 | + if (!pos) | |
35644 | + return; | |
35645 | + | |
35646 | + pci_read_config_word(dev, msi_control_reg(pos), &control); | |
35647 | + if (!(control & PCI_MSIX_FLAGS_ENABLE)) | |
35648 | + return; | |
35649 | + | |
35650 | + msi_remove_pci_irq_vectors(dev); | |
35651 | + | |
35652 | + /* Disable MSI mode */ | |
35653 | + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); | |
35654 | +} | |
35655 | + | |
35656 | +/** | |
35657 | + * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state | |
35658 | + * @dev: pointer to the pci_dev data structure of MSI(X) device function | |
35659 | + * | |
35660 | + * Being called during hotplug remove, from which the device function | |
35661 | + * is hot-removed. All previous assigned MSI/MSI-X vectors, if | |
35662 | + * allocated for this device function, are reclaimed to unused state, | |
35663 | + * which may be used later on. | |
35664 | + **/ | |
35665 | +void msi_remove_pci_irq_vectors(struct pci_dev* dev) | |
35666 | +{ | |
35667 | + unsigned long flags; | |
35668 | + struct msi_dev_list *msi_dev_entry; | |
35669 | + struct msi_pirq_entry *pirq_entry, *tmp; | |
35670 | + | |
35671 | + if (!pci_msi_enable || !dev) | |
35672 | + return; | |
35673 | + | |
35674 | + msi_dev_entry = get_msi_dev_pirq_list(dev); | |
35675 | + | |
35676 | + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); | |
35677 | + if (!list_empty(&msi_dev_entry->pirq_list_head)) | |
35678 | + { | |
35679 | + printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x are not freed \ | |
35680 | + before acquire again.\n", dev->bus->number, PCI_SLOT(dev->devfn), | |
35681 | + PCI_FUNC(dev->devfn)); | |
35682 | + list_for_each_entry_safe(pirq_entry, tmp, | |
35683 | + &msi_dev_entry->pirq_list_head, list) { | |
35684 | + msi_unmap_pirq(dev, pirq_entry->pirq); | |
35685 | + list_del(&pirq_entry->list); | |
35686 | + kfree(pirq_entry); | |
35687 | + } | |
35688 | + } | |
35689 | + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); | |
35690 | + dev->irq = dev->irq_old; | |
35691 | +} | |
35692 | + | |
35693 | +void pci_no_msi(void) | |
35694 | +{ | |
35695 | + pci_msi_enable = 0; | |
35696 | +} | |
35697 | + | |
35698 | +EXPORT_SYMBOL(pci_enable_msi); | |
35699 | +EXPORT_SYMBOL(pci_disable_msi); | |
35700 | +EXPORT_SYMBOL(pci_enable_msix); | |
35701 | +EXPORT_SYMBOL(pci_disable_msix); | |
35702 | +#ifdef CONFIG_XEN | |
35703 | +EXPORT_SYMBOL(register_msi_get_owner); | |
35704 | +EXPORT_SYMBOL(unregister_msi_get_owner); | |
35705 | +#endif | |
35706 | + | |
35707 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h | |
35708 | =================================================================== | |
35709 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
35710 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200 | |
35711 | @@ -0,0 +1,44 @@ | |
35712 | +#ifndef AGP_H | |
35713 | +#define AGP_H 1 | |
35714 | + | |
35715 | +#include <asm/pgtable.h> | |
35716 | +#include <asm/cacheflush.h> | |
35717 | +#include <asm/system.h> | |
35718 | + | |
35719 | +/* | |
35720 | + * Functions to keep the agpgart mappings coherent with the MMU. | |
35721 | + * The GART gives the CPU a physical alias of pages in memory. The alias region is | |
35722 | + * mapped uncacheable. Make sure there are no conflicting mappings | |
35723 | + * with different cachability attributes for the same page. This avoids | |
35724 | + * data corruption on some CPUs. | |
35725 | + */ | |
35726 | + | |
35727 | +/* Caller's responsibility to call global_flush_tlb() for | |
35728 | + * performance reasons */ | |
35729 | +#define map_page_into_agp(page) ( \ | |
35730 | + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ | |
35731 | + ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) | |
35732 | +#define unmap_page_from_agp(page) ( \ | |
35733 | + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ | |
35734 | + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ | |
35735 | + change_page_attr(page, 1, PAGE_KERNEL)) | |
35736 | +#define flush_agp_mappings() global_flush_tlb() | |
35737 | + | |
35738 | +/* Could use CLFLUSH here if the cpu supports it. But then it would | |
35739 | + need to be called for each cacheline of the whole page so it may not be | |
35740 | + worth it. Would need a page for it. */ | |
35741 | +#define flush_agp_cache() wbinvd() | |
35742 | + | |
35743 | +/* Convert a physical address to an address suitable for the GART. */ | |
35744 | +#define phys_to_gart(x) phys_to_machine(x) | |
35745 | +#define gart_to_phys(x) machine_to_phys(x) | |
35746 | + | |
35747 | +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ | |
35748 | +#define alloc_gatt_pages(order) ({ \ | |
35749 | + char *_t; dma_addr_t _d; \ | |
35750 | + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ | |
35751 | + _t; }) | |
35752 | +#define free_gatt_pages(table, order) \ | |
35753 | + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) | |
35754 | + | |
35755 | +#endif | |
35756 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h | |
35757 | =================================================================== | |
35758 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
35759 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100 | |
35760 | @@ -0,0 +1,166 @@ | |
35761 | +#ifndef __ARCH_DESC_H | |
35762 | +#define __ARCH_DESC_H | |
35763 | + | |
35764 | +#include <asm/ldt.h> | |
35765 | +#include <asm/segment.h> | |
35766 | + | |
35767 | +#define CPU_16BIT_STACK_SIZE 1024 | |
35768 | + | |
35769 | +#ifndef __ASSEMBLY__ | |
35770 | + | |
35771 | +#include <linux/preempt.h> | |
35772 | +#include <linux/smp.h> | |
35773 | + | |
35774 | +#include <asm/mmu.h> | |
35775 | + | |
35776 | +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; | |
35777 | + | |
35778 | +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); | |
35779 | + | |
35780 | +struct Xgt_desc_struct { | |
35781 | + unsigned short size; | |
35782 | + unsigned long address __attribute__((packed)); | |
35783 | + unsigned short pad; | |
35784 | +} __attribute__ ((packed)); | |
35785 | + | |
35786 | +extern struct Xgt_desc_struct idt_descr; | |
35787 | +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); | |
35788 | + | |
35789 | + | |
35790 | +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) | |
35791 | +{ | |
35792 | + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; | |
35793 | +} | |
35794 | + | |
35795 | +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) | |
35796 | +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) | |
35797 | + | |
35798 | +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) | |
35799 | +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) | |
35800 | +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) | |
35801 | +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) | |
35802 | + | |
35803 | +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) | |
35804 | +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) | |
35805 | +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) | |
35806 | +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) | |
35807 | + | |
35808 | +/* | |
35809 | + * This is the ldt that every process will get unless we need | |
35810 | + * something other than this. | |
35811 | + */ | |
35812 | +extern struct desc_struct default_ldt[]; | |
35813 | +extern void set_intr_gate(unsigned int irq, void * addr); | |
35814 | + | |
35815 | +#define _set_tssldt_desc(n,addr,limit,type) \ | |
35816 | +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ | |
35817 | + "movw %w1,2(%2)\n\t" \ | |
35818 | + "rorl $16,%1\n\t" \ | |
35819 | + "movb %b1,4(%2)\n\t" \ | |
35820 | + "movb %4,5(%2)\n\t" \ | |
35821 | + "movb $0,6(%2)\n\t" \ | |
35822 | + "movb %h1,7(%2)\n\t" \ | |
35823 | + "rorl $16,%1" \ | |
35824 | + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type)) | |
35825 | + | |
35826 | +#ifndef CONFIG_X86_NO_TSS | |
35827 | +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) | |
35828 | +{ | |
35829 | + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr, | |
35830 | + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89); | |
35831 | +} | |
35832 | + | |
35833 | +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) | |
35834 | +#endif | |
35835 | + | |
35836 | +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) | |
35837 | +{ | |
35838 | + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); | |
35839 | +} | |
35840 | + | |
35841 | +#define LDT_entry_a(info) \ | |
35842 | + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) | |
35843 | + | |
35844 | +#define LDT_entry_b(info) \ | |
35845 | + (((info)->base_addr & 0xff000000) | \ | |
35846 | + (((info)->base_addr & 0x00ff0000) >> 16) | \ | |
35847 | + ((info)->limit & 0xf0000) | \ | |
35848 | + (((info)->read_exec_only ^ 1) << 9) | \ | |
35849 | + ((info)->contents << 10) | \ | |
35850 | + (((info)->seg_not_present ^ 1) << 15) | \ | |
35851 | + ((info)->seg_32bit << 22) | \ | |
35852 | + ((info)->limit_in_pages << 23) | \ | |
35853 | + ((info)->useable << 20) | \ | |
35854 | + 0x7000) | |
35855 | + | |
35856 | +#define LDT_empty(info) (\ | |
35857 | + (info)->base_addr == 0 && \ | |
35858 | + (info)->limit == 0 && \ | |
35859 | + (info)->contents == 0 && \ | |
35860 | + (info)->read_exec_only == 1 && \ | |
35861 | + (info)->seg_32bit == 0 && \ | |
35862 | + (info)->limit_in_pages == 0 && \ | |
35863 | + (info)->seg_not_present == 1 && \ | |
35864 | + (info)->useable == 0 ) | |
35865 | + | |
35866 | +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); | |
35867 | + | |
35868 | +#if TLS_SIZE != 24 | |
35869 | +# error update this code. | |
35870 | +#endif | |
35871 | + | |
35872 | +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) | |
35873 | +{ | |
35874 | +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \ | |
35875 | + *(u64 *)&t->tls_array[i])) \ | |
35876 | + BUG(); | |
35877 | + C(0); C(1); C(2); | |
35878 | +#undef C | |
35879 | +} | |
35880 | + | |
35881 | +static inline void clear_LDT(void) | |
35882 | +{ | |
35883 | + int cpu = get_cpu(); | |
35884 | + | |
35885 | + /* | |
35886 | + * NB. We load the default_ldt for lcall7/27 handling on demand, as | |
35887 | + * it slows down context switching. Noone uses it anyway. | |
35888 | + */ | |
35889 | + cpu = cpu; /* XXX avoid compiler warning */ | |
35890 | + xen_set_ldt(NULL, 0); | |
35891 | + put_cpu(); | |
35892 | +} | |
35893 | + | |
35894 | +/* | |
35895 | + * load one particular LDT into the current CPU | |
35896 | + */ | |
35897 | +static inline void load_LDT_nolock(mm_context_t *pc, int cpu) | |
35898 | +{ | |
35899 | + void *segments = pc->ldt; | |
35900 | + int count = pc->size; | |
35901 | + | |
35902 | + if (likely(!count)) | |
35903 | + segments = NULL; | |
35904 | + | |
35905 | + xen_set_ldt(segments, count); | |
35906 | +} | |
35907 | + | |
35908 | +static inline void load_LDT(mm_context_t *pc) | |
35909 | +{ | |
35910 | + int cpu = get_cpu(); | |
35911 | + load_LDT_nolock(pc, cpu); | |
35912 | + put_cpu(); | |
35913 | +} | |
35914 | + | |
35915 | +static inline unsigned long get_desc_base(unsigned long *desc) | |
35916 | +{ | |
35917 | + unsigned long base; | |
35918 | + base = ((desc[0] >> 16) & 0x0000ffff) | | |
35919 | + ((desc[1] << 16) & 0x00ff0000) | | |
35920 | + (desc[1] & 0xff000000); | |
35921 | + return base; | |
35922 | +} | |
35923 | + | |
35924 | +#endif /* !__ASSEMBLY__ */ | |
35925 | + | |
35926 | +#endif | |
35927 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h | |
35928 | =================================================================== | |
35929 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
35930 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200 | |
35931 | @@ -0,0 +1,151 @@ | |
35932 | +#ifndef _ASM_I386_DMA_MAPPING_H | |
35933 | +#define _ASM_I386_DMA_MAPPING_H | |
35934 | + | |
35935 | +/* | |
35936 | + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for | |
35937 | + * documentation. | |
35938 | + */ | |
35939 | + | |
35940 | +#include <linux/mm.h> | |
35941 | +#include <asm/cache.h> | |
35942 | +#include <asm/io.h> | |
35943 | +#include <asm/scatterlist.h> | |
35944 | +#include <asm/swiotlb.h> | |
35945 | + | |
35946 | +static inline int | |
35947 | +address_needs_mapping(struct device *hwdev, dma_addr_t addr) | |
35948 | +{ | |
35949 | + dma_addr_t mask = 0xffffffff; | |
35950 | + /* If the device has a mask, use it, otherwise default to 32 bits */ | |
35951 | + if (hwdev && hwdev->dma_mask) | |
35952 | + mask = *hwdev->dma_mask; | |
35953 | + return (addr & ~mask) != 0; | |
35954 | +} | |
35955 | + | |
35956 | +extern int range_straddles_page_boundary(paddr_t p, size_t size); | |
35957 | + | |
35958 | +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) | |
35959 | +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) | |
35960 | + | |
35961 | +void *dma_alloc_coherent(struct device *dev, size_t size, | |
35962 | + dma_addr_t *dma_handle, gfp_t flag); | |
35963 | + | |
35964 | +void dma_free_coherent(struct device *dev, size_t size, | |
35965 | + void *vaddr, dma_addr_t dma_handle); | |
35966 | + | |
35967 | +extern dma_addr_t | |
35968 | +dma_map_single(struct device *dev, void *ptr, size_t size, | |
35969 | + enum dma_data_direction direction); | |
35970 | + | |
35971 | +extern void | |
35972 | +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | |
35973 | + enum dma_data_direction direction); | |
35974 | + | |
35975 | +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg, | |
35976 | + int nents, enum dma_data_direction direction); | |
35977 | +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, | |
35978 | + int nents, enum dma_data_direction direction); | |
35979 | + | |
35980 | +#ifdef CONFIG_HIGHMEM | |
35981 | +extern dma_addr_t | |
35982 | +dma_map_page(struct device *dev, struct page *page, unsigned long offset, | |
35983 | + size_t size, enum dma_data_direction direction); | |
35984 | + | |
35985 | +extern void | |
35986 | +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, | |
35987 | + enum dma_data_direction direction); | |
35988 | +#else | |
35989 | +#define dma_map_page(dev, page, offset, size, dir) \ | |
35990 | + dma_map_single(dev, page_address(page) + (offset), (size), (dir)) | |
35991 | +#define dma_unmap_page dma_unmap_single | |
35992 | +#endif | |
35993 | + | |
35994 | +extern void | |
35995 | +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, | |
35996 | + enum dma_data_direction direction); | |
35997 | + | |
35998 | +extern void | |
35999 | +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, | |
36000 | + enum dma_data_direction direction); | |
36001 | + | |
36002 | +static inline void | |
36003 | +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, | |
36004 | + unsigned long offset, size_t size, | |
36005 | + enum dma_data_direction direction) | |
36006 | +{ | |
36007 | + dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction); | |
36008 | +} | |
36009 | + | |
36010 | +static inline void | |
36011 | +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, | |
36012 | + unsigned long offset, size_t size, | |
36013 | + enum dma_data_direction direction) | |
36014 | +{ | |
36015 | + dma_sync_single_for_device(dev, dma_handle+offset, size, direction); | |
36016 | +} | |
36017 | + | |
36018 | +static inline void | |
36019 | +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, | |
36020 | + enum dma_data_direction direction) | |
36021 | +{ | |
36022 | + if (swiotlb) | |
36023 | + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); | |
36024 | + flush_write_buffers(); | |
36025 | +} | |
36026 | + | |
36027 | +static inline void | |
36028 | +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, | |
36029 | + enum dma_data_direction direction) | |
36030 | +{ | |
36031 | + if (swiotlb) | |
36032 | + swiotlb_sync_sg_for_device(dev,sg,nelems,direction); | |
36033 | + flush_write_buffers(); | |
36034 | +} | |
36035 | + | |
36036 | +extern int | |
36037 | +dma_mapping_error(dma_addr_t dma_addr); | |
36038 | + | |
36039 | +extern int | |
36040 | +dma_supported(struct device *dev, u64 mask); | |
36041 | + | |
36042 | +static inline int | |
36043 | +dma_set_mask(struct device *dev, u64 mask) | |
36044 | +{ | |
36045 | + if(!dev->dma_mask || !dma_supported(dev, mask)) | |
36046 | + return -EIO; | |
36047 | + | |
36048 | + *dev->dma_mask = mask; | |
36049 | + | |
36050 | + return 0; | |
36051 | +} | |
36052 | + | |
36053 | +static inline int | |
36054 | +dma_get_cache_alignment(void) | |
36055 | +{ | |
36056 | + /* no easy way to get cache size on all x86, so return the | |
36057 | + * maximum possible, to be safe */ | |
36058 | + return (1 << INTERNODE_CACHE_SHIFT); | |
36059 | +} | |
36060 | + | |
36061 | +#define dma_is_consistent(d) (1) | |
36062 | + | |
36063 | +static inline void | |
36064 | +dma_cache_sync(void *vaddr, size_t size, | |
36065 | + enum dma_data_direction direction) | |
36066 | +{ | |
36067 | + flush_write_buffers(); | |
36068 | +} | |
36069 | + | |
36070 | +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY | |
36071 | +extern int | |
36072 | +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | |
36073 | + dma_addr_t device_addr, size_t size, int flags); | |
36074 | + | |
36075 | +extern void | |
36076 | +dma_release_declared_memory(struct device *dev); | |
36077 | + | |
36078 | +extern void * | |
36079 | +dma_mark_declared_memory_occupied(struct device *dev, | |
36080 | + dma_addr_t device_addr, size_t size); | |
36081 | + | |
36082 | +#endif | |
36083 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h | |
36084 | =================================================================== | |
36085 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
36086 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200 | |
36087 | @@ -0,0 +1,155 @@ | |
36088 | +/* | |
36089 | + * fixmap.h: compile-time virtual memory allocation | |
36090 | + * | |
36091 | + * This file is subject to the terms and conditions of the GNU General Public | |
36092 | + * License. See the file "COPYING" in the main directory of this archive | |
36093 | + * for more details. | |
36094 | + * | |
36095 | + * Copyright (C) 1998 Ingo Molnar | |
36096 | + * | |
36097 | + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | |
36098 | + */ | |
36099 | + | |
36100 | +#ifndef _ASM_FIXMAP_H | |
36101 | +#define _ASM_FIXMAP_H | |
36102 | + | |
36103 | + | |
36104 | +/* used by vmalloc.c, vsyscall.lds.S. | |
36105 | + * | |
36106 | + * Leave one empty page between vmalloc'ed areas and | |
36107 | + * the start of the fixmap. | |
36108 | + */ | |
36109 | +extern unsigned long __FIXADDR_TOP; | |
36110 | + | |
36111 | +#ifndef __ASSEMBLY__ | |
36112 | +#include <linux/kernel.h> | |
36113 | +#include <asm/acpi.h> | |
36114 | +#include <asm/apicdef.h> | |
36115 | +#include <asm/page.h> | |
36116 | +#ifdef CONFIG_HIGHMEM | |
36117 | +#include <linux/threads.h> | |
36118 | +#include <asm/kmap_types.h> | |
36119 | +#endif | |
36120 | + | |
36121 | +/* | |
36122 | + * Here we define all the compile-time 'special' virtual | |
36123 | + * addresses. The point is to have a constant address at | |
36124 | + * compile time, but to set the physical address only | |
36125 | + * in the boot process. We allocate these special addresses | |
36126 | + * from the end of virtual memory (0xfffff000) backwards. | |
36127 | + * Also this lets us do fail-safe vmalloc(), we | |
36128 | + * can guarantee that these special addresses and | |
36129 | + * vmalloc()-ed addresses never overlap. | |
36130 | + * | |
36131 | + * these 'compile-time allocated' memory buffers are | |
36132 | + * fixed-size 4k pages. (or larger if used with an increment | |
36133 | + * highger than 1) use fixmap_set(idx,phys) to associate | |
36134 | + * physical memory with fixmap indices. | |
36135 | + * | |
36136 | + * TLB entries of such buffers will not be flushed across | |
36137 | + * task switches. | |
36138 | + */ | |
36139 | +enum fixed_addresses { | |
36140 | + FIX_HOLE, | |
36141 | + FIX_VDSO, | |
36142 | +#ifdef CONFIG_X86_LOCAL_APIC | |
36143 | + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ | |
36144 | +#endif | |
36145 | +#ifdef CONFIG_X86_IO_APIC | |
36146 | + FIX_IO_APIC_BASE_0, | |
36147 | + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, | |
36148 | +#endif | |
36149 | +#ifdef CONFIG_X86_VISWS_APIC | |
36150 | + FIX_CO_CPU, /* Cobalt timer */ | |
36151 | + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ | |
36152 | + FIX_LI_PCIA, /* Lithium PCI Bridge A */ | |
36153 | + FIX_LI_PCIB, /* Lithium PCI Bridge B */ | |
36154 | +#endif | |
36155 | +#ifdef CONFIG_X86_F00F_BUG | |
36156 | + FIX_F00F_IDT, /* Virtual mapping for IDT */ | |
36157 | +#endif | |
36158 | +#ifdef CONFIG_X86_CYCLONE_TIMER | |
36159 | + FIX_CYCLONE_TIMER, /*cyclone timer register*/ | |
36160 | +#endif | |
36161 | +#ifdef CONFIG_HIGHMEM | |
36162 | + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ | |
36163 | + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, | |
36164 | +#endif | |
36165 | +#ifdef CONFIG_ACPI | |
36166 | + FIX_ACPI_BEGIN, | |
36167 | + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, | |
36168 | +#endif | |
36169 | +#ifdef CONFIG_PCI_MMCONFIG | |
36170 | + FIX_PCIE_MCFG, | |
36171 | +#endif | |
36172 | + FIX_SHARED_INFO, | |
36173 | +#define NR_FIX_ISAMAPS 256 | |
36174 | + FIX_ISAMAP_END, | |
36175 | + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, | |
36176 | + __end_of_permanent_fixed_addresses, | |
36177 | + /* temporary boot-time mappings, used before ioremap() is functional */ | |
36178 | +#define NR_FIX_BTMAPS 16 | |
36179 | + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, | |
36180 | + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, | |
36181 | + FIX_WP_TEST, | |
36182 | + __end_of_fixed_addresses | |
36183 | +}; | |
36184 | + | |
36185 | +extern void set_fixaddr_top(unsigned long top); | |
36186 | + | |
36187 | +extern void __set_fixmap(enum fixed_addresses idx, | |
36188 | + maddr_t phys, pgprot_t flags); | |
36189 | + | |
36190 | +#define set_fixmap(idx, phys) \ | |
36191 | + __set_fixmap(idx, phys, PAGE_KERNEL) | |
36192 | +/* | |
36193 | + * Some hardware wants to get fixmapped without caching. | |
36194 | + */ | |
36195 | +#define set_fixmap_nocache(idx, phys) \ | |
36196 | + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) | |
36197 | + | |
36198 | +#define clear_fixmap(idx) \ | |
36199 | + __set_fixmap(idx, 0, __pgprot(0)) | |
36200 | + | |
36201 | +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) | |
36202 | + | |
36203 | +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) | |
36204 | +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) | |
36205 | +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) | |
36206 | +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) | |
36207 | + | |
36208 | +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) | |
36209 | +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) | |
36210 | + | |
36211 | +extern void __this_fixmap_does_not_exist(void); | |
36212 | + | |
36213 | +/* | |
36214 | + * 'index to address' translation. If anyone tries to use the idx | |
36215 | + * directly without tranlation, we catch the bug with a NULL-deference | |
36216 | + * kernel oops. Illegal ranges of incoming indices are caught too. | |
36217 | + */ | |
36218 | +static __always_inline unsigned long fix_to_virt(const unsigned int idx) | |
36219 | +{ | |
36220 | + /* | |
36221 | + * this branch gets completely eliminated after inlining, | |
36222 | + * except when someone tries to use fixaddr indices in an | |
36223 | + * illegal way. (such as mixing up address types or using | |
36224 | + * out-of-range indices). | |
36225 | + * | |
36226 | + * If it doesn't get removed, the linker will complain | |
36227 | + * loudly with a reasonably clear error message.. | |
36228 | + */ | |
36229 | + if (idx >= __end_of_fixed_addresses) | |
36230 | + __this_fixmap_does_not_exist(); | |
36231 | + | |
36232 | + return __fix_to_virt(idx); | |
36233 | +} | |
36234 | + | |
36235 | +static inline unsigned long virt_to_fix(const unsigned long vaddr) | |
36236 | +{ | |
36237 | + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); | |
36238 | + return __virt_to_fix(vaddr); | |
36239 | +} | |
36240 | + | |
36241 | +#endif /* !__ASSEMBLY__ */ | |
36242 | +#endif | |
36243 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h | |
36244 | =================================================================== | |
36245 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
36246 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200 | |
36247 | @@ -0,0 +1,41 @@ | |
36248 | +/* | |
36249 | + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> | |
36250 | + * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp> | |
36251 | + * VA Linux Systems Japan K.K. | |
36252 | + * | |
36253 | + * This program is free software; you can redistribute it and/or modify | |
36254 | + * it under the terms of the GNU General Public License as published by | |
36255 | + * the Free Software Foundation; either version 2 of the License, or | |
36256 | + * (at your option) any later version. | |
36257 | + * | |
36258 | + * This program is distributed in the hope that it will be useful, | |
36259 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
36260 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
36261 | + * GNU General Public License for more details. | |
36262 | + * | |
36263 | + * You should have received a copy of the GNU General Public License | |
36264 | + * along with this program; if not, write to the Free Software | |
36265 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
36266 | + */ | |
36267 | + | |
36268 | +#ifndef _ASM_I386_GNTTAB_DMA_H | |
36269 | +#define _ASM_I386_GNTTAB_DMA_H | |
36270 | + | |
36271 | +static inline int gnttab_dma_local_pfn(struct page *page) | |
36272 | +{ | |
36273 | + /* Has it become a local MFN? */ | |
36274 | + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); | |
36275 | +} | |
36276 | + | |
36277 | +static inline maddr_t gnttab_dma_map_page(struct page *page) | |
36278 | +{ | |
36279 | + __gnttab_dma_map_page(page); | |
36280 | + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); | |
36281 | +} | |
36282 | + | |
36283 | +static inline void gnttab_dma_unmap_page(maddr_t maddr) | |
36284 | +{ | |
36285 | + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); | |
36286 | +} | |
36287 | + | |
36288 | +#endif /* _ASM_I386_GNTTAB_DMA_H */ | |
36289 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h | |
36290 | =================================================================== | |
36291 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
36292 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100 | |
36293 | @@ -0,0 +1,97 @@ | |
36294 | +/* | |
36295 | + * highmem.h: virtual kernel memory mappings for high memory | |
36296 | + * | |
36297 | + * Used in CONFIG_HIGHMEM systems for memory pages which | |
36298 | + * are not addressable by direct kernel virtual addresses. | |
36299 | + * | |
36300 | + * Copyright (C) 1999 Gerhard Wichert, Siemens AG | |
36301 | + * Gerhard.Wichert@pdb.siemens.de | |
36302 | + * | |
36303 | + * | |
36304 | + * Redesigned the x86 32-bit VM architecture to deal with | |
36305 | + * up to 16 Terabyte physical memory. With current x86 CPUs | |
36306 | + * we now support up to 64 Gigabytes physical RAM. | |
36307 | + * | |
36308 | + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
36309 | + */ | |
36310 | + | |
36311 | +#ifndef _ASM_HIGHMEM_H | |
36312 | +#define _ASM_HIGHMEM_H | |
36313 | + | |
36314 | +#ifdef __KERNEL__ | |
36315 | + | |
36316 | +#include <linux/interrupt.h> | |
36317 | +#include <linux/threads.h> | |
36318 | +#include <asm/kmap_types.h> | |
36319 | +#include <asm/tlbflush.h> | |
36320 | + | |
36321 | +/* declarations for highmem.c */ | |
36322 | +extern unsigned long highstart_pfn, highend_pfn; | |
36323 | + | |
36324 | +extern pte_t *kmap_pte; | |
36325 | +extern pgprot_t kmap_prot; | |
36326 | +extern pte_t *pkmap_page_table; | |
36327 | + | |
36328 | +/* | |
36329 | + * Right now we initialize only a single pte table. It can be extended | |
36330 | + * easily, subsequent pte tables have to be allocated in one physical | |
36331 | + * chunk of RAM. | |
36332 | + */ | |
36333 | +#ifdef CONFIG_X86_PAE | |
36334 | +#define LAST_PKMAP 512 | |
36335 | +#else | |
36336 | +#define LAST_PKMAP 1024 | |
36337 | +#endif | |
36338 | +/* | |
36339 | + * Ordering is: | |
36340 | + * | |
36341 | + * FIXADDR_TOP | |
36342 | + * fixed_addresses | |
36343 | + * FIXADDR_START | |
36344 | + * temp fixed addresses | |
36345 | + * FIXADDR_BOOT_START | |
36346 | + * Persistent kmap area | |
36347 | + * PKMAP_BASE | |
36348 | + * VMALLOC_END | |
36349 | + * Vmalloc area | |
36350 | + * VMALLOC_START | |
36351 | + * high_memory | |
36352 | + */ | |
36353 | +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) | |
36354 | +#define LAST_PKMAP_MASK (LAST_PKMAP-1) | |
36355 | +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) | |
36356 | +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) | |
36357 | + | |
36358 | +extern void * FASTCALL(kmap_high(struct page *page)); | |
36359 | +extern void FASTCALL(kunmap_high(struct page *page)); | |
36360 | + | |
36361 | +void *kmap(struct page *page); | |
36362 | +void kunmap(struct page *page); | |
36363 | +void *kmap_atomic(struct page *page, enum km_type type); | |
36364 | +void *kmap_atomic_pte(struct page *page, enum km_type type); | |
36365 | +void kunmap_atomic(void *kvaddr, enum km_type type); | |
36366 | +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); | |
36367 | +struct page *kmap_atomic_to_page(void *ptr); | |
36368 | + | |
36369 | +#define flush_cache_kmaps() do { } while (0) | |
36370 | + | |
36371 | +void clear_highpage(struct page *); | |
36372 | +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) | |
36373 | +{ | |
36374 | + clear_highpage(page); | |
36375 | +} | |
36376 | +#define __HAVE_ARCH_CLEAR_HIGHPAGE | |
36377 | +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE | |
36378 | + | |
36379 | +void copy_highpage(struct page *to, struct page *from); | |
36380 | +static inline void copy_user_highpage(struct page *to, struct page *from, | |
36381 | + unsigned long vaddr) | |
36382 | +{ | |
36383 | + copy_highpage(to, from); | |
36384 | +} | |
36385 | +#define __HAVE_ARCH_COPY_HIGHPAGE | |
36386 | +#define __HAVE_ARCH_COPY_USER_HIGHPAGE | |
36387 | + | |
36388 | +#endif /* __KERNEL__ */ | |
36389 | + | |
36390 | +#endif /* _ASM_HIGHMEM_H */ | |
36391 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h | |
36392 | =================================================================== | |
36393 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
36394 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h 2008-11-25 12:22:34.000000000 +0100 | |
36395 | @@ -0,0 +1,409 @@ | |
36396 | +/****************************************************************************** | |
36397 | + * hypercall.h | |
36398 | + * | |
36399 | + * Linux-specific hypervisor handling. | |
36400 | + * | |
36401 | + * Copyright (c) 2002-2004, K A Fraser | |
36402 | + * | |
36403 | + * This program is free software; you can redistribute it and/or | |
36404 | + * modify it under the terms of the GNU General Public License version 2 | |
36405 | + * as published by the Free Software Foundation; or, when distributed | |
36406 | + * separately from the Linux kernel or incorporated into other | |
36407 | + * software packages, subject to the following license: | |
36408 | + * | |
36409 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
36410 | + * of this source file (the "Software"), to deal in the Software without | |
36411 | + * restriction, including without limitation the rights to use, copy, modify, | |
36412 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
36413 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
36414 | + * the following conditions: | |
36415 | + * | |
36416 | + * The above copyright notice and this permission notice shall be included in | |
36417 | + * all copies or substantial portions of the Software. | |
36418 | + * | |
36419 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
36420 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
36421 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
36422 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
36423 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
36424 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
36425 | + * IN THE SOFTWARE. | |
36426 | + */ | |
36427 | + | |
36428 | +#ifndef __HYPERCALL_H__ | |
36429 | +#define __HYPERCALL_H__ | |
36430 | + | |
36431 | +#include <linux/string.h> /* memcpy() */ | |
36432 | +#include <linux/stringify.h> | |
36433 | + | |
36434 | +#ifndef __HYPERVISOR_H__ | |
36435 | +# error "please don't include this file directly" | |
36436 | +#endif | |
36437 | + | |
36438 | +#ifdef CONFIG_XEN | |
36439 | +#define HYPERCALL_STR(name) \ | |
36440 | + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)" | |
36441 | +#else | |
36442 | +#define HYPERCALL_STR(name) \ | |
36443 | + "mov hypercall_stubs,%%eax; " \ | |
36444 | + "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\ | |
36445 | + "call *%%eax" | |
36446 | +#endif | |
36447 | + | |
36448 | +#define _hypercall0(type, name) \ | |
36449 | +({ \ | |
36450 | + type __res; \ | |
36451 | + asm volatile ( \ | |
36452 | + HYPERCALL_STR(name) \ | |
36453 | + : "=a" (__res) \ | |
36454 | + : \ | |
36455 | + : "memory" ); \ | |
36456 | + __res; \ | |
36457 | +}) | |
36458 | + | |
36459 | +#define _hypercall1(type, name, a1) \ | |
36460 | +({ \ | |
36461 | + type __res; \ | |
36462 | + long __ign1; \ | |
36463 | + asm volatile ( \ | |
36464 | + HYPERCALL_STR(name) \ | |
36465 | + : "=a" (__res), "=b" (__ign1) \ | |
36466 | + : "1" ((long)(a1)) \ | |
36467 | + : "memory" ); \ | |
36468 | + __res; \ | |
36469 | +}) | |
36470 | + | |
36471 | +#define _hypercall2(type, name, a1, a2) \ | |
36472 | +({ \ | |
36473 | + type __res; \ | |
36474 | + long __ign1, __ign2; \ | |
36475 | + asm volatile ( \ | |
36476 | + HYPERCALL_STR(name) \ | |
36477 | + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ | |
36478 | + : "1" ((long)(a1)), "2" ((long)(a2)) \ | |
36479 | + : "memory" ); \ | |
36480 | + __res; \ | |
36481 | +}) | |
36482 | + | |
36483 | +#define _hypercall3(type, name, a1, a2, a3) \ | |
36484 | +({ \ | |
36485 | + type __res; \ | |
36486 | + long __ign1, __ign2, __ign3; \ | |
36487 | + asm volatile ( \ | |
36488 | + HYPERCALL_STR(name) \ | |
36489 | + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | |
36490 | + "=d" (__ign3) \ | |
36491 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
36492 | + "3" ((long)(a3)) \ | |
36493 | + : "memory" ); \ | |
36494 | + __res; \ | |
36495 | +}) | |
36496 | + | |
36497 | +#define _hypercall4(type, name, a1, a2, a3, a4) \ | |
36498 | +({ \ | |
36499 | + type __res; \ | |
36500 | + long __ign1, __ign2, __ign3, __ign4; \ | |
36501 | + asm volatile ( \ | |
36502 | + HYPERCALL_STR(name) \ | |
36503 | + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | |
36504 | + "=d" (__ign3), "=S" (__ign4) \ | |
36505 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
36506 | + "3" ((long)(a3)), "4" ((long)(a4)) \ | |
36507 | + : "memory" ); \ | |
36508 | + __res; \ | |
36509 | +}) | |
36510 | + | |
36511 | +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ | |
36512 | +({ \ | |
36513 | + type __res; \ | |
36514 | + long __ign1, __ign2, __ign3, __ign4, __ign5; \ | |
36515 | + asm volatile ( \ | |
36516 | + HYPERCALL_STR(name) \ | |
36517 | + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ | |
36518 | + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ | |
36519 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
36520 | + "3" ((long)(a3)), "4" ((long)(a4)), \ | |
36521 | + "5" ((long)(a5)) \ | |
36522 | + : "memory" ); \ | |
36523 | + __res; \ | |
36524 | +}) | |
36525 | + | |
36526 | +static inline int __must_check | |
36527 | +HYPERVISOR_set_trap_table( | |
36528 | + const trap_info_t *table) | |
36529 | +{ | |
36530 | + return _hypercall1(int, set_trap_table, table); | |
36531 | +} | |
36532 | + | |
36533 | +static inline int __must_check | |
36534 | +HYPERVISOR_mmu_update( | |
36535 | + mmu_update_t *req, unsigned int count, unsigned int *success_count, | |
36536 | + domid_t domid) | |
36537 | +{ | |
36538 | + return _hypercall4(int, mmu_update, req, count, success_count, domid); | |
36539 | +} | |
36540 | + | |
36541 | +static inline int __must_check | |
36542 | +HYPERVISOR_mmuext_op( | |
36543 | + struct mmuext_op *op, unsigned int count, unsigned int *success_count, | |
36544 | + domid_t domid) | |
36545 | +{ | |
36546 | + return _hypercall4(int, mmuext_op, op, count, success_count, domid); | |
36547 | +} | |
36548 | + | |
36549 | +static inline int __must_check | |
36550 | +HYPERVISOR_set_gdt( | |
36551 | + unsigned long *frame_list, unsigned int entries) | |
36552 | +{ | |
36553 | + return _hypercall2(int, set_gdt, frame_list, entries); | |
36554 | +} | |
36555 | + | |
36556 | +static inline int __must_check | |
36557 | +HYPERVISOR_stack_switch( | |
36558 | + unsigned long ss, unsigned long esp) | |
36559 | +{ | |
36560 | + return _hypercall2(int, stack_switch, ss, esp); | |
36561 | +} | |
36562 | + | |
36563 | +static inline int __must_check | |
36564 | +HYPERVISOR_set_callbacks( | |
36565 | + unsigned long event_selector, unsigned long event_address, | |
36566 | + unsigned long failsafe_selector, unsigned long failsafe_address) | |
36567 | +{ | |
36568 | + return _hypercall4(int, set_callbacks, | |
36569 | + event_selector, event_address, | |
36570 | + failsafe_selector, failsafe_address); | |
36571 | +} | |
36572 | + | |
36573 | +static inline int | |
36574 | +HYPERVISOR_fpu_taskswitch( | |
36575 | + int set) | |
36576 | +{ | |
36577 | + return _hypercall1(int, fpu_taskswitch, set); | |
36578 | +} | |
36579 | + | |
36580 | +static inline int __must_check | |
36581 | +HYPERVISOR_sched_op_compat( | |
36582 | + int cmd, unsigned long arg) | |
36583 | +{ | |
36584 | + return _hypercall2(int, sched_op_compat, cmd, arg); | |
36585 | +} | |
36586 | + | |
36587 | +static inline int __must_check | |
36588 | +HYPERVISOR_sched_op( | |
36589 | + int cmd, void *arg) | |
36590 | +{ | |
36591 | + return _hypercall2(int, sched_op, cmd, arg); | |
36592 | +} | |
36593 | + | |
36594 | +static inline long __must_check | |
36595 | +HYPERVISOR_set_timer_op( | |
36596 | + u64 timeout) | |
36597 | +{ | |
36598 | + unsigned long timeout_hi = (unsigned long)(timeout>>32); | |
36599 | + unsigned long timeout_lo = (unsigned long)timeout; | |
36600 | + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); | |
36601 | +} | |
36602 | + | |
36603 | +static inline int __must_check | |
36604 | +HYPERVISOR_platform_op( | |
36605 | + struct xen_platform_op *platform_op) | |
36606 | +{ | |
36607 | + platform_op->interface_version = XENPF_INTERFACE_VERSION; | |
36608 | + return _hypercall1(int, platform_op, platform_op); | |
36609 | +} | |
36610 | + | |
36611 | +static inline int __must_check | |
36612 | +HYPERVISOR_set_debugreg( | |
36613 | + unsigned int reg, unsigned long value) | |
36614 | +{ | |
36615 | + return _hypercall2(int, set_debugreg, reg, value); | |
36616 | +} | |
36617 | + | |
36618 | +static inline unsigned long __must_check | |
36619 | +HYPERVISOR_get_debugreg( | |
36620 | + unsigned int reg) | |
36621 | +{ | |
36622 | + return _hypercall1(unsigned long, get_debugreg, reg); | |
36623 | +} | |
36624 | + | |
36625 | +static inline int __must_check | |
36626 | +HYPERVISOR_update_descriptor( | |
36627 | + u64 ma, u64 desc) | |
36628 | +{ | |
36629 | + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); | |
36630 | +} | |
36631 | + | |
36632 | +static inline int __must_check | |
36633 | +HYPERVISOR_memory_op( | |
36634 | + unsigned int cmd, void *arg) | |
36635 | +{ | |
36636 | + return _hypercall2(int, memory_op, cmd, arg); | |
36637 | +} | |
36638 | + | |
36639 | +static inline int __must_check | |
36640 | +HYPERVISOR_multicall( | |
36641 | + multicall_entry_t *call_list, unsigned int nr_calls) | |
36642 | +{ | |
36643 | + return _hypercall2(int, multicall, call_list, nr_calls); | |
36644 | +} | |
36645 | + | |
36646 | +static inline int __must_check | |
36647 | +HYPERVISOR_update_va_mapping( | |
36648 | + unsigned long va, pte_t new_val, unsigned long flags) | |
36649 | +{ | |
36650 | + unsigned long pte_hi = 0; | |
36651 | +#ifdef CONFIG_X86_PAE | |
36652 | + pte_hi = new_val.pte_high; | |
36653 | +#endif | |
36654 | + return _hypercall4(int, update_va_mapping, va, | |
36655 | + new_val.pte_low, pte_hi, flags); | |
36656 | +} | |
36657 | + | |
36658 | +static inline int __must_check | |
36659 | +HYPERVISOR_event_channel_op( | |
36660 | + int cmd, void *arg) | |
36661 | +{ | |
36662 | + int rc = _hypercall2(int, event_channel_op, cmd, arg); | |
36663 | + | |
36664 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36665 | + if (unlikely(rc == -ENOSYS)) { | |
36666 | + struct evtchn_op op; | |
36667 | + op.cmd = cmd; | |
36668 | + memcpy(&op.u, arg, sizeof(op.u)); | |
36669 | + rc = _hypercall1(int, event_channel_op_compat, &op); | |
36670 | + memcpy(arg, &op.u, sizeof(op.u)); | |
36671 | + } | |
36672 | +#endif | |
36673 | + | |
36674 | + return rc; | |
36675 | +} | |
36676 | + | |
36677 | +static inline int __must_check | |
36678 | +HYPERVISOR_xen_version( | |
36679 | + int cmd, void *arg) | |
36680 | +{ | |
36681 | + return _hypercall2(int, xen_version, cmd, arg); | |
36682 | +} | |
36683 | + | |
36684 | +static inline int __must_check | |
36685 | +HYPERVISOR_console_io( | |
36686 | + int cmd, unsigned int count, char *str) | |
36687 | +{ | |
36688 | + return _hypercall3(int, console_io, cmd, count, str); | |
36689 | +} | |
36690 | + | |
36691 | +static inline int __must_check | |
36692 | +HYPERVISOR_physdev_op( | |
36693 | + int cmd, void *arg) | |
36694 | +{ | |
36695 | + int rc = _hypercall2(int, physdev_op, cmd, arg); | |
36696 | + | |
36697 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36698 | + if (unlikely(rc == -ENOSYS)) { | |
36699 | + struct physdev_op op; | |
36700 | + op.cmd = cmd; | |
36701 | + memcpy(&op.u, arg, sizeof(op.u)); | |
36702 | + rc = _hypercall1(int, physdev_op_compat, &op); | |
36703 | + memcpy(arg, &op.u, sizeof(op.u)); | |
36704 | + } | |
36705 | +#endif | |
36706 | + | |
36707 | + return rc; | |
36708 | +} | |
36709 | + | |
36710 | +static inline int __must_check | |
36711 | +HYPERVISOR_grant_table_op( | |
36712 | + unsigned int cmd, void *uop, unsigned int count) | |
36713 | +{ | |
36714 | + return _hypercall3(int, grant_table_op, cmd, uop, count); | |
36715 | +} | |
36716 | + | |
36717 | +static inline int __must_check | |
36718 | +HYPERVISOR_update_va_mapping_otherdomain( | |
36719 | + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) | |
36720 | +{ | |
36721 | + unsigned long pte_hi = 0; | |
36722 | +#ifdef CONFIG_X86_PAE | |
36723 | + pte_hi = new_val.pte_high; | |
36724 | +#endif | |
36725 | + return _hypercall5(int, update_va_mapping_otherdomain, va, | |
36726 | + new_val.pte_low, pte_hi, flags, domid); | |
36727 | +} | |
36728 | + | |
36729 | +static inline int __must_check | |
36730 | +HYPERVISOR_vm_assist( | |
36731 | + unsigned int cmd, unsigned int type) | |
36732 | +{ | |
36733 | + return _hypercall2(int, vm_assist, cmd, type); | |
36734 | +} | |
36735 | + | |
36736 | +static inline int __must_check | |
36737 | +HYPERVISOR_vcpu_op( | |
36738 | + int cmd, unsigned int vcpuid, void *extra_args) | |
36739 | +{ | |
36740 | + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); | |
36741 | +} | |
36742 | + | |
36743 | +static inline int __must_check | |
36744 | +HYPERVISOR_suspend( | |
36745 | + unsigned long srec) | |
36746 | +{ | |
36747 | + struct sched_shutdown sched_shutdown = { | |
36748 | + .reason = SHUTDOWN_suspend | |
36749 | + }; | |
36750 | + | |
36751 | + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, | |
36752 | + &sched_shutdown, srec); | |
36753 | + | |
36754 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36755 | + if (rc == -ENOSYS) | |
36756 | + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, | |
36757 | + SHUTDOWN_suspend, srec); | |
36758 | +#endif | |
36759 | + | |
36760 | + return rc; | |
36761 | +} | |
36762 | + | |
36763 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36764 | +static inline int | |
36765 | +HYPERVISOR_nmi_op( | |
36766 | + unsigned long op, void *arg) | |
36767 | +{ | |
36768 | + return _hypercall2(int, nmi_op, op, arg); | |
36769 | +} | |
36770 | +#endif | |
36771 | + | |
36772 | +#ifndef CONFIG_XEN | |
36773 | +static inline unsigned long __must_check | |
36774 | +HYPERVISOR_hvm_op( | |
36775 | + int op, void *arg) | |
36776 | +{ | |
36777 | + return _hypercall2(unsigned long, hvm_op, op, arg); | |
36778 | +} | |
36779 | +#endif | |
36780 | + | |
36781 | +static inline int __must_check | |
36782 | +HYPERVISOR_callback_op( | |
36783 | + int cmd, const void *arg) | |
36784 | +{ | |
36785 | + return _hypercall2(int, callback_op, cmd, arg); | |
36786 | +} | |
36787 | + | |
36788 | +static inline int __must_check | |
36789 | +HYPERVISOR_xenoprof_op( | |
36790 | + int op, void *arg) | |
36791 | +{ | |
36792 | + return _hypercall2(int, xenoprof_op, op, arg); | |
36793 | +} | |
36794 | + | |
36795 | +static inline int __must_check | |
36796 | +HYPERVISOR_kexec_op( | |
36797 | + unsigned long op, void *args) | |
36798 | +{ | |
36799 | + return _hypercall2(int, kexec_op, op, args); | |
36800 | +} | |
36801 | + | |
36802 | + | |
36803 | + | |
36804 | +#endif /* __HYPERCALL_H__ */ | |
36805 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h | |
36806 | =================================================================== | |
36807 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
36808 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h 2008-02-20 09:32:49.000000000 +0100 | |
36809 | @@ -0,0 +1,259 @@ | |
36810 | +/****************************************************************************** | |
36811 | + * hypervisor.h | |
36812 | + * | |
36813 | + * Linux-specific hypervisor handling. | |
36814 | + * | |
36815 | + * Copyright (c) 2002-2004, K A Fraser | |
36816 | + * | |
36817 | + * This program is free software; you can redistribute it and/or | |
36818 | + * modify it under the terms of the GNU General Public License version 2 | |
36819 | + * as published by the Free Software Foundation; or, when distributed | |
36820 | + * separately from the Linux kernel or incorporated into other | |
36821 | + * software packages, subject to the following license: | |
36822 | + * | |
36823 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
36824 | + * of this source file (the "Software"), to deal in the Software without | |
36825 | + * restriction, including without limitation the rights to use, copy, modify, | |
36826 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
36827 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
36828 | + * the following conditions: | |
36829 | + * | |
36830 | + * The above copyright notice and this permission notice shall be included in | |
36831 | + * all copies or substantial portions of the Software. | |
36832 | + * | |
36833 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
36834 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
36835 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
36836 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
36837 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
36838 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
36839 | + * IN THE SOFTWARE. | |
36840 | + */ | |
36841 | + | |
36842 | +#ifndef __HYPERVISOR_H__ | |
36843 | +#define __HYPERVISOR_H__ | |
36844 | + | |
36845 | +#include <linux/types.h> | |
36846 | +#include <linux/kernel.h> | |
36847 | +#include <linux/version.h> | |
36848 | +#include <linux/errno.h> | |
36849 | +#include <xen/interface/xen.h> | |
36850 | +#include <xen/interface/platform.h> | |
36851 | +#include <xen/interface/event_channel.h> | |
36852 | +#include <xen/interface/physdev.h> | |
36853 | +#include <xen/interface/sched.h> | |
36854 | +#include <xen/interface/nmi.h> | |
36855 | +#include <asm/ptrace.h> | |
36856 | +#include <asm/page.h> | |
36857 | +#if defined(__i386__) | |
36858 | +# ifdef CONFIG_X86_PAE | |
36859 | +# include <asm-generic/pgtable-nopud.h> | |
36860 | +# else | |
36861 | +# include <asm-generic/pgtable-nopmd.h> | |
36862 | +# endif | |
36863 | +#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) | |
36864 | +# include <asm-generic/pgtable-nopud.h> | |
36865 | +#endif | |
36866 | + | |
36867 | +extern shared_info_t *HYPERVISOR_shared_info; | |
36868 | + | |
36869 | +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu)) | |
36870 | +#ifdef CONFIG_SMP | |
36871 | +#define current_vcpu_info() vcpu_info(smp_processor_id()) | |
36872 | +#else | |
36873 | +#define current_vcpu_info() vcpu_info(0) | |
36874 | +#endif | |
36875 | + | |
36876 | +#ifdef CONFIG_X86_32 | |
36877 | +extern unsigned long hypervisor_virt_start; | |
36878 | +#endif | |
36879 | + | |
36880 | +/* arch/xen/i386/kernel/setup.c */ | |
36881 | +extern start_info_t *xen_start_info; | |
36882 | +#ifdef CONFIG_XEN_PRIVILEGED_GUEST | |
36883 | +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) | |
36884 | +#else | |
36885 | +#define is_initial_xendomain() 0 | |
36886 | +#endif | |
36887 | + | |
36888 | +/* arch/xen/kernel/evtchn.c */ | |
36889 | +/* Force a proper event-channel callback from Xen. */ | |
36890 | +void force_evtchn_callback(void); | |
36891 | + | |
36892 | +/* arch/xen/kernel/process.c */ | |
36893 | +void xen_cpu_idle (void); | |
36894 | + | |
36895 | +/* arch/xen/i386/kernel/hypervisor.c */ | |
36896 | +void do_hypervisor_callback(struct pt_regs *regs); | |
36897 | + | |
36898 | +/* arch/xen/i386/mm/hypervisor.c */ | |
36899 | +/* | |
36900 | + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already | |
36901 | + * be MACHINE addresses. | |
36902 | + */ | |
36903 | + | |
36904 | +void xen_pt_switch(unsigned long ptr); | |
36905 | +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ | |
36906 | +void xen_load_gs(unsigned int selector); /* x86_64 only */ | |
36907 | +void xen_tlb_flush(void); | |
36908 | +void xen_invlpg(unsigned long ptr); | |
36909 | + | |
36910 | +void xen_l1_entry_update(pte_t *ptr, pte_t val); | |
36911 | +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); | |
36912 | +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ | |
36913 | +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ | |
36914 | +void xen_pgd_pin(unsigned long ptr); | |
36915 | +void xen_pgd_unpin(unsigned long ptr); | |
36916 | + | |
36917 | +void xen_set_ldt(const void *ptr, unsigned int ents); | |
36918 | + | |
36919 | +#ifdef CONFIG_SMP | |
36920 | +#include <linux/cpumask.h> | |
36921 | +void xen_tlb_flush_all(void); | |
36922 | +void xen_invlpg_all(unsigned long ptr); | |
36923 | +void xen_tlb_flush_mask(cpumask_t *mask); | |
36924 | +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr); | |
36925 | +#endif | |
36926 | + | |
36927 | +/* Returns zero on success else negative errno. */ | |
36928 | +int xen_create_contiguous_region( | |
36929 | + unsigned long vstart, unsigned int order, unsigned int address_bits); | |
36930 | +void xen_destroy_contiguous_region( | |
36931 | + unsigned long vstart, unsigned int order); | |
36932 | + | |
36933 | +struct page; | |
36934 | + | |
36935 | +int xen_limit_pages_to_max_mfn( | |
36936 | + struct page *pages, unsigned int order, unsigned int address_bits); | |
36937 | + | |
36938 | +/* Turn jiffies into Xen system time. */ | |
36939 | +u64 jiffies_to_st(unsigned long jiffies); | |
36940 | + | |
36941 | +#ifdef CONFIG_XEN_SCRUB_PAGES | |
36942 | +void scrub_pages(void *, unsigned int); | |
36943 | +#else | |
36944 | +#define scrub_pages(_p,_n) ((void)0) | |
36945 | +#endif | |
36946 | + | |
36947 | +#include <xen/hypercall.h> | |
36948 | + | |
36949 | +#if defined(CONFIG_X86_64) | |
36950 | +#define MULTI_UVMFLAGS_INDEX 2 | |
36951 | +#define MULTI_UVMDOMID_INDEX 3 | |
36952 | +#else | |
36953 | +#define MULTI_UVMFLAGS_INDEX 3 | |
36954 | +#define MULTI_UVMDOMID_INDEX 4 | |
36955 | +#endif | |
36956 | + | |
36957 | +#ifdef CONFIG_XEN | |
36958 | +#define is_running_on_xen() 1 | |
36959 | +#else | |
36960 | +extern char *hypercall_stubs; | |
36961 | +#define is_running_on_xen() (!!hypercall_stubs) | |
36962 | +#endif | |
36963 | + | |
36964 | +static inline int | |
36965 | +HYPERVISOR_yield( | |
36966 | + void) | |
36967 | +{ | |
36968 | + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); | |
36969 | + | |
36970 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36971 | + if (rc == -ENOSYS) | |
36972 | + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); | |
36973 | +#endif | |
36974 | + | |
36975 | + return rc; | |
36976 | +} | |
36977 | + | |
36978 | +static inline int | |
36979 | +HYPERVISOR_block( | |
36980 | + void) | |
36981 | +{ | |
36982 | + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); | |
36983 | + | |
36984 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
36985 | + if (rc == -ENOSYS) | |
36986 | + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); | |
36987 | +#endif | |
36988 | + | |
36989 | + return rc; | |
36990 | +} | |
36991 | + | |
36992 | +static inline void /*__noreturn*/ | |
36993 | +HYPERVISOR_shutdown( | |
36994 | + unsigned int reason) | |
36995 | +{ | |
36996 | + struct sched_shutdown sched_shutdown = { | |
36997 | + .reason = reason | |
36998 | + }; | |
36999 | + | |
37000 | + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); | |
37001 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
37002 | + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); | |
37003 | +#endif | |
37004 | + /* Don't recurse needlessly. */ | |
37005 | + BUG_ON(reason != SHUTDOWN_crash); | |
37006 | + for(;;); | |
37007 | +} | |
37008 | + | |
37009 | +static inline int __must_check | |
37010 | +HYPERVISOR_poll( | |
37011 | + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) | |
37012 | +{ | |
37013 | + int rc; | |
37014 | + struct sched_poll sched_poll = { | |
37015 | + .nr_ports = nr_ports, | |
37016 | + .timeout = jiffies_to_st(timeout) | |
37017 | + }; | |
37018 | + set_xen_guest_handle(sched_poll.ports, ports); | |
37019 | + | |
37020 | + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); | |
37021 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
37022 | + if (rc == -ENOSYS) | |
37023 | + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); | |
37024 | +#endif | |
37025 | + | |
37026 | + return rc; | |
37027 | +} | |
37028 | + | |
37029 | +#ifdef CONFIG_XEN | |
37030 | + | |
37031 | +static inline void | |
37032 | +MULTI_update_va_mapping( | |
37033 | + multicall_entry_t *mcl, unsigned long va, | |
37034 | + pte_t new_val, unsigned long flags) | |
37035 | +{ | |
37036 | + mcl->op = __HYPERVISOR_update_va_mapping; | |
37037 | + mcl->args[0] = va; | |
37038 | +#if defined(CONFIG_X86_64) | |
37039 | + mcl->args[1] = new_val.pte; | |
37040 | +#elif defined(CONFIG_X86_PAE) | |
37041 | + mcl->args[1] = new_val.pte_low; | |
37042 | + mcl->args[2] = new_val.pte_high; | |
37043 | +#else | |
37044 | + mcl->args[1] = new_val.pte_low; | |
37045 | + mcl->args[2] = 0; | |
37046 | +#endif | |
37047 | + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; | |
37048 | +} | |
37049 | + | |
37050 | +static inline void | |
37051 | +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, | |
37052 | + void *uop, unsigned int count) | |
37053 | +{ | |
37054 | + mcl->op = __HYPERVISOR_grant_table_op; | |
37055 | + mcl->args[0] = cmd; | |
37056 | + mcl->args[1] = (unsigned long)uop; | |
37057 | + mcl->args[2] = count; | |
37058 | +} | |
37059 | + | |
37060 | +#else /* !defined(CONFIG_XEN) */ | |
37061 | + | |
37062 | +/* Multicalls not supported for HVM guests. */ | |
37063 | +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) | |
37064 | +#define MULTI_grant_table_op(a,b,c,d) ((void)0) | |
37065 | + | |
37066 | +#endif | |
37067 | + | |
37068 | +#endif /* __HYPERVISOR_H__ */ | |
37069 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h | |
37070 | =================================================================== | |
37071 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
37072 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h 2007-08-16 18:07:01.000000000 +0200 | |
37073 | @@ -0,0 +1,389 @@ | |
37074 | +#ifndef _ASM_IO_H | |
37075 | +#define _ASM_IO_H | |
37076 | + | |
37077 | +#include <linux/string.h> | |
37078 | +#include <linux/compiler.h> | |
37079 | + | |
37080 | +/* | |
37081 | + * This file contains the definitions for the x86 IO instructions | |
37082 | + * inb/inw/inl/outb/outw/outl and the "string versions" of the same | |
37083 | + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" | |
37084 | + * versions of the single-IO instructions (inb_p/inw_p/..). | |
37085 | + * | |
37086 | + * This file is not meant to be obfuscating: it's just complicated | |
37087 | + * to (a) handle it all in a way that makes gcc able to optimize it | |
37088 | + * as well as possible and (b) trying to avoid writing the same thing | |
37089 | + * over and over again with slight variations and possibly making a | |
37090 | + * mistake somewhere. | |
37091 | + */ | |
37092 | + | |
37093 | +/* | |
37094 | + * Thanks to James van Artsdalen for a better timing-fix than | |
37095 | + * the two short jumps: using outb's to a nonexistent port seems | |
37096 | + * to guarantee better timings even on fast machines. | |
37097 | + * | |
37098 | + * On the other hand, I'd like to be sure of a non-existent port: | |
37099 | + * I feel a bit unsafe about using 0x80 (should be safe, though) | |
37100 | + * | |
37101 | + * Linus | |
37102 | + */ | |
37103 | + | |
37104 | + /* | |
37105 | + * Bit simplified and optimized by Jan Hubicka | |
37106 | + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. | |
37107 | + * | |
37108 | + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, | |
37109 | + * isa_read[wl] and isa_write[wl] fixed | |
37110 | + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> | |
37111 | + */ | |
37112 | + | |
37113 | +#define IO_SPACE_LIMIT 0xffff | |
37114 | + | |
37115 | +#define XQUAD_PORTIO_BASE 0xfe400000 | |
37116 | +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ | |
37117 | + | |
37118 | +#ifdef __KERNEL__ | |
37119 | + | |
37120 | +#include <asm-generic/iomap.h> | |
37121 | + | |
37122 | +#include <linux/vmalloc.h> | |
37123 | +#include <asm/fixmap.h> | |
37124 | + | |
37125 | +/* | |
37126 | + * Convert a physical pointer to a virtual kernel pointer for /dev/mem | |
37127 | + * access | |
37128 | + */ | |
37129 | +#define xlate_dev_mem_ptr(p) __va(p) | |
37130 | + | |
37131 | +/* | |
37132 | + * Convert a virtual cached pointer to an uncached pointer | |
37133 | + */ | |
37134 | +#define xlate_dev_kmem_ptr(p) p | |
37135 | + | |
37136 | +/** | |
37137 | + * virt_to_phys - map virtual addresses to physical | |
37138 | + * @address: address to remap | |
37139 | + * | |
37140 | + * The returned physical address is the physical (CPU) mapping for | |
37141 | + * the memory address given. It is only valid to use this function on | |
37142 | + * addresses directly mapped or allocated via kmalloc. | |
37143 | + * | |
37144 | + * This function does not give bus mappings for DMA transfers. In | |
37145 | + * almost all conceivable cases a device driver should not be using | |
37146 | + * this function | |
37147 | + */ | |
37148 | + | |
37149 | +static inline unsigned long virt_to_phys(volatile void * address) | |
37150 | +{ | |
37151 | + return __pa(address); | |
37152 | +} | |
37153 | + | |
37154 | +/** | |
37155 | + * phys_to_virt - map physical address to virtual | |
37156 | + * @address: address to remap | |
37157 | + * | |
37158 | + * The returned virtual address is a current CPU mapping for | |
37159 | + * the memory address given. It is only valid to use this function on | |
37160 | + * addresses that have a kernel mapping | |
37161 | + * | |
37162 | + * This function does not handle bus mappings for DMA transfers. In | |
37163 | + * almost all conceivable cases a device driver should not be using | |
37164 | + * this function | |
37165 | + */ | |
37166 | + | |
37167 | +static inline void * phys_to_virt(unsigned long address) | |
37168 | +{ | |
37169 | + return __va(address); | |
37170 | +} | |
37171 | + | |
37172 | +/* | |
37173 | + * Change "struct page" to physical address. | |
37174 | + */ | |
37175 | +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) | |
37176 | +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) | |
37177 | +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) | |
37178 | + | |
37179 | +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \ | |
37180 | + (unsigned long) bio_offset((bio))) | |
37181 | +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ | |
37182 | + (unsigned long) (bv)->bv_offset) | |
37183 | + | |
37184 | +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ | |
37185 | + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \ | |
37186 | + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ | |
37187 | + bvec_to_pseudophys((vec2)))) | |
37188 | + | |
37189 | +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); | |
37190 | + | |
37191 | +/** | |
37192 | + * ioremap - map bus memory into CPU space | |
37193 | + * @offset: bus address of the memory | |
37194 | + * @size: size of the resource to map | |
37195 | + * | |
37196 | + * ioremap performs a platform specific sequence of operations to | |
37197 | + * make bus memory CPU accessible via the readb/readw/readl/writeb/ | |
37198 | + * writew/writel functions and the other mmio helpers. The returned | |
37199 | + * address is not guaranteed to be usable directly as a virtual | |
37200 | + * address. | |
37201 | + */ | |
37202 | + | |
37203 | +static inline void __iomem * ioremap(unsigned long offset, unsigned long size) | |
37204 | +{ | |
37205 | + return __ioremap(offset, size, 0); | |
37206 | +} | |
37207 | + | |
37208 | +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size); | |
37209 | +extern void iounmap(volatile void __iomem *addr); | |
37210 | + | |
37211 | +/* | |
37212 | + * bt_ioremap() and bt_iounmap() are for temporary early boot-time | |
37213 | + * mappings, before the real ioremap() is functional. | |
37214 | + * A boot-time mapping is currently limited to at most 16 pages. | |
37215 | + */ | |
37216 | +extern void *bt_ioremap(unsigned long offset, unsigned long size); | |
37217 | +extern void bt_iounmap(void *addr, unsigned long size); | |
37218 | + | |
37219 | +/* Use early IO mappings for DMI because it's initialized early */ | |
37220 | +#define dmi_ioremap bt_ioremap | |
37221 | +#define dmi_iounmap bt_iounmap | |
37222 | +#define dmi_alloc alloc_bootmem | |
37223 | + | |
37224 | +/* | |
37225 | + * ISA I/O bus memory addresses are 1:1 with the physical address. | |
37226 | + */ | |
37227 | +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); }) | |
37228 | +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x | |
37229 | +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) | |
37230 | + | |
37231 | +/* | |
37232 | + * However PCI ones are not necessarily 1:1 and therefore these interfaces | |
37233 | + * are forbidden in portable PCI drivers. | |
37234 | + * | |
37235 | + * Allow them on x86 for legacy drivers, though. | |
37236 | + */ | |
37237 | +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) | |
37238 | +#define bus_to_virt(_x) __va(machine_to_phys(_x)) | |
37239 | + | |
37240 | +/* | |
37241 | + * readX/writeX() are used to access memory mapped devices. On some | |
37242 | + * architectures the memory mapped IO stuff needs to be accessed | |
37243 | + * differently. On the x86 architecture, we just read/write the | |
37244 | + * memory location directly. | |
37245 | + */ | |
37246 | + | |
37247 | +static inline unsigned char readb(const volatile void __iomem *addr) | |
37248 | +{ | |
37249 | + return *(volatile unsigned char __force *) addr; | |
37250 | +} | |
37251 | +static inline unsigned short readw(const volatile void __iomem *addr) | |
37252 | +{ | |
37253 | + return *(volatile unsigned short __force *) addr; | |
37254 | +} | |
37255 | +static inline unsigned int readl(const volatile void __iomem *addr) | |
37256 | +{ | |
37257 | + return *(volatile unsigned int __force *) addr; | |
37258 | +} | |
37259 | +#define readb_relaxed(addr) readb(addr) | |
37260 | +#define readw_relaxed(addr) readw(addr) | |
37261 | +#define readl_relaxed(addr) readl(addr) | |
37262 | +#define __raw_readb readb | |
37263 | +#define __raw_readw readw | |
37264 | +#define __raw_readl readl | |
37265 | + | |
37266 | +static inline void writeb(unsigned char b, volatile void __iomem *addr) | |
37267 | +{ | |
37268 | + *(volatile unsigned char __force *) addr = b; | |
37269 | +} | |
37270 | +static inline void writew(unsigned short b, volatile void __iomem *addr) | |
37271 | +{ | |
37272 | + *(volatile unsigned short __force *) addr = b; | |
37273 | +} | |
37274 | +static inline void writel(unsigned int b, volatile void __iomem *addr) | |
37275 | +{ | |
37276 | + *(volatile unsigned int __force *) addr = b; | |
37277 | +} | |
37278 | +#define __raw_writeb writeb | |
37279 | +#define __raw_writew writew | |
37280 | +#define __raw_writel writel | |
37281 | + | |
37282 | +#define mmiowb() | |
37283 | + | |
37284 | +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count) | |
37285 | +{ | |
37286 | + memset((void __force *) addr, val, count); | |
37287 | +} | |
37288 | +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count) | |
37289 | +{ | |
37290 | + __memcpy(dst, (void __force *) src, count); | |
37291 | +} | |
37292 | +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count) | |
37293 | +{ | |
37294 | + __memcpy((void __force *) dst, src, count); | |
37295 | +} | |
37296 | + | |
37297 | +/* | |
37298 | + * ISA space is 'always mapped' on a typical x86 system, no need to | |
37299 | + * explicitly ioremap() it. The fact that the ISA IO space is mapped | |
37300 | + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values | |
37301 | + * are physical addresses. The following constant pointer can be | |
37302 | + * used as the IO-area pointer (it can be iounmapped as well, so the | |
37303 | + * analogy with PCI is quite large): | |
37304 | + */ | |
37305 | +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN))) | |
37306 | + | |
37307 | +/* | |
37308 | + * Again, i386 does not require mem IO specific function. | |
37309 | + */ | |
37310 | + | |
37311 | +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d)) | |
37312 | + | |
37313 | +/** | |
37314 | + * check_signature - find BIOS signatures | |
37315 | + * @io_addr: mmio address to check | |
37316 | + * @signature: signature block | |
37317 | + * @length: length of signature | |
37318 | + * | |
37319 | + * Perform a signature comparison with the mmio address io_addr. This | |
37320 | + * address should have been obtained by ioremap. | |
37321 | + * Returns 1 on a match. | |
37322 | + */ | |
37323 | + | |
37324 | +static inline int check_signature(volatile void __iomem * io_addr, | |
37325 | + const unsigned char *signature, int length) | |
37326 | +{ | |
37327 | + int retval = 0; | |
37328 | + do { | |
37329 | + if (readb(io_addr) != *signature) | |
37330 | + goto out; | |
37331 | + io_addr++; | |
37332 | + signature++; | |
37333 | + length--; | |
37334 | + } while (length); | |
37335 | + retval = 1; | |
37336 | +out: | |
37337 | + return retval; | |
37338 | +} | |
37339 | + | |
37340 | +/* | |
37341 | + * Cache management | |
37342 | + * | |
37343 | + * This needed for two cases | |
37344 | + * 1. Out of order aware processors | |
37345 | + * 2. Accidentally out of order processors (PPro errata #51) | |
37346 | + */ | |
37347 | + | |
37348 | +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) | |
37349 | + | |
37350 | +static inline void flush_write_buffers(void) | |
37351 | +{ | |
37352 | + __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory"); | |
37353 | +} | |
37354 | + | |
37355 | +#define dma_cache_inv(_start,_size) flush_write_buffers() | |
37356 | +#define dma_cache_wback(_start,_size) flush_write_buffers() | |
37357 | +#define dma_cache_wback_inv(_start,_size) flush_write_buffers() | |
37358 | + | |
37359 | +#else | |
37360 | + | |
37361 | +/* Nothing to do */ | |
37362 | + | |
37363 | +#define dma_cache_inv(_start,_size) do { } while (0) | |
37364 | +#define dma_cache_wback(_start,_size) do { } while (0) | |
37365 | +#define dma_cache_wback_inv(_start,_size) do { } while (0) | |
37366 | +#define flush_write_buffers() | |
37367 | + | |
37368 | +#endif | |
37369 | + | |
37370 | +#endif /* __KERNEL__ */ | |
37371 | + | |
37372 | +#ifdef SLOW_IO_BY_JUMPING | |
37373 | +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:" | |
37374 | +#else | |
37375 | +#define __SLOW_DOWN_IO "outb %%al,$0x80;" | |
37376 | +#endif | |
37377 | + | |
37378 | +static inline void slow_down_io(void) { | |
37379 | + __asm__ __volatile__( | |
37380 | + __SLOW_DOWN_IO | |
37381 | +#ifdef REALLY_SLOW_IO | |
37382 | + __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO | |
37383 | +#endif | |
37384 | + : : ); | |
37385 | +} | |
37386 | + | |
37387 | +#ifdef CONFIG_X86_NUMAQ | |
37388 | +extern void *xquad_portio; /* Where the IO area was mapped */ | |
37389 | +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) | |
37390 | +#define __BUILDIO(bwl,bw,type) \ | |
37391 | +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \ | |
37392 | + if (xquad_portio) \ | |
37393 | + write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \ | |
37394 | + else \ | |
37395 | + out##bwl##_local(value, port); \ | |
37396 | +} \ | |
37397 | +static inline void out##bwl(unsigned type value, int port) { \ | |
37398 | + out##bwl##_quad(value, port, 0); \ | |
37399 | +} \ | |
37400 | +static inline unsigned type in##bwl##_quad(int port, int quad) { \ | |
37401 | + if (xquad_portio) \ | |
37402 | + return read##bwl(XQUAD_PORT_ADDR(port, quad)); \ | |
37403 | + else \ | |
37404 | + return in##bwl##_local(port); \ | |
37405 | +} \ | |
37406 | +static inline unsigned type in##bwl(int port) { \ | |
37407 | + return in##bwl##_quad(port, 0); \ | |
37408 | +} | |
37409 | +#else | |
37410 | +#define __BUILDIO(bwl,bw,type) \ | |
37411 | +static inline void out##bwl(unsigned type value, int port) { \ | |
37412 | + out##bwl##_local(value, port); \ | |
37413 | +} \ | |
37414 | +static inline unsigned type in##bwl(int port) { \ | |
37415 | + return in##bwl##_local(port); \ | |
37416 | +} | |
37417 | +#endif | |
37418 | + | |
37419 | + | |
37420 | +#define BUILDIO(bwl,bw,type) \ | |
37421 | +static inline void out##bwl##_local(unsigned type value, int port) { \ | |
37422 | + __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \ | |
37423 | +} \ | |
37424 | +static inline unsigned type in##bwl##_local(int port) { \ | |
37425 | + unsigned type value; \ | |
37426 | + __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \ | |
37427 | + return value; \ | |
37428 | +} \ | |
37429 | +static inline void out##bwl##_local_p(unsigned type value, int port) { \ | |
37430 | + out##bwl##_local(value, port); \ | |
37431 | + slow_down_io(); \ | |
37432 | +} \ | |
37433 | +static inline unsigned type in##bwl##_local_p(int port) { \ | |
37434 | + unsigned type value = in##bwl##_local(port); \ | |
37435 | + slow_down_io(); \ | |
37436 | + return value; \ | |
37437 | +} \ | |
37438 | +__BUILDIO(bwl,bw,type) \ | |
37439 | +static inline void out##bwl##_p(unsigned type value, int port) { \ | |
37440 | + out##bwl(value, port); \ | |
37441 | + slow_down_io(); \ | |
37442 | +} \ | |
37443 | +static inline unsigned type in##bwl##_p(int port) { \ | |
37444 | + unsigned type value = in##bwl(port); \ | |
37445 | + slow_down_io(); \ | |
37446 | + return value; \ | |
37447 | +} \ | |
37448 | +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \ | |
37449 | + __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \ | |
37450 | +} \ | |
37451 | +static inline void ins##bwl(int port, void *addr, unsigned long count) { \ | |
37452 | + __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \ | |
37453 | +} | |
37454 | + | |
37455 | +BUILDIO(b,b,char) | |
37456 | +BUILDIO(w,w,short) | |
37457 | +BUILDIO(l,,int) | |
37458 | + | |
37459 | +/* We will be supplying our own /dev/mem implementation */ | |
37460 | +#define ARCH_HAS_DEV_MEM | |
37461 | + | |
37462 | +#endif | |
37463 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h | |
37464 | =================================================================== | |
37465 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
37466 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200 | |
37467 | @@ -0,0 +1,127 @@ | |
37468 | +/* | |
37469 | + * include/asm-i386/irqflags.h | |
37470 | + * | |
37471 | + * IRQ flags handling | |
37472 | + * | |
37473 | + * This file gets included from lowlevel asm headers too, to provide | |
37474 | + * wrapped versions of the local_irq_*() APIs, based on the | |
37475 | + * raw_local_irq_*() functions from the lowlevel headers. | |
37476 | + */ | |
37477 | +#ifndef _ASM_IRQFLAGS_H | |
37478 | +#define _ASM_IRQFLAGS_H | |
37479 | + | |
37480 | +#ifndef __ASSEMBLY__ | |
37481 | + | |
37482 | +/* | |
37483 | + * The use of 'barrier' in the following reflects their use as local-lock | |
37484 | + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following | |
37485 | + * critical operations are executed. All critical operations must complete | |
37486 | + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also | |
37487 | + * includes these barriers, for example. | |
37488 | + */ | |
37489 | + | |
37490 | +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) | |
37491 | + | |
37492 | +#define raw_local_save_flags(flags) \ | |
37493 | + do { (flags) = __raw_local_save_flags(); } while (0) | |
37494 | + | |
37495 | +#define raw_local_irq_restore(x) \ | |
37496 | +do { \ | |
37497 | + vcpu_info_t *_vcpu; \ | |
37498 | + barrier(); \ | |
37499 | + _vcpu = current_vcpu_info(); \ | |
37500 | + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ | |
37501 | + barrier(); /* unmask then check (avoid races) */ \ | |
37502 | + if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
37503 | + force_evtchn_callback(); \ | |
37504 | + } \ | |
37505 | +} while (0) | |
37506 | + | |
37507 | +#define raw_local_irq_disable() \ | |
37508 | +do { \ | |
37509 | + current_vcpu_info()->evtchn_upcall_mask = 1; \ | |
37510 | + barrier(); \ | |
37511 | +} while (0) | |
37512 | + | |
37513 | +#define raw_local_irq_enable() \ | |
37514 | +do { \ | |
37515 | + vcpu_info_t *_vcpu; \ | |
37516 | + barrier(); \ | |
37517 | + _vcpu = current_vcpu_info(); \ | |
37518 | + _vcpu->evtchn_upcall_mask = 0; \ | |
37519 | + barrier(); /* unmask then check (avoid races) */ \ | |
37520 | + if (unlikely(_vcpu->evtchn_upcall_pending)) \ | |
37521 | + force_evtchn_callback(); \ | |
37522 | +} while (0) | |
37523 | + | |
37524 | +/* | |
37525 | + * Used in the idle loop; sti takes one instruction cycle | |
37526 | + * to complete: | |
37527 | + */ | |
37528 | +void raw_safe_halt(void); | |
37529 | + | |
37530 | +/* | |
37531 | + * Used when interrupts are already enabled or to | |
37532 | + * shutdown the processor: | |
37533 | + */ | |
37534 | +void halt(void); | |
37535 | + | |
37536 | +static inline int raw_irqs_disabled_flags(unsigned long flags) | |
37537 | +{ | |
37538 | + return (flags != 0); | |
37539 | +} | |
37540 | + | |
37541 | +#define raw_irqs_disabled() \ | |
37542 | +({ \ | |
37543 | + unsigned long flags = __raw_local_save_flags(); \ | |
37544 | + \ | |
37545 | + raw_irqs_disabled_flags(flags); \ | |
37546 | +}) | |
37547 | + | |
37548 | +/* | |
37549 | + * For spinlocks, etc: | |
37550 | + */ | |
37551 | +#define __raw_local_irq_save() \ | |
37552 | +({ \ | |
37553 | + unsigned long flags = __raw_local_save_flags(); \ | |
37554 | + \ | |
37555 | + raw_local_irq_disable(); \ | |
37556 | + \ | |
37557 | + flags; \ | |
37558 | +}) | |
37559 | + | |
37560 | +#define raw_local_irq_save(flags) \ | |
37561 | + do { (flags) = __raw_local_irq_save(); } while (0) | |
37562 | + | |
37563 | +#endif /* __ASSEMBLY__ */ | |
37564 | + | |
37565 | +/* | |
37566 | + * Do the CPU's IRQ-state tracing from assembly code. We call a | |
37567 | + * C function, so save all the C-clobbered registers: | |
37568 | + */ | |
37569 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
37570 | + | |
37571 | +# define TRACE_IRQS_ON \ | |
37572 | + pushl %eax; \ | |
37573 | + pushl %ecx; \ | |
37574 | + pushl %edx; \ | |
37575 | + call trace_hardirqs_on; \ | |
37576 | + popl %edx; \ | |
37577 | + popl %ecx; \ | |
37578 | + popl %eax; | |
37579 | + | |
37580 | +# define TRACE_IRQS_OFF \ | |
37581 | + pushl %eax; \ | |
37582 | + pushl %ecx; \ | |
37583 | + pushl %edx; \ | |
37584 | + call trace_hardirqs_off; \ | |
37585 | + popl %edx; \ | |
37586 | + popl %ecx; \ | |
37587 | + popl %eax; | |
37588 | + | |
37589 | +#else | |
37590 | +# define TRACE_IRQS_ON | |
37591 | +# define TRACE_IRQS_OFF | |
37592 | +#endif | |
37593 | + | |
37594 | +#endif | |
37595 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h | |
37596 | =================================================================== | |
37597 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
37598 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200 | |
37599 | @@ -0,0 +1,193 @@ | |
37600 | +#ifndef _I386_MADDR_H | |
37601 | +#define _I386_MADDR_H | |
37602 | + | |
37603 | +#include <xen/features.h> | |
37604 | +#include <xen/interface/xen.h> | |
37605 | + | |
37606 | +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ | |
37607 | +#define INVALID_P2M_ENTRY (~0UL) | |
37608 | +#define FOREIGN_FRAME_BIT (1UL<<31) | |
37609 | +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) | |
37610 | + | |
37611 | +/* Definitions for machine and pseudophysical addresses. */ | |
37612 | +#ifdef CONFIG_X86_PAE | |
37613 | +typedef unsigned long long paddr_t; | |
37614 | +typedef unsigned long long maddr_t; | |
37615 | +#else | |
37616 | +typedef unsigned long paddr_t; | |
37617 | +typedef unsigned long maddr_t; | |
37618 | +#endif | |
37619 | + | |
37620 | +#ifdef CONFIG_XEN | |
37621 | + | |
37622 | +extern unsigned long *phys_to_machine_mapping; | |
37623 | +extern unsigned long max_mapnr; | |
37624 | + | |
37625 | +#undef machine_to_phys_mapping | |
37626 | +extern unsigned long *machine_to_phys_mapping; | |
37627 | +extern unsigned int machine_to_phys_order; | |
37628 | + | |
37629 | +static inline unsigned long pfn_to_mfn(unsigned long pfn) | |
37630 | +{ | |
37631 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
37632 | + return pfn; | |
37633 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
37634 | + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; | |
37635 | +} | |
37636 | + | |
37637 | +static inline int phys_to_machine_mapping_valid(unsigned long pfn) | |
37638 | +{ | |
37639 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
37640 | + return 1; | |
37641 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
37642 | + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); | |
37643 | +} | |
37644 | + | |
37645 | +static inline unsigned long mfn_to_pfn(unsigned long mfn) | |
37646 | +{ | |
37647 | + unsigned long pfn; | |
37648 | + | |
37649 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
37650 | + return mfn; | |
37651 | + | |
37652 | + if (unlikely((mfn >> machine_to_phys_order) != 0)) | |
37653 | + return max_mapnr; | |
37654 | + | |
37655 | + /* The array access can fail (e.g., device space beyond end of RAM). */ | |
37656 | + asm ( | |
37657 | + "1: movl %1,%0\n" | |
37658 | + "2:\n" | |
37659 | + ".section .fixup,\"ax\"\n" | |
37660 | + "3: movl %2,%0\n" | |
37661 | + " jmp 2b\n" | |
37662 | + ".previous\n" | |
37663 | + ".section __ex_table,\"a\"\n" | |
37664 | + " .align 4\n" | |
37665 | + " .long 1b,3b\n" | |
37666 | + ".previous" | |
37667 | + : "=r" (pfn) | |
37668 | + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); | |
37669 | + | |
37670 | + return pfn; | |
37671 | +} | |
37672 | + | |
37673 | +/* | |
37674 | + * We detect special mappings in one of two ways: | |
37675 | + * 1. If the MFN is an I/O page then Xen will set the m2p entry | |
37676 | + * to be outside our maximum possible pseudophys range. | |
37677 | + * 2. If the MFN belongs to a different domain then we will certainly | |
37678 | + * not have MFN in our p2m table. Conversely, if the page is ours, | |
37679 | + * then we'll have p2m(m2p(MFN))==MFN. | |
37680 | + * If we detect a special mapping then it doesn't have a 'struct page'. | |
37681 | + * We force !pfn_valid() by returning an out-of-range pointer. | |
37682 | + * | |
37683 | + * NB. These checks require that, for any MFN that is not in our reservation, | |
37684 | + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if | |
37685 | + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. | |
37686 | + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. | |
37687 | + * | |
37688 | + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* | |
37689 | + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we | |
37690 | + * require. In all the cases we care about, the FOREIGN_FRAME bit is | |
37691 | + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. | |
37692 | + */ | |
37693 | +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | |
37694 | +{ | |
37695 | + unsigned long pfn = mfn_to_pfn(mfn); | |
37696 | + if ((pfn < max_mapnr) | |
37697 | + && !xen_feature(XENFEAT_auto_translated_physmap) | |
37698 | + && (phys_to_machine_mapping[pfn] != mfn)) | |
37699 | + return max_mapnr; /* force !pfn_valid() */ | |
37700 | + return pfn; | |
37701 | +} | |
37702 | + | |
37703 | +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | |
37704 | +{ | |
37705 | + BUG_ON(max_mapnr && pfn >= max_mapnr); | |
37706 | + if (xen_feature(XENFEAT_auto_translated_physmap)) { | |
37707 | + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | |
37708 | + return; | |
37709 | + } | |
37710 | + phys_to_machine_mapping[pfn] = mfn; | |
37711 | +} | |
37712 | + | |
37713 | +static inline maddr_t phys_to_machine(paddr_t phys) | |
37714 | +{ | |
37715 | + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); | |
37716 | + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); | |
37717 | + return machine; | |
37718 | +} | |
37719 | + | |
37720 | +static inline paddr_t machine_to_phys(maddr_t machine) | |
37721 | +{ | |
37722 | + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); | |
37723 | + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); | |
37724 | + return phys; | |
37725 | +} | |
37726 | + | |
37727 | +#ifdef CONFIG_X86_PAE | |
37728 | +static inline paddr_t pte_phys_to_machine(paddr_t phys) | |
37729 | +{ | |
37730 | + /* | |
37731 | + * In PAE mode, the NX bit needs to be dealt with in the value | |
37732 | + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, | |
37733 | + * but for i386 the conversion to ulong for the argument will | |
37734 | + * clip it off. | |
37735 | + */ | |
37736 | + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); | |
37737 | + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); | |
37738 | + return machine; | |
37739 | +} | |
37740 | + | |
37741 | +static inline paddr_t pte_machine_to_phys(maddr_t machine) | |
37742 | +{ | |
37743 | + /* | |
37744 | + * In PAE mode, the NX bit needs to be dealt with in the value | |
37745 | + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, | |
37746 | + * but for i386 the conversion to ulong for the argument will | |
37747 | + * clip it off. | |
37748 | + */ | |
37749 | + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); | |
37750 | + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); | |
37751 | + return phys; | |
37752 | +} | |
37753 | +#endif | |
37754 | + | |
37755 | +#ifdef CONFIG_X86_PAE | |
37756 | +#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } ) | |
37757 | +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) | |
37758 | +{ | |
37759 | + pte_t pte; | |
37760 | + | |
37761 | + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \ | |
37762 | + (pgprot_val(pgprot) >> 32); | |
37763 | + pte.pte_high &= (__supported_pte_mask >> 32); | |
37764 | + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \ | |
37765 | + __supported_pte_mask; | |
37766 | + return pte; | |
37767 | +} | |
37768 | +#else | |
37769 | +#define __pte_ma(x) ((pte_t) { (x) } ) | |
37770 | +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) | |
37771 | +#endif | |
37772 | + | |
37773 | +#else /* !CONFIG_XEN */ | |
37774 | + | |
37775 | +#define pfn_to_mfn(pfn) (pfn) | |
37776 | +#define mfn_to_pfn(mfn) (mfn) | |
37777 | +#define mfn_to_local_pfn(mfn) (mfn) | |
37778 | +#define set_phys_to_machine(pfn, mfn) ((void)0) | |
37779 | +#define phys_to_machine_mapping_valid(pfn) (1) | |
37780 | +#define phys_to_machine(phys) ((maddr_t)(phys)) | |
37781 | +#define machine_to_phys(mach) ((paddr_t)(mach)) | |
37782 | +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) | |
37783 | +#define __pte_ma(x) __pte(x) | |
37784 | + | |
37785 | +#endif /* !CONFIG_XEN */ | |
37786 | + | |
37787 | +/* VIRT <-> MACHINE conversion */ | |
37788 | +#define virt_to_machine(v) (phys_to_machine(__pa(v))) | |
37789 | +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) | |
37790 | +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) | |
37791 | + | |
37792 | +#endif /* _I386_MADDR_H */ | |
37793 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h | |
37794 | =================================================================== | |
37795 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
37796 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200 | |
37797 | @@ -0,0 +1,108 @@ | |
37798 | +#ifndef __I386_SCHED_H | |
37799 | +#define __I386_SCHED_H | |
37800 | + | |
37801 | +#include <asm/desc.h> | |
37802 | +#include <asm/atomic.h> | |
37803 | +#include <asm/pgalloc.h> | |
37804 | +#include <asm/tlbflush.h> | |
37805 | + | |
37806 | +/* | |
37807 | + * Used for LDT copy/destruction. | |
37808 | + */ | |
37809 | +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); | |
37810 | +void destroy_context(struct mm_struct *mm); | |
37811 | + | |
37812 | + | |
37813 | +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) | |
37814 | +{ | |
37815 | +#if 0 /* XEN: no lazy tlb */ | |
37816 | + unsigned cpu = smp_processor_id(); | |
37817 | + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) | |
37818 | + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; | |
37819 | +#endif | |
37820 | +} | |
37821 | + | |
37822 | +#define prepare_arch_switch(next) __prepare_arch_switch() | |
37823 | + | |
37824 | +static inline void __prepare_arch_switch(void) | |
37825 | +{ | |
37826 | + /* | |
37827 | + * Save away %fs and %gs. No need to save %es and %ds, as those | |
37828 | + * are always kernel segments while inside the kernel. Must | |
37829 | + * happen before reload of cr3/ldt (i.e., not in __switch_to). | |
37830 | + */ | |
37831 | + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1" | |
37832 | + : "=m" (current->thread.fs), | |
37833 | + "=m" (current->thread.gs)); | |
37834 | + asm volatile ( "movl %0,%%fs ; movl %0,%%gs" | |
37835 | + : : "r" (0) ); | |
37836 | +} | |
37837 | + | |
37838 | +extern void mm_pin(struct mm_struct *mm); | |
37839 | +extern void mm_unpin(struct mm_struct *mm); | |
37840 | +void mm_pin_all(void); | |
37841 | + | |
37842 | +static inline void switch_mm(struct mm_struct *prev, | |
37843 | + struct mm_struct *next, | |
37844 | + struct task_struct *tsk) | |
37845 | +{ | |
37846 | + int cpu = smp_processor_id(); | |
37847 | + struct mmuext_op _op[2], *op = _op; | |
37848 | + | |
37849 | + if (likely(prev != next)) { | |
37850 | + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && | |
37851 | + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)); | |
37852 | + | |
37853 | + /* stop flush ipis for the previous mm */ | |
37854 | + cpu_clear(cpu, prev->cpu_vm_mask); | |
37855 | +#if 0 /* XEN: no lazy tlb */ | |
37856 | + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; | |
37857 | + per_cpu(cpu_tlbstate, cpu).active_mm = next; | |
37858 | +#endif | |
37859 | + cpu_set(cpu, next->cpu_vm_mask); | |
37860 | + | |
37861 | + /* Re-load page tables: load_cr3(next->pgd) */ | |
37862 | + op->cmd = MMUEXT_NEW_BASEPTR; | |
37863 | + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); | |
37864 | + op++; | |
37865 | + | |
37866 | + /* | |
37867 | + * load the LDT, if the LDT is different: | |
37868 | + */ | |
37869 | + if (unlikely(prev->context.ldt != next->context.ldt)) { | |
37870 | + /* load_LDT_nolock(&next->context, cpu) */ | |
37871 | + op->cmd = MMUEXT_SET_LDT; | |
37872 | + op->arg1.linear_addr = (unsigned long)next->context.ldt; | |
37873 | + op->arg2.nr_ents = next->context.size; | |
37874 | + op++; | |
37875 | + } | |
37876 | + | |
37877 | + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); | |
37878 | + } | |
37879 | +#if 0 /* XEN: no lazy tlb */ | |
37880 | + else { | |
37881 | + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; | |
37882 | + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); | |
37883 | + | |
37884 | + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { | |
37885 | + /* We were in lazy tlb mode and leave_mm disabled | |
37886 | + * tlb flush IPI delivery. We must reload %cr3. | |
37887 | + */ | |
37888 | + load_cr3(next->pgd); | |
37889 | + load_LDT_nolock(&next->context, cpu); | |
37890 | + } | |
37891 | + } | |
37892 | +#endif | |
37893 | +} | |
37894 | + | |
37895 | +#define deactivate_mm(tsk, mm) \ | |
37896 | + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) | |
37897 | + | |
37898 | +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) | |
37899 | +{ | |
37900 | + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) | |
37901 | + mm_pin(next); | |
37902 | + switch_mm(prev, next, NULL); | |
37903 | +} | |
37904 | + | |
37905 | +#endif | |
37906 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h | |
37907 | =================================================================== | |
37908 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
37909 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h 2007-09-14 11:14:51.000000000 +0200 | |
37910 | @@ -0,0 +1,148 @@ | |
37911 | +#ifndef __i386_PCI_H | |
37912 | +#define __i386_PCI_H | |
37913 | + | |
37914 | + | |
37915 | +#ifdef __KERNEL__ | |
37916 | +#include <linux/mm.h> /* for struct page */ | |
37917 | + | |
37918 | +/* Can be used to override the logic in pci_scan_bus for skipping | |
37919 | + already-configured bus numbers - to be used for buggy BIOSes | |
37920 | + or architectures with incomplete PCI setup by the loader */ | |
37921 | + | |
37922 | +#ifdef CONFIG_PCI | |
37923 | +extern unsigned int pcibios_assign_all_busses(void); | |
37924 | +#else | |
37925 | +#define pcibios_assign_all_busses() 0 | |
37926 | +#endif | |
37927 | + | |
37928 | +#include <asm/hypervisor.h> | |
37929 | +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) | |
37930 | + | |
37931 | +extern unsigned long pci_mem_start; | |
37932 | +#define PCIBIOS_MIN_IO 0x1000 | |
37933 | +#define PCIBIOS_MIN_MEM (pci_mem_start) | |
37934 | + | |
37935 | +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 | |
37936 | + | |
37937 | +void pcibios_config_init(void); | |
37938 | +struct pci_bus * pcibios_scan_root(int bus); | |
37939 | + | |
37940 | +void pcibios_set_master(struct pci_dev *dev); | |
37941 | +void pcibios_penalize_isa_irq(int irq, int active); | |
37942 | +struct irq_routing_table *pcibios_get_irq_routing_table(void); | |
37943 | +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); | |
37944 | + | |
37945 | +/* Dynamic DMA mapping stuff. | |
37946 | + * i386 has everything mapped statically. | |
37947 | + */ | |
37948 | + | |
37949 | +#include <linux/types.h> | |
37950 | +#include <linux/slab.h> | |
37951 | +#include <asm/scatterlist.h> | |
37952 | +#include <linux/string.h> | |
37953 | +#include <asm/io.h> | |
37954 | + | |
37955 | +struct pci_dev; | |
37956 | + | |
37957 | +#ifdef CONFIG_SWIOTLB | |
37958 | + | |
37959 | + | |
37960 | +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */ | |
37961 | +#define PCI_DMA_BUS_IS_PHYS (0) | |
37962 | + | |
37963 | +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ | |
37964 | + dma_addr_t ADDR_NAME; | |
37965 | +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ | |
37966 | + __u32 LEN_NAME; | |
37967 | +#define pci_unmap_addr(PTR, ADDR_NAME) \ | |
37968 | + ((PTR)->ADDR_NAME) | |
37969 | +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ | |
37970 | + (((PTR)->ADDR_NAME) = (VAL)) | |
37971 | +#define pci_unmap_len(PTR, LEN_NAME) \ | |
37972 | + ((PTR)->LEN_NAME) | |
37973 | +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ | |
37974 | + (((PTR)->LEN_NAME) = (VAL)) | |
37975 | + | |
37976 | +#else | |
37977 | + | |
37978 | +/* The PCI address space does equal the physical memory | |
37979 | + * address space. The networking and block device layers use | |
37980 | + * this boolean for bounce buffer decisions. | |
37981 | + */ | |
37982 | +#define PCI_DMA_BUS_IS_PHYS (1) | |
37983 | + | |
37984 | +/* pci_unmap_{page,single} is a nop so... */ | |
37985 | +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) | |
37986 | +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) | |
37987 | +#define pci_unmap_addr(PTR, ADDR_NAME) (0) | |
37988 | +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) | |
37989 | +#define pci_unmap_len(PTR, LEN_NAME) (0) | |
37990 | +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) | |
37991 | + | |
37992 | +#endif | |
37993 | + | |
37994 | +/* This is always fine. */ | |
37995 | +#define pci_dac_dma_supported(pci_dev, mask) (1) | |
37996 | + | |
37997 | +static inline dma64_addr_t | |
37998 | +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) | |
37999 | +{ | |
38000 | + return ((dma64_addr_t) page_to_phys(page) + | |
38001 | + (dma64_addr_t) offset); | |
38002 | +} | |
38003 | + | |
38004 | +static inline struct page * | |
38005 | +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) | |
38006 | +{ | |
38007 | + return pfn_to_page(dma_addr >> PAGE_SHIFT); | |
38008 | +} | |
38009 | + | |
38010 | +static inline unsigned long | |
38011 | +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) | |
38012 | +{ | |
38013 | + return (dma_addr & ~PAGE_MASK); | |
38014 | +} | |
38015 | + | |
38016 | +static inline void | |
38017 | +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) | |
38018 | +{ | |
38019 | +} | |
38020 | + | |
38021 | +static inline void | |
38022 | +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) | |
38023 | +{ | |
38024 | + flush_write_buffers(); | |
38025 | +} | |
38026 | + | |
38027 | +#define HAVE_PCI_MMAP | |
38028 | +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, | |
38029 | + enum pci_mmap_state mmap_state, int write_combine); | |
38030 | + | |
38031 | + | |
38032 | +static inline void pcibios_add_platform_entries(struct pci_dev *dev) | |
38033 | +{ | |
38034 | +} | |
38035 | + | |
38036 | +#ifdef CONFIG_PCI | |
38037 | +static inline void pci_dma_burst_advice(struct pci_dev *pdev, | |
38038 | + enum pci_dma_burst_strategy *strat, | |
38039 | + unsigned long *strategy_parameter) | |
38040 | +{ | |
38041 | + *strat = PCI_DMA_BURST_INFINITY; | |
38042 | + *strategy_parameter = ~0UL; | |
38043 | +} | |
38044 | +#endif | |
38045 | + | |
38046 | +#endif /* __KERNEL__ */ | |
38047 | + | |
38048 | +#ifdef CONFIG_XEN_PCIDEV_FRONTEND | |
38049 | +#include <xen/pcifront.h> | |
38050 | +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */ | |
38051 | + | |
38052 | +/* implement the pci_ DMA API in terms of the generic device dma_ one */ | |
38053 | +#include <asm-generic/pci-dma-compat.h> | |
38054 | + | |
38055 | +/* generic pci stuff */ | |
38056 | +#include <asm-generic/pci.h> | |
38057 | + | |
38058 | +#endif /* __i386_PCI_H */ | |
38059 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h | |
38060 | =================================================================== | |
38061 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
38062 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200 | |
38063 | @@ -0,0 +1,59 @@ | |
38064 | +#ifndef _I386_PGALLOC_H | |
38065 | +#define _I386_PGALLOC_H | |
38066 | + | |
38067 | +#include <asm/fixmap.h> | |
38068 | +#include <linux/threads.h> | |
38069 | +#include <linux/mm.h> /* for struct page */ | |
38070 | +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ | |
38071 | + | |
38072 | +#define pmd_populate_kernel(mm, pmd, pte) \ | |
38073 | + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) | |
38074 | + | |
38075 | +#define pmd_populate(mm, pmd, pte) \ | |
38076 | +do { \ | |
38077 | + unsigned long pfn = page_to_pfn(pte); \ | |
38078 | + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \ | |
38079 | + if (!PageHighMem(pte)) \ | |
38080 | + BUG_ON(HYPERVISOR_update_va_mapping( \ | |
38081 | + (unsigned long)__va(pfn << PAGE_SHIFT), \ | |
38082 | + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \ | |
38083 | + else if (!test_and_set_bit(PG_pinned, &pte->flags)) \ | |
38084 | + kmap_flush_unused(); \ | |
38085 | + set_pmd(pmd, \ | |
38086 | + __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \ | |
38087 | + } else \ | |
38088 | + *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \ | |
38089 | +} while (0) | |
38090 | + | |
38091 | +/* | |
38092 | + * Allocate and free page tables. | |
38093 | + */ | |
38094 | +extern pgd_t *pgd_alloc(struct mm_struct *); | |
38095 | +extern void pgd_free(pgd_t *pgd); | |
38096 | + | |
38097 | +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); | |
38098 | +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); | |
38099 | + | |
38100 | +static inline void pte_free_kernel(pte_t *pte) | |
38101 | +{ | |
38102 | + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); | |
38103 | + free_page((unsigned long)pte); | |
38104 | +} | |
38105 | + | |
38106 | +extern void pte_free(struct page *pte); | |
38107 | + | |
38108 | +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) | |
38109 | + | |
38110 | +#ifdef CONFIG_X86_PAE | |
38111 | +/* | |
38112 | + * In the PAE case we free the pmds as part of the pgd. | |
38113 | + */ | |
38114 | +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) | |
38115 | +#define pmd_free(x) do { } while (0) | |
38116 | +#define __pmd_free_tlb(tlb,x) do { } while (0) | |
38117 | +#define pud_populate(mm, pmd, pte) BUG() | |
38118 | +#endif | |
38119 | + | |
38120 | +#define check_pgt_cache() do { } while (0) | |
38121 | + | |
38122 | +#endif /* _I386_PGALLOC_H */ | |
38123 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | |
38124 | =================================================================== | |
38125 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
38126 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200 | |
38127 | @@ -0,0 +1,24 @@ | |
38128 | +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H | |
38129 | +#define _I386_PGTABLE_3LEVEL_DEFS_H | |
38130 | + | |
38131 | +#define HAVE_SHARED_KERNEL_PMD 0 | |
38132 | + | |
38133 | +/* | |
38134 | + * PGDIR_SHIFT determines what a top-level page table entry can map | |
38135 | + */ | |
38136 | +#define PGDIR_SHIFT 30 | |
38137 | +#define PTRS_PER_PGD 4 | |
38138 | + | |
38139 | +/* | |
38140 | + * PMD_SHIFT determines the size of the area a middle-level | |
38141 | + * page table can map | |
38142 | + */ | |
38143 | +#define PMD_SHIFT 21 | |
38144 | +#define PTRS_PER_PMD 512 | |
38145 | + | |
38146 | +/* | |
38147 | + * entries per page directory level | |
38148 | + */ | |
38149 | +#define PTRS_PER_PTE 512 | |
38150 | + | |
38151 | +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */ | |
38152 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h | |
38153 | =================================================================== | |
38154 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
38155 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200 | |
38156 | @@ -0,0 +1,211 @@ | |
38157 | +#ifndef _I386_PGTABLE_3LEVEL_H | |
38158 | +#define _I386_PGTABLE_3LEVEL_H | |
38159 | + | |
38160 | +#include <asm-generic/pgtable-nopud.h> | |
38161 | + | |
38162 | +/* | |
38163 | + * Intel Physical Address Extension (PAE) Mode - three-level page | |
38164 | + * tables on PPro+ CPUs. | |
38165 | + * | |
38166 | + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | |
38167 | + */ | |
38168 | + | |
38169 | +#define pte_ERROR(e) \ | |
38170 | + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \ | |
38171 | + &(e), __pte_val(e), pte_pfn(e)) | |
38172 | +#define pmd_ERROR(e) \ | |
38173 | + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ | |
38174 | + &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT) | |
38175 | +#define pgd_ERROR(e) \ | |
38176 | + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ | |
38177 | + &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT) | |
38178 | + | |
38179 | +#define pud_none(pud) 0 | |
38180 | +#define pud_bad(pud) 0 | |
38181 | +#define pud_present(pud) 1 | |
38182 | + | |
38183 | +/* | |
38184 | + * Is the pte executable? | |
38185 | + */ | |
38186 | +static inline int pte_x(pte_t pte) | |
38187 | +{ | |
38188 | + return !(__pte_val(pte) & _PAGE_NX); | |
38189 | +} | |
38190 | + | |
38191 | +/* | |
38192 | + * All present user-pages with !NX bit are user-executable: | |
38193 | + */ | |
38194 | +static inline int pte_exec(pte_t pte) | |
38195 | +{ | |
38196 | + return pte_user(pte) && pte_x(pte); | |
38197 | +} | |
38198 | +/* | |
38199 | + * All present pages with !NX bit are kernel-executable: | |
38200 | + */ | |
38201 | +static inline int pte_exec_kernel(pte_t pte) | |
38202 | +{ | |
38203 | + return pte_x(pte); | |
38204 | +} | |
38205 | + | |
38206 | +/* Rules for using set_pte: the pte being assigned *must* be | |
38207 | + * either not present or in a state where the hardware will | |
38208 | + * not attempt to update the pte. In places where this is | |
38209 | + * not possible, use pte_get_and_clear to obtain the old pte | |
38210 | + * value and then use set_pte to update it. -ben | |
38211 | + */ | |
38212 | +#define __HAVE_ARCH_SET_PTE_ATOMIC | |
38213 | + | |
38214 | +static inline void set_pte(pte_t *ptep, pte_t pte) | |
38215 | +{ | |
38216 | + ptep->pte_high = pte.pte_high; | |
38217 | + smp_wmb(); | |
38218 | + ptep->pte_low = pte.pte_low; | |
38219 | +} | |
38220 | +#define set_pte_atomic(pteptr,pteval) \ | |
38221 | + set_64bit((unsigned long long *)(pteptr),__pte_val(pteval)) | |
38222 | + | |
38223 | +#define set_pte_at(_mm,addr,ptep,pteval) do { \ | |
38224 | + if (((_mm) != current->mm && (_mm) != &init_mm) || \ | |
38225 | + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ | |
38226 | + set_pte((ptep), (pteval)); \ | |
38227 | +} while (0) | |
38228 | + | |
38229 | +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \ | |
38230 | + if (((_mm) != current->mm && (_mm) != &init_mm) || \ | |
38231 | + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \ | |
38232 | + set_pte((ptep), (pteval)); \ | |
38233 | + xen_invlpg((addr)); \ | |
38234 | + } \ | |
38235 | +} while (0) | |
38236 | + | |
38237 | +#define set_pmd(pmdptr,pmdval) \ | |
38238 | + xen_l2_entry_update((pmdptr), (pmdval)) | |
38239 | +#define set_pud(pudptr,pudval) \ | |
38240 | + xen_l3_entry_update((pudptr), (pudval)) | |
38241 | + | |
38242 | +/* | |
38243 | + * Pentium-II erratum A13: in PAE mode we explicitly have to flush | |
38244 | + * the TLB via cr3 if the top-level pgd is changed... | |
38245 | + * We do not let the generic code free and clear pgd entries due to | |
38246 | + * this erratum. | |
38247 | + */ | |
38248 | +static inline void pud_clear (pud_t * pud) { } | |
38249 | + | |
38250 | +#define pud_page(pud) \ | |
38251 | +((struct page *) __va(pud_val(pud) & PAGE_MASK)) | |
38252 | + | |
38253 | +#define pud_page_kernel(pud) \ | |
38254 | +((unsigned long) __va(pud_val(pud) & PAGE_MASK)) | |
38255 | + | |
38256 | + | |
38257 | +/* Find an entry in the second-level page table.. */ | |
38258 | +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ | |
38259 | + pmd_index(address)) | |
38260 | + | |
38261 | +static inline int pte_none(pte_t pte) | |
38262 | +{ | |
38263 | + return !(pte.pte_low | pte.pte_high); | |
38264 | +} | |
38265 | + | |
38266 | +/* | |
38267 | + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table | |
38268 | + * entry, so clear the bottom half first and enforce ordering with a compiler | |
38269 | + * barrier. | |
38270 | + */ | |
38271 | +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
38272 | +{ | |
38273 | + if ((mm != current->mm && mm != &init_mm) | |
38274 | + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { | |
38275 | + ptep->pte_low = 0; | |
38276 | + smp_wmb(); | |
38277 | + ptep->pte_high = 0; | |
38278 | + } | |
38279 | +} | |
38280 | + | |
38281 | +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | |
38282 | + | |
38283 | +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
38284 | +{ | |
38285 | + pte_t pte = *ptep; | |
38286 | + if (!pte_none(pte)) { | |
38287 | + if ((mm != &init_mm) || | |
38288 | + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { | |
38289 | + uint64_t val = __pte_val(pte); | |
38290 | + if (__cmpxchg64(ptep, val, 0) != val) { | |
38291 | + /* xchg acts as a barrier before the setting of the high bits */ | |
38292 | + pte.pte_low = xchg(&ptep->pte_low, 0); | |
38293 | + pte.pte_high = ptep->pte_high; | |
38294 | + ptep->pte_high = 0; | |
38295 | + } | |
38296 | + } | |
38297 | + } | |
38298 | + return pte; | |
38299 | +} | |
38300 | + | |
38301 | +#define ptep_clear_flush(vma, addr, ptep) \ | |
38302 | +({ \ | |
38303 | + pte_t *__ptep = (ptep); \ | |
38304 | + pte_t __res = *__ptep; \ | |
38305 | + if (!pte_none(__res) && \ | |
38306 | + ((vma)->vm_mm != current->mm || \ | |
38307 | + HYPERVISOR_update_va_mapping(addr, __pte(0), \ | |
38308 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
38309 | + UVMF_INVLPG|UVMF_MULTI))) { \ | |
38310 | + __ptep->pte_low = 0; \ | |
38311 | + smp_wmb(); \ | |
38312 | + __ptep->pte_high = 0; \ | |
38313 | + flush_tlb_page(vma, addr); \ | |
38314 | + } \ | |
38315 | + __res; \ | |
38316 | +}) | |
38317 | + | |
38318 | +static inline int pte_same(pte_t a, pte_t b) | |
38319 | +{ | |
38320 | + return a.pte_low == b.pte_low && a.pte_high == b.pte_high; | |
38321 | +} | |
38322 | + | |
38323 | +#define pte_page(x) pfn_to_page(pte_pfn(x)) | |
38324 | + | |
38325 | +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ | |
38326 | + ((_pte).pte_high << (32-PAGE_SHIFT))) | |
38327 | +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ | |
38328 | + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) | |
38329 | +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \ | |
38330 | + (_pte).pte_low & _PAGE_PRESENT ? \ | |
38331 | + mfn_to_local_pfn(__pte_mfn(_pte)) : \ | |
38332 | + __pte_mfn(_pte)) | |
38333 | + | |
38334 | +extern unsigned long long __supported_pte_mask; | |
38335 | + | |
38336 | +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) | |
38337 | +{ | |
38338 | + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | | |
38339 | + pgprot_val(pgprot)) & __supported_pte_mask); | |
38340 | +} | |
38341 | + | |
38342 | +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) | |
38343 | +{ | |
38344 | + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | | |
38345 | + pgprot_val(pgprot)) & __supported_pte_mask); | |
38346 | +} | |
38347 | + | |
38348 | +/* | |
38349 | + * Bits 0, 6 and 7 are taken in the low part of the pte, | |
38350 | + * put the 32 bits of offset into the high part. | |
38351 | + */ | |
38352 | +#define pte_to_pgoff(pte) ((pte).pte_high) | |
38353 | +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) | |
38354 | +#define PTE_FILE_MAX_BITS 32 | |
38355 | + | |
38356 | +/* Encode and de-code a swap entry */ | |
38357 | +#define __swp_type(x) (((x).val) & 0x1f) | |
38358 | +#define __swp_offset(x) ((x).val >> 5) | |
38359 | +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) | |
38360 | +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) | |
38361 | +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) | |
38362 | + | |
38363 | +#define __pmd_free_tlb(tlb, x) do { } while (0) | |
38364 | + | |
38365 | +void vmalloc_sync_all(void); | |
38366 | + | |
38367 | +#endif /* _I386_PGTABLE_3LEVEL_H */ | |
38368 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h | |
38369 | =================================================================== | |
38370 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
38371 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-07-21 11:00:33.000000000 +0200 | |
38372 | @@ -0,0 +1,537 @@ | |
38373 | +#ifndef _I386_PGTABLE_H | |
38374 | +#define _I386_PGTABLE_H | |
38375 | + | |
38376 | +#include <asm/hypervisor.h> | |
38377 | + | |
38378 | +/* | |
38379 | + * The Linux memory management assumes a three-level page table setup. On | |
38380 | + * the i386, we use that, but "fold" the mid level into the top-level page | |
38381 | + * table, so that we physically have the same two-level page table as the | |
38382 | + * i386 mmu expects. | |
38383 | + * | |
38384 | + * This file contains the functions and defines necessary to modify and use | |
38385 | + * the i386 page table tree. | |
38386 | + */ | |
38387 | +#ifndef __ASSEMBLY__ | |
38388 | +#include <asm/processor.h> | |
38389 | +#include <asm/fixmap.h> | |
38390 | +#include <linux/threads.h> | |
38391 | + | |
38392 | +#ifndef _I386_BITOPS_H | |
38393 | +#include <asm/bitops.h> | |
38394 | +#endif | |
38395 | + | |
38396 | +#include <linux/slab.h> | |
38397 | +#include <linux/list.h> | |
38398 | +#include <linux/spinlock.h> | |
38399 | + | |
38400 | +/* Is this pagetable pinned? */ | |
38401 | +#define PG_pinned PG_arch_1 | |
38402 | + | |
38403 | +struct mm_struct; | |
38404 | +struct vm_area_struct; | |
38405 | + | |
38406 | +/* | |
38407 | + * ZERO_PAGE is a global shared page that is always zero: used | |
38408 | + * for zero-mapped memory areas etc.. | |
38409 | + */ | |
38410 | +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) | |
38411 | +extern unsigned long empty_zero_page[1024]; | |
38412 | +extern pgd_t *swapper_pg_dir; | |
38413 | +extern kmem_cache_t *pgd_cache; | |
38414 | +extern kmem_cache_t *pmd_cache; | |
38415 | +extern spinlock_t pgd_lock; | |
38416 | +extern struct page *pgd_list; | |
38417 | + | |
38418 | +void pmd_ctor(void *, kmem_cache_t *, unsigned long); | |
38419 | +void pgd_ctor(void *, kmem_cache_t *, unsigned long); | |
38420 | +void pgd_dtor(void *, kmem_cache_t *, unsigned long); | |
38421 | +void pgtable_cache_init(void); | |
38422 | +void paging_init(void); | |
38423 | + | |
38424 | +/* | |
38425 | + * The Linux x86 paging architecture is 'compile-time dual-mode', it | |
38426 | + * implements both the traditional 2-level x86 page tables and the | |
38427 | + * newer 3-level PAE-mode page tables. | |
38428 | + */ | |
38429 | +#ifdef CONFIG_X86_PAE | |
38430 | +# include <asm/pgtable-3level-defs.h> | |
38431 | +# define PMD_SIZE (1UL << PMD_SHIFT) | |
38432 | +# define PMD_MASK (~(PMD_SIZE-1)) | |
38433 | +#else | |
38434 | +# include <asm/pgtable-2level-defs.h> | |
38435 | +#endif | |
38436 | + | |
38437 | +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) | |
38438 | +#define PGDIR_MASK (~(PGDIR_SIZE-1)) | |
38439 | + | |
38440 | +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | |
38441 | +#define FIRST_USER_ADDRESS 0 | |
38442 | + | |
38443 | +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) | |
38444 | +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) | |
38445 | + | |
38446 | +#define TWOLEVEL_PGDIR_SHIFT 22 | |
38447 | +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) | |
38448 | +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) | |
38449 | + | |
38450 | +/* Just any arbitrary offset to the start of the vmalloc VM area: the | |
38451 | + * current 8MB value just means that there will be a 8MB "hole" after the | |
38452 | + * physical memory until the kernel virtual memory starts. That means that | |
38453 | + * any out-of-bounds memory accesses will hopefully be caught. | |
38454 | + * The vmalloc() routines leaves a hole of 4kB between each vmalloced | |
38455 | + * area for the same reason. ;) | |
38456 | + */ | |
38457 | +#define VMALLOC_OFFSET (8*1024*1024) | |
38458 | +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \ | |
38459 | + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) | |
38460 | +#ifdef CONFIG_HIGHMEM | |
38461 | +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) | |
38462 | +#else | |
38463 | +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) | |
38464 | +#endif | |
38465 | + | |
38466 | +/* | |
38467 | + * _PAGE_PSE set in the page directory entry just means that | |
38468 | + * the page directory entry points directly to a 4MB-aligned block of | |
38469 | + * memory. | |
38470 | + */ | |
38471 | +#define _PAGE_BIT_PRESENT 0 | |
38472 | +#define _PAGE_BIT_RW 1 | |
38473 | +#define _PAGE_BIT_USER 2 | |
38474 | +#define _PAGE_BIT_PWT 3 | |
38475 | +#define _PAGE_BIT_PCD 4 | |
38476 | +#define _PAGE_BIT_ACCESSED 5 | |
38477 | +#define _PAGE_BIT_DIRTY 6 | |
38478 | +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */ | |
38479 | +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | |
38480 | +/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */ | |
38481 | +#define _PAGE_BIT_UNUSED2 10 | |
38482 | +#define _PAGE_BIT_UNUSED3 11 | |
38483 | +#define _PAGE_BIT_NX 63 | |
38484 | + | |
38485 | +#define _PAGE_PRESENT 0x001 | |
38486 | +#define _PAGE_RW 0x002 | |
38487 | +#define _PAGE_USER 0x004 | |
38488 | +#define _PAGE_PWT 0x008 | |
38489 | +#define _PAGE_PCD 0x010 | |
38490 | +#define _PAGE_ACCESSED 0x020 | |
38491 | +#define _PAGE_DIRTY 0x040 | |
38492 | +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */ | |
38493 | +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ | |
38494 | +/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */ | |
38495 | +#define _PAGE_UNUSED2 0x400 | |
38496 | +#define _PAGE_UNUSED3 0x800 | |
38497 | + | |
38498 | +/* If _PAGE_PRESENT is clear, we use these: */ | |
38499 | +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ | |
38500 | +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; | |
38501 | + pte_present gives true */ | |
38502 | +#ifdef CONFIG_X86_PAE | |
38503 | +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX) | |
38504 | +#else | |
38505 | +#define _PAGE_NX 0 | |
38506 | +#endif | |
38507 | + | |
38508 | +/* Mapped page is I/O or foreign and has no associated page struct. */ | |
38509 | +#define _PAGE_IO 0x200 | |
38510 | + | |
38511 | +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) | |
38512 | +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
38513 | +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) | |
38514 | + | |
38515 | +#define PAGE_NONE \ | |
38516 | + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | |
38517 | +#define PAGE_SHARED \ | |
38518 | + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
38519 | + | |
38520 | +#define PAGE_SHARED_EXEC \ | |
38521 | + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
38522 | +#define PAGE_COPY_NOEXEC \ | |
38523 | + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
38524 | +#define PAGE_COPY_EXEC \ | |
38525 | + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
38526 | +#define PAGE_COPY \ | |
38527 | + PAGE_COPY_NOEXEC | |
38528 | +#define PAGE_READONLY \ | |
38529 | + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
38530 | +#define PAGE_READONLY_EXEC \ | |
38531 | + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
38532 | + | |
38533 | +#define _PAGE_KERNEL \ | |
38534 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) | |
38535 | +#define _PAGE_KERNEL_EXEC \ | |
38536 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) | |
38537 | + | |
38538 | +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; | |
38539 | +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) | |
38540 | +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) | |
38541 | +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) | |
38542 | +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) | |
38543 | + | |
38544 | +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) | |
38545 | +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) | |
38546 | +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) | |
38547 | +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) | |
38548 | +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) | |
38549 | +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) | |
38550 | + | |
38551 | +/* | |
38552 | + * The i386 can't do page protection for execute, and considers that | |
38553 | + * the same are read. Also, write permissions imply read permissions. | |
38554 | + * This is the closest we can get.. | |
38555 | + */ | |
38556 | +#define __P000 PAGE_NONE | |
38557 | +#define __P001 PAGE_READONLY | |
38558 | +#define __P010 PAGE_COPY | |
38559 | +#define __P011 PAGE_COPY | |
38560 | +#define __P100 PAGE_READONLY_EXEC | |
38561 | +#define __P101 PAGE_READONLY_EXEC | |
38562 | +#define __P110 PAGE_COPY_EXEC | |
38563 | +#define __P111 PAGE_COPY_EXEC | |
38564 | + | |
38565 | +#define __S000 PAGE_NONE | |
38566 | +#define __S001 PAGE_READONLY | |
38567 | +#define __S010 PAGE_SHARED | |
38568 | +#define __S011 PAGE_SHARED | |
38569 | +#define __S100 PAGE_READONLY_EXEC | |
38570 | +#define __S101 PAGE_READONLY_EXEC | |
38571 | +#define __S110 PAGE_SHARED_EXEC | |
38572 | +#define __S111 PAGE_SHARED_EXEC | |
38573 | + | |
38574 | +/* | |
38575 | + * Define this if things work differently on an i386 and an i486: | |
38576 | + * it will (on an i486) warn about kernel memory accesses that are | |
38577 | + * done without a 'access_ok(VERIFY_WRITE,..)' | |
38578 | + */ | |
38579 | +#undef TEST_ACCESS_OK | |
38580 | + | |
38581 | +/* The boot page tables (all created as a single array) */ | |
38582 | +extern unsigned long pg0[]; | |
38583 | + | |
38584 | +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) | |
38585 | + | |
38586 | +/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ | |
38587 | +#define pmd_none(x) (!(unsigned long)__pmd_val(x)) | |
38588 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
38589 | +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. | |
38590 | + can temporarily clear it. */ | |
38591 | +#define pmd_present(x) (__pmd_val(x)) | |
38592 | +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) | |
38593 | +#else | |
38594 | +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) | |
38595 | +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) | |
38596 | +#endif | |
38597 | + | |
38598 | + | |
38599 | +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) | |
38600 | + | |
38601 | +/* | |
38602 | + * The following only work if pte_present() is true. | |
38603 | + * Undefined behaviour if not.. | |
38604 | + */ | |
38605 | +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } | |
38606 | +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } | |
38607 | +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } | |
38608 | +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } | |
38609 | +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } | |
38610 | +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } | |
38611 | + | |
38612 | +/* | |
38613 | + * The following only works if pte_present() is not true. | |
38614 | + */ | |
38615 | +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } | |
38616 | + | |
38617 | +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } | |
38618 | +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } | |
38619 | +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } | |
38620 | +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } | |
38621 | +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } | |
38622 | +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } | |
38623 | +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } | |
38624 | +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } | |
38625 | +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } | |
38626 | +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } | |
38627 | +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } | |
38628 | + | |
38629 | +#ifdef CONFIG_X86_PAE | |
38630 | +# include <asm/pgtable-3level.h> | |
38631 | +#else | |
38632 | +# include <asm/pgtable-2level.h> | |
38633 | +#endif | |
38634 | + | |
38635 | +#define ptep_test_and_clear_dirty(vma, addr, ptep) \ | |
38636 | +({ \ | |
38637 | + pte_t __pte = *(ptep); \ | |
38638 | + int __ret = pte_dirty(__pte); \ | |
38639 | + if (__ret) { \ | |
38640 | + __pte = pte_mkclean(__pte); \ | |
38641 | + if ((vma)->vm_mm != current->mm || \ | |
38642 | + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ | |
38643 | + (ptep)->pte_low = __pte.pte_low; \ | |
38644 | + } \ | |
38645 | + __ret; \ | |
38646 | +}) | |
38647 | + | |
38648 | +#define ptep_test_and_clear_young(vma, addr, ptep) \ | |
38649 | +({ \ | |
38650 | + pte_t __pte = *(ptep); \ | |
38651 | + int __ret = pte_young(__pte); \ | |
38652 | + if (__ret) \ | |
38653 | + __pte = pte_mkold(__pte); \ | |
38654 | + if ((vma)->vm_mm != current->mm || \ | |
38655 | + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ | |
38656 | + (ptep)->pte_low = __pte.pte_low; \ | |
38657 | + __ret; \ | |
38658 | +}) | |
38659 | + | |
38660 | +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ | |
38661 | + ((full) ? ({ \ | |
38662 | + pte_t __res = *(ptep); \ | |
38663 | + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \ | |
38664 | + xen_l1_entry_update(ptep, __pte(0)); \ | |
38665 | + else \ | |
38666 | + *(ptep) = __pte(0); \ | |
38667 | + __res; \ | |
38668 | + }) : \ | |
38669 | + ptep_get_and_clear(mm, addr, ptep)) | |
38670 | + | |
38671 | +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
38672 | +{ | |
38673 | + pte_t pte = *ptep; | |
38674 | + if (pte_write(pte)) | |
38675 | + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); | |
38676 | +} | |
38677 | + | |
38678 | +/* | |
38679 | + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); | |
38680 | + * | |
38681 | + * dst - pointer to pgd range anwhere on a pgd page | |
38682 | + * src - "" | |
38683 | + * count - the number of pgds to copy. | |
38684 | + * | |
38685 | + * dst and src can be on the same page, but the range must not overlap, | |
38686 | + * and must not cross a page boundary. | |
38687 | + */ | |
38688 | +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) | |
38689 | +{ | |
38690 | + memcpy(dst, src, count * sizeof(pgd_t)); | |
38691 | +} | |
38692 | + | |
38693 | +/* | |
38694 | + * Macro to mark a page protection value as "uncacheable". On processors which do not support | |
38695 | + * it, this is a no-op. | |
38696 | + */ | |
38697 | +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ | |
38698 | + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) | |
38699 | + | |
38700 | +/* | |
38701 | + * Conversion functions: convert a page and protection to a page entry, | |
38702 | + * and a page entry and page directory to the page they refer to. | |
38703 | + */ | |
38704 | + | |
38705 | +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) | |
38706 | + | |
38707 | +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |
38708 | +{ | |
38709 | + /* | |
38710 | + * Since this might change the present bit (which controls whether | |
38711 | + * a pte_t object has undergone p2m translation), we must use | |
38712 | + * pte_val() on the input pte and __pte() for the return value. | |
38713 | + */ | |
38714 | + paddr_t pteval = pte_val(pte); | |
38715 | + | |
38716 | + pteval &= _PAGE_CHG_MASK; | |
38717 | + pteval |= pgprot_val(newprot); | |
38718 | +#ifdef CONFIG_X86_PAE | |
38719 | + pteval &= __supported_pte_mask; | |
38720 | +#endif | |
38721 | + return __pte(pteval); | |
38722 | +} | |
38723 | + | |
38724 | +#define pmd_large(pmd) \ | |
38725 | +((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) | |
38726 | + | |
38727 | +/* | |
38728 | + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] | |
38729 | + * | |
38730 | + * this macro returns the index of the entry in the pgd page which would | |
38731 | + * control the given virtual address | |
38732 | + */ | |
38733 | +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) | |
38734 | +#define pgd_index_k(addr) pgd_index(addr) | |
38735 | + | |
38736 | +/* | |
38737 | + * pgd_offset() returns a (pgd_t *) | |
38738 | + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; | |
38739 | + */ | |
38740 | +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) | |
38741 | + | |
38742 | +/* | |
38743 | + * a shortcut which implies the use of the kernel's pgd, instead | |
38744 | + * of a process's | |
38745 | + */ | |
38746 | +#define pgd_offset_k(address) pgd_offset(&init_mm, address) | |
38747 | + | |
38748 | +/* | |
38749 | + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] | |
38750 | + * | |
38751 | + * this macro returns the index of the entry in the pmd page which would | |
38752 | + * control the given virtual address | |
38753 | + */ | |
38754 | +#define pmd_index(address) \ | |
38755 | + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) | |
38756 | + | |
38757 | +/* | |
38758 | + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] | |
38759 | + * | |
38760 | + * this macro returns the index of the entry in the pte page which would | |
38761 | + * control the given virtual address | |
38762 | + */ | |
38763 | +#define pte_index(address) \ | |
38764 | + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) | |
38765 | +#define pte_offset_kernel(dir, address) \ | |
38766 | + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) | |
38767 | + | |
38768 | +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) | |
38769 | + | |
38770 | +#define pmd_page_kernel(pmd) \ | |
38771 | + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) | |
38772 | + | |
38773 | +/* | |
38774 | + * Helper function that returns the kernel pagetable entry controlling | |
38775 | + * the virtual address 'address'. NULL means no pagetable entry present. | |
38776 | + * NOTE: the return type is pte_t but if the pmd is PSE then we return it | |
38777 | + * as a pte too. | |
38778 | + */ | |
38779 | +extern pte_t *lookup_address(unsigned long address); | |
38780 | + | |
38781 | +/* | |
38782 | + * Make a given kernel text page executable/non-executable. | |
38783 | + * Returns the previous executability setting of that page (which | |
38784 | + * is used to restore the previous state). Used by the SMP bootup code. | |
38785 | + * NOTE: this is an __init function for security reasons. | |
38786 | + */ | |
38787 | +#ifdef CONFIG_X86_PAE | |
38788 | + extern int set_kernel_exec(unsigned long vaddr, int enable); | |
38789 | +#else | |
38790 | + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} | |
38791 | +#endif | |
38792 | + | |
38793 | +extern void noexec_setup(const char *str); | |
38794 | + | |
38795 | +#if defined(CONFIG_HIGHPTE) | |
38796 | +#define pte_offset_map(dir, address) \ | |
38797 | + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \ | |
38798 | + pte_index(address)) | |
38799 | +#define pte_offset_map_nested(dir, address) \ | |
38800 | + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \ | |
38801 | + pte_index(address)) | |
38802 | +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) | |
38803 | +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) | |
38804 | +#else | |
38805 | +#define pte_offset_map(dir, address) \ | |
38806 | + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) | |
38807 | +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) | |
38808 | +#define pte_unmap(pte) do { } while (0) | |
38809 | +#define pte_unmap_nested(pte) do { } while (0) | |
38810 | +#endif | |
38811 | + | |
38812 | +#define __HAVE_ARCH_PTEP_ESTABLISH | |
38813 | +#define ptep_establish(vma, address, ptep, pteval) \ | |
38814 | + do { \ | |
38815 | + if ( likely((vma)->vm_mm == current->mm) ) { \ | |
38816 | + BUG_ON(HYPERVISOR_update_va_mapping(address, \ | |
38817 | + pteval, \ | |
38818 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
38819 | + UVMF_INVLPG|UVMF_MULTI)); \ | |
38820 | + } else { \ | |
38821 | + xen_l1_entry_update(ptep, pteval); \ | |
38822 | + flush_tlb_page(vma, address); \ | |
38823 | + } \ | |
38824 | + } while (0) | |
38825 | + | |
38826 | +/* | |
38827 | + * The i386 doesn't have any external MMU info: the kernel page | |
38828 | + * tables contain all the necessary information. | |
38829 | + * | |
38830 | + * Also, we only update the dirty/accessed state if we set | |
38831 | + * the dirty bit by hand in the kernel, since the hardware | |
38832 | + * will do the accessed bit for us, and we don't want to | |
38833 | + * race with other CPU's that might be updating the dirty | |
38834 | + * bit at the same time. | |
38835 | + */ | |
38836 | +#define update_mmu_cache(vma,address,pte) do { } while (0) | |
38837 | +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | |
38838 | +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ | |
38839 | + do { \ | |
38840 | + if (dirty) \ | |
38841 | + ptep_establish(vma, address, ptep, entry); \ | |
38842 | + } while (0) | |
38843 | + | |
38844 | +#include <xen/features.h> | |
38845 | +void make_lowmem_page_readonly(void *va, unsigned int feature); | |
38846 | +void make_lowmem_page_writable(void *va, unsigned int feature); | |
38847 | +void make_page_readonly(void *va, unsigned int feature); | |
38848 | +void make_page_writable(void *va, unsigned int feature); | |
38849 | +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); | |
38850 | +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); | |
38851 | + | |
38852 | +#define virt_to_ptep(va) \ | |
38853 | +({ \ | |
38854 | + pte_t *__ptep = lookup_address((unsigned long)(va)); \ | |
38855 | + BUG_ON(!__ptep || !pte_present(*__ptep)); \ | |
38856 | + __ptep; \ | |
38857 | +}) | |
38858 | + | |
38859 | +#define arbitrary_virt_to_machine(va) \ | |
38860 | + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ | |
38861 | + | ((unsigned long)(va) & (PAGE_SIZE - 1))) | |
38862 | + | |
38863 | +#endif /* !__ASSEMBLY__ */ | |
38864 | + | |
38865 | +#ifdef CONFIG_FLATMEM | |
38866 | +#define kern_addr_valid(addr) (1) | |
38867 | +#endif /* CONFIG_FLATMEM */ | |
38868 | + | |
38869 | +int direct_remap_pfn_range(struct vm_area_struct *vma, | |
38870 | + unsigned long address, | |
38871 | + unsigned long mfn, | |
38872 | + unsigned long size, | |
38873 | + pgprot_t prot, | |
38874 | + domid_t domid); | |
38875 | +int direct_kernel_remap_pfn_range(unsigned long address, | |
38876 | + unsigned long mfn, | |
38877 | + unsigned long size, | |
38878 | + pgprot_t prot, | |
38879 | + domid_t domid); | |
38880 | +int create_lookup_pte_addr(struct mm_struct *mm, | |
38881 | + unsigned long address, | |
38882 | + uint64_t *ptep); | |
38883 | +int touch_pte_range(struct mm_struct *mm, | |
38884 | + unsigned long address, | |
38885 | + unsigned long size); | |
38886 | + | |
38887 | +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
38888 | + unsigned long addr, unsigned long end, pgprot_t newprot); | |
38889 | + | |
38890 | +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ | |
38891 | + xen_change_pte_range(mm, pmd, addr, end, newprot) | |
38892 | + | |
38893 | +#define io_remap_pfn_range(vma,from,pfn,size,prot) \ | |
38894 | +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) | |
38895 | + | |
38896 | +#define MK_IOSPACE_PFN(space, pfn) (pfn) | |
38897 | +#define GET_IOSPACE(pfn) 0 | |
38898 | +#define GET_PFN(pfn) (pfn) | |
38899 | + | |
38900 | +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG | |
38901 | +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY | |
38902 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR | |
38903 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL | |
38904 | +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH | |
38905 | +#define __HAVE_ARCH_PTEP_SET_WRPROTECT | |
38906 | +#define __HAVE_ARCH_PTE_SAME | |
38907 | +#include <asm-generic/pgtable.h> | |
38908 | + | |
38909 | +#endif /* _I386_PGTABLE_H */ | |
38910 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h | |
38911 | =================================================================== | |
38912 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
38913 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100 | |
38914 | @@ -0,0 +1,743 @@ | |
38915 | +/* | |
38916 | + * include/asm-i386/processor.h | |
38917 | + * | |
38918 | + * Copyright (C) 1994 Linus Torvalds | |
38919 | + */ | |
38920 | + | |
38921 | +#ifndef __ASM_I386_PROCESSOR_H | |
38922 | +#define __ASM_I386_PROCESSOR_H | |
38923 | + | |
38924 | +#include <asm/vm86.h> | |
38925 | +#include <asm/math_emu.h> | |
38926 | +#include <asm/segment.h> | |
38927 | +#include <asm/page.h> | |
38928 | +#include <asm/types.h> | |
38929 | +#include <asm/sigcontext.h> | |
38930 | +#include <asm/cpufeature.h> | |
38931 | +#include <asm/msr.h> | |
38932 | +#include <asm/system.h> | |
38933 | +#include <linux/cache.h> | |
38934 | +#include <linux/threads.h> | |
38935 | +#include <asm/percpu.h> | |
38936 | +#include <linux/cpumask.h> | |
38937 | +#include <xen/interface/physdev.h> | |
38938 | + | |
38939 | +/* flag for disabling the tsc */ | |
38940 | +extern int tsc_disable; | |
38941 | + | |
38942 | +struct desc_struct { | |
38943 | + unsigned long a,b; | |
38944 | +}; | |
38945 | + | |
38946 | +#define desc_empty(desc) \ | |
38947 | + (!((desc)->a | (desc)->b)) | |
38948 | + | |
38949 | +#define desc_equal(desc1, desc2) \ | |
38950 | + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) | |
38951 | +/* | |
38952 | + * Default implementation of macro that returns current | |
38953 | + * instruction pointer ("program counter"). | |
38954 | + */ | |
38955 | +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) | |
38956 | + | |
38957 | +/* | |
38958 | + * CPU type and hardware bug flags. Kept separately for each CPU. | |
38959 | + * Members of this structure are referenced in head.S, so think twice | |
38960 | + * before touching them. [mj] | |
38961 | + */ | |
38962 | + | |
38963 | +struct cpuinfo_x86 { | |
38964 | + __u8 x86; /* CPU family */ | |
38965 | + __u8 x86_vendor; /* CPU vendor */ | |
38966 | + __u8 x86_model; | |
38967 | + __u8 x86_mask; | |
38968 | + char wp_works_ok; /* It doesn't on 386's */ | |
38969 | + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ | |
38970 | + char hard_math; | |
38971 | + char rfu; | |
38972 | + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ | |
38973 | + unsigned long x86_capability[NCAPINTS]; | |
38974 | + char x86_vendor_id[16]; | |
38975 | + char x86_model_id[64]; | |
38976 | + int x86_cache_size; /* in KB - valid for CPUS which support this | |
38977 | + call */ | |
38978 | + int x86_cache_alignment; /* In bytes */ | |
38979 | + char fdiv_bug; | |
38980 | + char f00f_bug; | |
38981 | + char coma_bug; | |
38982 | + char pad0; | |
38983 | + int x86_power; | |
38984 | + unsigned long loops_per_jiffy; | |
38985 | +#ifdef CONFIG_SMP | |
38986 | + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ | |
38987 | +#endif | |
38988 | + unsigned char x86_max_cores; /* cpuid returned max cores value */ | |
38989 | + unsigned char apicid; | |
38990 | +#ifdef CONFIG_SMP | |
38991 | + unsigned char booted_cores; /* number of cores as seen by OS */ | |
38992 | + __u8 phys_proc_id; /* Physical processor id. */ | |
38993 | + __u8 cpu_core_id; /* Core id */ | |
38994 | +#endif | |
38995 | +} __attribute__((__aligned__(SMP_CACHE_BYTES))); | |
38996 | + | |
38997 | +#define X86_VENDOR_INTEL 0 | |
38998 | +#define X86_VENDOR_CYRIX 1 | |
38999 | +#define X86_VENDOR_AMD 2 | |
39000 | +#define X86_VENDOR_UMC 3 | |
39001 | +#define X86_VENDOR_NEXGEN 4 | |
39002 | +#define X86_VENDOR_CENTAUR 5 | |
39003 | +#define X86_VENDOR_RISE 6 | |
39004 | +#define X86_VENDOR_TRANSMETA 7 | |
39005 | +#define X86_VENDOR_NSC 8 | |
39006 | +#define X86_VENDOR_NUM 9 | |
39007 | +#define X86_VENDOR_UNKNOWN 0xff | |
39008 | + | |
39009 | +/* | |
39010 | + * capabilities of CPUs | |
39011 | + */ | |
39012 | + | |
39013 | +extern struct cpuinfo_x86 boot_cpu_data; | |
39014 | +extern struct cpuinfo_x86 new_cpu_data; | |
39015 | +#ifndef CONFIG_X86_NO_TSS | |
39016 | +extern struct tss_struct doublefault_tss; | |
39017 | +DECLARE_PER_CPU(struct tss_struct, init_tss); | |
39018 | +#endif | |
39019 | + | |
39020 | +#ifdef CONFIG_SMP | |
39021 | +extern struct cpuinfo_x86 cpu_data[]; | |
39022 | +#define current_cpu_data cpu_data[smp_processor_id()] | |
39023 | +#else | |
39024 | +#define cpu_data (&boot_cpu_data) | |
39025 | +#define current_cpu_data boot_cpu_data | |
39026 | +#endif | |
39027 | + | |
39028 | +extern int cpu_llc_id[NR_CPUS]; | |
39029 | +extern char ignore_fpu_irq; | |
39030 | + | |
39031 | +extern void identify_cpu(struct cpuinfo_x86 *); | |
39032 | +extern void print_cpu_info(struct cpuinfo_x86 *); | |
39033 | +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
39034 | +extern unsigned short num_cache_leaves; | |
39035 | + | |
39036 | +#ifdef CONFIG_X86_HT | |
39037 | +extern void detect_ht(struct cpuinfo_x86 *c); | |
39038 | +#else | |
39039 | +static inline void detect_ht(struct cpuinfo_x86 *c) {} | |
39040 | +#endif | |
39041 | + | |
39042 | +/* | |
39043 | + * EFLAGS bits | |
39044 | + */ | |
39045 | +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ | |
39046 | +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ | |
39047 | +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ | |
39048 | +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ | |
39049 | +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ | |
39050 | +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ | |
39051 | +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ | |
39052 | +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ | |
39053 | +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ | |
39054 | +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ | |
39055 | +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ | |
39056 | +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ | |
39057 | +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ | |
39058 | +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ | |
39059 | +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ | |
39060 | +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ | |
39061 | +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ | |
39062 | + | |
39063 | +/* | |
39064 | + * Generic CPUID function | |
39065 | + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx | |
39066 | + * resulting in stale register contents being returned. | |
39067 | + */ | |
39068 | +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) | |
39069 | +{ | |
39070 | + __asm__(XEN_CPUID | |
39071 | + : "=a" (*eax), | |
39072 | + "=b" (*ebx), | |
39073 | + "=c" (*ecx), | |
39074 | + "=d" (*edx) | |
39075 | + : "0" (op), "c"(0)); | |
39076 | +} | |
39077 | + | |
39078 | +/* Some CPUID calls want 'count' to be placed in ecx */ | |
39079 | +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, | |
39080 | + int *edx) | |
39081 | +{ | |
39082 | + __asm__(XEN_CPUID | |
39083 | + : "=a" (*eax), | |
39084 | + "=b" (*ebx), | |
39085 | + "=c" (*ecx), | |
39086 | + "=d" (*edx) | |
39087 | + : "0" (op), "c" (count)); | |
39088 | +} | |
39089 | + | |
39090 | +/* | |
39091 | + * CPUID functions returning a single datum | |
39092 | + */ | |
39093 | +static inline unsigned int cpuid_eax(unsigned int op) | |
39094 | +{ | |
39095 | + unsigned int eax; | |
39096 | + | |
39097 | + __asm__(XEN_CPUID | |
39098 | + : "=a" (eax) | |
39099 | + : "0" (op) | |
39100 | + : "bx", "cx", "dx"); | |
39101 | + return eax; | |
39102 | +} | |
39103 | +static inline unsigned int cpuid_ebx(unsigned int op) | |
39104 | +{ | |
39105 | + unsigned int eax, ebx; | |
39106 | + | |
39107 | + __asm__(XEN_CPUID | |
39108 | + : "=a" (eax), "=b" (ebx) | |
39109 | + : "0" (op) | |
39110 | + : "cx", "dx" ); | |
39111 | + return ebx; | |
39112 | +} | |
39113 | +static inline unsigned int cpuid_ecx(unsigned int op) | |
39114 | +{ | |
39115 | + unsigned int eax, ecx; | |
39116 | + | |
39117 | + __asm__(XEN_CPUID | |
39118 | + : "=a" (eax), "=c" (ecx) | |
39119 | + : "0" (op) | |
39120 | + : "bx", "dx" ); | |
39121 | + return ecx; | |
39122 | +} | |
39123 | +static inline unsigned int cpuid_edx(unsigned int op) | |
39124 | +{ | |
39125 | + unsigned int eax, edx; | |
39126 | + | |
39127 | + __asm__(XEN_CPUID | |
39128 | + : "=a" (eax), "=d" (edx) | |
39129 | + : "0" (op) | |
39130 | + : "bx", "cx"); | |
39131 | + return edx; | |
39132 | +} | |
39133 | + | |
39134 | +#define load_cr3(pgdir) write_cr3(__pa(pgdir)) | |
39135 | + | |
39136 | +/* | |
39137 | + * Intel CPU features in CR4 | |
39138 | + */ | |
39139 | +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ | |
39140 | +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ | |
39141 | +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ | |
39142 | +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ | |
39143 | +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ | |
39144 | +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ | |
39145 | +#define X86_CR4_MCE 0x0040 /* Machine check enable */ | |
39146 | +#define X86_CR4_PGE 0x0080 /* enable global pages */ | |
39147 | +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ | |
39148 | +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ | |
39149 | +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ | |
39150 | + | |
39151 | +/* | |
39152 | + * Save the cr4 feature set we're using (ie | |
39153 | + * Pentium 4MB enable and PPro Global page | |
39154 | + * enable), so that any CPU's that boot up | |
39155 | + * after us can get the correct flags. | |
39156 | + */ | |
39157 | +extern unsigned long mmu_cr4_features; | |
39158 | + | |
39159 | +static inline void set_in_cr4 (unsigned long mask) | |
39160 | +{ | |
39161 | + unsigned cr4; | |
39162 | + mmu_cr4_features |= mask; | |
39163 | + cr4 = read_cr4(); | |
39164 | + cr4 |= mask; | |
39165 | + write_cr4(cr4); | |
39166 | +} | |
39167 | + | |
39168 | +static inline void clear_in_cr4 (unsigned long mask) | |
39169 | +{ | |
39170 | + unsigned cr4; | |
39171 | + mmu_cr4_features &= ~mask; | |
39172 | + cr4 = read_cr4(); | |
39173 | + cr4 &= ~mask; | |
39174 | + write_cr4(cr4); | |
39175 | +} | |
39176 | + | |
39177 | +/* | |
39178 | + * NSC/Cyrix CPU configuration register indexes | |
39179 | + */ | |
39180 | + | |
39181 | +#define CX86_PCR0 0x20 | |
39182 | +#define CX86_GCR 0xb8 | |
39183 | +#define CX86_CCR0 0xc0 | |
39184 | +#define CX86_CCR1 0xc1 | |
39185 | +#define CX86_CCR2 0xc2 | |
39186 | +#define CX86_CCR3 0xc3 | |
39187 | +#define CX86_CCR4 0xe8 | |
39188 | +#define CX86_CCR5 0xe9 | |
39189 | +#define CX86_CCR6 0xea | |
39190 | +#define CX86_CCR7 0xeb | |
39191 | +#define CX86_PCR1 0xf0 | |
39192 | +#define CX86_DIR0 0xfe | |
39193 | +#define CX86_DIR1 0xff | |
39194 | +#define CX86_ARR_BASE 0xc4 | |
39195 | +#define CX86_RCR_BASE 0xdc | |
39196 | + | |
39197 | +/* | |
39198 | + * NSC/Cyrix CPU indexed register access macros | |
39199 | + */ | |
39200 | + | |
39201 | +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) | |
39202 | + | |
39203 | +#define setCx86(reg, data) do { \ | |
39204 | + outb((reg), 0x22); \ | |
39205 | + outb((data), 0x23); \ | |
39206 | +} while (0) | |
39207 | + | |
39208 | +/* Stop speculative execution */ | |
39209 | +static inline void sync_core(void) | |
39210 | +{ | |
39211 | + int tmp; | |
39212 | + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); | |
39213 | +} | |
39214 | + | |
39215 | +static inline void __monitor(const void *eax, unsigned long ecx, | |
39216 | + unsigned long edx) | |
39217 | +{ | |
39218 | + /* "monitor %eax,%ecx,%edx;" */ | |
39219 | + asm volatile( | |
39220 | + ".byte 0x0f,0x01,0xc8;" | |
39221 | + : :"a" (eax), "c" (ecx), "d"(edx)); | |
39222 | +} | |
39223 | + | |
39224 | +static inline void __mwait(unsigned long eax, unsigned long ecx) | |
39225 | +{ | |
39226 | + /* "mwait %eax,%ecx;" */ | |
39227 | + asm volatile( | |
39228 | + ".byte 0x0f,0x01,0xc9;" | |
39229 | + : :"a" (eax), "c" (ecx)); | |
39230 | +} | |
39231 | + | |
39232 | +/* from system description table in BIOS. Mostly for MCA use, but | |
39233 | +others may find it useful. */ | |
39234 | +extern unsigned int machine_id; | |
39235 | +extern unsigned int machine_submodel_id; | |
39236 | +extern unsigned int BIOS_revision; | |
39237 | +extern unsigned int mca_pentium_flag; | |
39238 | + | |
39239 | +/* Boot loader type from the setup header */ | |
39240 | +extern int bootloader_type; | |
39241 | + | |
39242 | +/* | |
39243 | + * User space process size: 3GB (default). | |
39244 | + */ | |
39245 | +#define TASK_SIZE (PAGE_OFFSET) | |
39246 | + | |
39247 | +/* This decides where the kernel will search for a free chunk of vm | |
39248 | + * space during mmap's. | |
39249 | + */ | |
39250 | +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) | |
39251 | + | |
39252 | +#define HAVE_ARCH_PICK_MMAP_LAYOUT | |
39253 | + | |
39254 | +/* | |
39255 | + * Size of io_bitmap. | |
39256 | + */ | |
39257 | +#define IO_BITMAP_BITS 65536 | |
39258 | +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) | |
39259 | +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) | |
39260 | +#ifndef CONFIG_X86_NO_TSS | |
39261 | +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) | |
39262 | +#endif | |
39263 | +#define INVALID_IO_BITMAP_OFFSET 0x8000 | |
39264 | +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 | |
39265 | + | |
39266 | +struct i387_fsave_struct { | |
39267 | + long cwd; | |
39268 | + long swd; | |
39269 | + long twd; | |
39270 | + long fip; | |
39271 | + long fcs; | |
39272 | + long foo; | |
39273 | + long fos; | |
39274 | + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
39275 | + long status; /* software status information */ | |
39276 | +}; | |
39277 | + | |
39278 | +struct i387_fxsave_struct { | |
39279 | + unsigned short cwd; | |
39280 | + unsigned short swd; | |
39281 | + unsigned short twd; | |
39282 | + unsigned short fop; | |
39283 | + long fip; | |
39284 | + long fcs; | |
39285 | + long foo; | |
39286 | + long fos; | |
39287 | + long mxcsr; | |
39288 | + long mxcsr_mask; | |
39289 | + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | |
39290 | + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ | |
39291 | + long padding[56]; | |
39292 | +} __attribute__ ((aligned (16))); | |
39293 | + | |
39294 | +struct i387_soft_struct { | |
39295 | + long cwd; | |
39296 | + long swd; | |
39297 | + long twd; | |
39298 | + long fip; | |
39299 | + long fcs; | |
39300 | + long foo; | |
39301 | + long fos; | |
39302 | + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ | |
39303 | + unsigned char ftop, changed, lookahead, no_update, rm, alimit; | |
39304 | + struct info *info; | |
39305 | + unsigned long entry_eip; | |
39306 | +}; | |
39307 | + | |
39308 | +union i387_union { | |
39309 | + struct i387_fsave_struct fsave; | |
39310 | + struct i387_fxsave_struct fxsave; | |
39311 | + struct i387_soft_struct soft; | |
39312 | +}; | |
39313 | + | |
39314 | +typedef struct { | |
39315 | + unsigned long seg; | |
39316 | +} mm_segment_t; | |
39317 | + | |
39318 | +struct thread_struct; | |
39319 | + | |
39320 | +#ifndef CONFIG_X86_NO_TSS | |
39321 | +struct tss_struct { | |
39322 | + unsigned short back_link,__blh; | |
39323 | + unsigned long esp0; | |
39324 | + unsigned short ss0,__ss0h; | |
39325 | + unsigned long esp1; | |
39326 | + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ | |
39327 | + unsigned long esp2; | |
39328 | + unsigned short ss2,__ss2h; | |
39329 | + unsigned long __cr3; | |
39330 | + unsigned long eip; | |
39331 | + unsigned long eflags; | |
39332 | + unsigned long eax,ecx,edx,ebx; | |
39333 | + unsigned long esp; | |
39334 | + unsigned long ebp; | |
39335 | + unsigned long esi; | |
39336 | + unsigned long edi; | |
39337 | + unsigned short es, __esh; | |
39338 | + unsigned short cs, __csh; | |
39339 | + unsigned short ss, __ssh; | |
39340 | + unsigned short ds, __dsh; | |
39341 | + unsigned short fs, __fsh; | |
39342 | + unsigned short gs, __gsh; | |
39343 | + unsigned short ldt, __ldth; | |
39344 | + unsigned short trace, io_bitmap_base; | |
39345 | + /* | |
39346 | + * The extra 1 is there because the CPU will access an | |
39347 | + * additional byte beyond the end of the IO permission | |
39348 | + * bitmap. The extra byte must be all 1 bits, and must | |
39349 | + * be within the limit. | |
39350 | + */ | |
39351 | + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | |
39352 | + /* | |
39353 | + * Cache the current maximum and the last task that used the bitmap: | |
39354 | + */ | |
39355 | + unsigned long io_bitmap_max; | |
39356 | + struct thread_struct *io_bitmap_owner; | |
39357 | + /* | |
39358 | + * pads the TSS to be cacheline-aligned (size is 0x100) | |
39359 | + */ | |
39360 | + unsigned long __cacheline_filler[35]; | |
39361 | + /* | |
39362 | + * .. and then another 0x100 bytes for emergency kernel stack | |
39363 | + */ | |
39364 | + unsigned long stack[64]; | |
39365 | +} __attribute__((packed)); | |
39366 | +#endif | |
39367 | + | |
39368 | +#define ARCH_MIN_TASKALIGN 16 | |
39369 | + | |
39370 | +struct thread_struct { | |
39371 | +/* cached TLS descriptors. */ | |
39372 | + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; | |
39373 | + unsigned long esp0; | |
39374 | + unsigned long sysenter_cs; | |
39375 | + unsigned long eip; | |
39376 | + unsigned long esp; | |
39377 | + unsigned long fs; | |
39378 | + unsigned long gs; | |
39379 | +/* Hardware debugging registers */ | |
39380 | + unsigned long debugreg[8]; /* %%db0-7 debug registers */ | |
39381 | +/* fault info */ | |
39382 | + unsigned long cr2, trap_no, error_code; | |
39383 | +/* floating point info */ | |
39384 | + union i387_union i387; | |
39385 | +/* virtual 86 mode info */ | |
39386 | + struct vm86_struct __user * vm86_info; | |
39387 | + unsigned long screen_bitmap; | |
39388 | + unsigned long v86flags, v86mask, saved_esp0; | |
39389 | + unsigned int saved_fs, saved_gs; | |
39390 | +/* IO permissions */ | |
39391 | + unsigned long *io_bitmap_ptr; | |
39392 | + unsigned long iopl; | |
39393 | +/* max allowed port in the bitmap, in bytes: */ | |
39394 | + unsigned long io_bitmap_max; | |
39395 | +}; | |
39396 | + | |
39397 | +#define INIT_THREAD { \ | |
39398 | + .vm86_info = NULL, \ | |
39399 | + .sysenter_cs = __KERNEL_CS, \ | |
39400 | + .io_bitmap_ptr = NULL, \ | |
39401 | +} | |
39402 | + | |
39403 | +#ifndef CONFIG_X86_NO_TSS | |
39404 | +/* | |
39405 | + * Note that the .io_bitmap member must be extra-big. This is because | |
39406 | + * the CPU will access an additional byte beyond the end of the IO | |
39407 | + * permission bitmap. The extra byte must be all 1 bits, and must | |
39408 | + * be within the limit. | |
39409 | + */ | |
39410 | +#define INIT_TSS { \ | |
39411 | + .esp0 = sizeof(init_stack) + (long)&init_stack, \ | |
39412 | + .ss0 = __KERNEL_DS, \ | |
39413 | + .ss1 = __KERNEL_CS, \ | |
39414 | + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ | |
39415 | + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ | |
39416 | +} | |
39417 | + | |
39418 | +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread) | |
39419 | +{ | |
39420 | + tss->esp0 = thread->esp0; | |
39421 | + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ | |
39422 | + if (unlikely(tss->ss1 != thread->sysenter_cs)) { | |
39423 | + tss->ss1 = thread->sysenter_cs; | |
39424 | + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); | |
39425 | + } | |
39426 | +} | |
39427 | +#define load_esp0(tss, thread) \ | |
39428 | + __load_esp0(tss, thread) | |
39429 | +#else | |
39430 | +#define load_esp0(tss, thread) do { \ | |
39431 | + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \ | |
39432 | + BUG(); \ | |
39433 | +} while (0) | |
39434 | +#endif | |
39435 | + | |
39436 | +#define start_thread(regs, new_eip, new_esp) do { \ | |
39437 | + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ | |
39438 | + set_fs(USER_DS); \ | |
39439 | + regs->xds = __USER_DS; \ | |
39440 | + regs->xes = __USER_DS; \ | |
39441 | + regs->xss = __USER_DS; \ | |
39442 | + regs->xcs = __USER_CS; \ | |
39443 | + regs->eip = new_eip; \ | |
39444 | + regs->esp = new_esp; \ | |
39445 | +} while (0) | |
39446 | + | |
39447 | +/* | |
39448 | + * These special macros can be used to get or set a debugging register | |
39449 | + */ | |
39450 | +#define get_debugreg(var, register) \ | |
39451 | + (var) = HYPERVISOR_get_debugreg((register)) | |
39452 | +#define set_debugreg(value, register) \ | |
39453 | + WARN_ON(HYPERVISOR_set_debugreg((register), (value))) | |
39454 | + | |
39455 | +/* | |
39456 | + * Set IOPL bits in EFLAGS from given mask | |
39457 | + */ | |
39458 | +static inline void set_iopl_mask(unsigned mask) | |
39459 | +{ | |
39460 | + struct physdev_set_iopl set_iopl; | |
39461 | + | |
39462 | + /* Force the change at ring 0. */ | |
39463 | + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; | |
39464 | + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); | |
39465 | +} | |
39466 | + | |
39467 | +/* Forward declaration, a strange C thing */ | |
39468 | +struct task_struct; | |
39469 | +struct mm_struct; | |
39470 | + | |
39471 | +/* Free all resources held by a thread. */ | |
39472 | +extern void release_thread(struct task_struct *); | |
39473 | + | |
39474 | +/* Prepare to copy thread state - unlazy all lazy status */ | |
39475 | +extern void prepare_to_copy(struct task_struct *tsk); | |
39476 | + | |
39477 | +/* | |
39478 | + * create a kernel thread without removing it from tasklists | |
39479 | + */ | |
39480 | +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); | |
39481 | + | |
39482 | +extern unsigned long thread_saved_pc(struct task_struct *tsk); | |
39483 | +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack); | |
39484 | + | |
39485 | +unsigned long get_wchan(struct task_struct *p); | |
39486 | + | |
39487 | +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) | |
39488 | +#define KSTK_TOP(info) \ | |
39489 | +({ \ | |
39490 | + unsigned long *__ptr = (unsigned long *)(info); \ | |
39491 | + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ | |
39492 | +}) | |
39493 | + | |
39494 | +/* | |
39495 | + * The below -8 is to reserve 8 bytes on top of the ring0 stack. | |
39496 | + * This is necessary to guarantee that the entire "struct pt_regs" | |
39497 | + * is accessable even if the CPU haven't stored the SS/ESP registers | |
39498 | + * on the stack (interrupt gate does not save these registers | |
39499 | + * when switching to the same priv ring). | |
39500 | + * Therefore beware: accessing the xss/esp fields of the | |
39501 | + * "struct pt_regs" is possible, but they may contain the | |
39502 | + * completely wrong values. | |
39503 | + */ | |
39504 | +#define task_pt_regs(task) \ | |
39505 | +({ \ | |
39506 | + struct pt_regs *__regs__; \ | |
39507 | + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ | |
39508 | + __regs__ - 1; \ | |
39509 | +}) | |
39510 | + | |
39511 | +#define KSTK_EIP(task) (task_pt_regs(task)->eip) | |
39512 | +#define KSTK_ESP(task) (task_pt_regs(task)->esp) | |
39513 | + | |
39514 | + | |
39515 | +struct microcode_header { | |
39516 | + unsigned int hdrver; | |
39517 | + unsigned int rev; | |
39518 | + unsigned int date; | |
39519 | + unsigned int sig; | |
39520 | + unsigned int cksum; | |
39521 | + unsigned int ldrver; | |
39522 | + unsigned int pf; | |
39523 | + unsigned int datasize; | |
39524 | + unsigned int totalsize; | |
39525 | + unsigned int reserved[3]; | |
39526 | +}; | |
39527 | + | |
39528 | +struct microcode { | |
39529 | + struct microcode_header hdr; | |
39530 | + unsigned int bits[0]; | |
39531 | +}; | |
39532 | + | |
39533 | +typedef struct microcode microcode_t; | |
39534 | +typedef struct microcode_header microcode_header_t; | |
39535 | + | |
39536 | +/* microcode format is extended from prescott processors */ | |
39537 | +struct extended_signature { | |
39538 | + unsigned int sig; | |
39539 | + unsigned int pf; | |
39540 | + unsigned int cksum; | |
39541 | +}; | |
39542 | + | |
39543 | +struct extended_sigtable { | |
39544 | + unsigned int count; | |
39545 | + unsigned int cksum; | |
39546 | + unsigned int reserved[3]; | |
39547 | + struct extended_signature sigs[0]; | |
39548 | +}; | |
39549 | + | |
39550 | +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ | |
39551 | +static inline void rep_nop(void) | |
39552 | +{ | |
39553 | + __asm__ __volatile__("rep;nop": : :"memory"); | |
39554 | +} | |
39555 | + | |
39556 | +#define cpu_relax() rep_nop() | |
39557 | + | |
39558 | +/* generic versions from gas */ | |
39559 | +#define GENERIC_NOP1 ".byte 0x90\n" | |
39560 | +#define GENERIC_NOP2 ".byte 0x89,0xf6\n" | |
39561 | +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" | |
39562 | +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" | |
39563 | +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 | |
39564 | +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" | |
39565 | +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" | |
39566 | +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 | |
39567 | + | |
39568 | +/* Opteron nops */ | |
39569 | +#define K8_NOP1 GENERIC_NOP1 | |
39570 | +#define K8_NOP2 ".byte 0x66,0x90\n" | |
39571 | +#define K8_NOP3 ".byte 0x66,0x66,0x90\n" | |
39572 | +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" | |
39573 | +#define K8_NOP5 K8_NOP3 K8_NOP2 | |
39574 | +#define K8_NOP6 K8_NOP3 K8_NOP3 | |
39575 | +#define K8_NOP7 K8_NOP4 K8_NOP3 | |
39576 | +#define K8_NOP8 K8_NOP4 K8_NOP4 | |
39577 | + | |
39578 | +/* K7 nops */ | |
39579 | +/* uses eax dependencies (arbitary choice) */ | |
39580 | +#define K7_NOP1 GENERIC_NOP1 | |
39581 | +#define K7_NOP2 ".byte 0x8b,0xc0\n" | |
39582 | +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" | |
39583 | +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" | |
39584 | +#define K7_NOP5 K7_NOP4 ASM_NOP1 | |
39585 | +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" | |
39586 | +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" | |
39587 | +#define K7_NOP8 K7_NOP7 ASM_NOP1 | |
39588 | + | |
39589 | +#ifdef CONFIG_MK8 | |
39590 | +#define ASM_NOP1 K8_NOP1 | |
39591 | +#define ASM_NOP2 K8_NOP2 | |
39592 | +#define ASM_NOP3 K8_NOP3 | |
39593 | +#define ASM_NOP4 K8_NOP4 | |
39594 | +#define ASM_NOP5 K8_NOP5 | |
39595 | +#define ASM_NOP6 K8_NOP6 | |
39596 | +#define ASM_NOP7 K8_NOP7 | |
39597 | +#define ASM_NOP8 K8_NOP8 | |
39598 | +#elif defined(CONFIG_MK7) | |
39599 | +#define ASM_NOP1 K7_NOP1 | |
39600 | +#define ASM_NOP2 K7_NOP2 | |
39601 | +#define ASM_NOP3 K7_NOP3 | |
39602 | +#define ASM_NOP4 K7_NOP4 | |
39603 | +#define ASM_NOP5 K7_NOP5 | |
39604 | +#define ASM_NOP6 K7_NOP6 | |
39605 | +#define ASM_NOP7 K7_NOP7 | |
39606 | +#define ASM_NOP8 K7_NOP8 | |
39607 | +#else | |
39608 | +#define ASM_NOP1 GENERIC_NOP1 | |
39609 | +#define ASM_NOP2 GENERIC_NOP2 | |
39610 | +#define ASM_NOP3 GENERIC_NOP3 | |
39611 | +#define ASM_NOP4 GENERIC_NOP4 | |
39612 | +#define ASM_NOP5 GENERIC_NOP5 | |
39613 | +#define ASM_NOP6 GENERIC_NOP6 | |
39614 | +#define ASM_NOP7 GENERIC_NOP7 | |
39615 | +#define ASM_NOP8 GENERIC_NOP8 | |
39616 | +#endif | |
39617 | + | |
39618 | +#define ASM_NOP_MAX 8 | |
39619 | + | |
39620 | +/* Prefetch instructions for Pentium III and AMD Athlon */ | |
39621 | +/* It's not worth to care about 3dnow! prefetches for the K6 | |
39622 | + because they are microcoded there and very slow. | |
39623 | + However we don't do prefetches for pre XP Athlons currently | |
39624 | + That should be fixed. */ | |
39625 | +#define ARCH_HAS_PREFETCH | |
39626 | +static inline void prefetch(const void *x) | |
39627 | +{ | |
39628 | + alternative_input(ASM_NOP4, | |
39629 | + "prefetchnta (%1)", | |
39630 | + X86_FEATURE_XMM, | |
39631 | + "r" (x)); | |
39632 | +} | |
39633 | + | |
39634 | +#define ARCH_HAS_PREFETCH | |
39635 | +#define ARCH_HAS_PREFETCHW | |
39636 | +#define ARCH_HAS_SPINLOCK_PREFETCH | |
39637 | + | |
39638 | +/* 3dnow! prefetch to get an exclusive cache line. Useful for | |
39639 | + spinlocks to avoid one state transition in the cache coherency protocol. */ | |
39640 | +static inline void prefetchw(const void *x) | |
39641 | +{ | |
39642 | + alternative_input(ASM_NOP4, | |
39643 | + "prefetchw (%1)", | |
39644 | + X86_FEATURE_3DNOW, | |
39645 | + "r" (x)); | |
39646 | +} | |
39647 | +#define spin_lock_prefetch(x) prefetchw(x) | |
39648 | + | |
39649 | +extern void select_idle_routine(const struct cpuinfo_x86 *c); | |
39650 | + | |
39651 | +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) | |
39652 | + | |
39653 | +extern unsigned long boot_option_idle_override; | |
39654 | +extern void enable_sep_cpu(void); | |
39655 | +extern int sysenter_setup(void); | |
39656 | + | |
39657 | +#endif /* __ASM_I386_PROCESSOR_H */ | |
39658 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h | |
39659 | =================================================================== | |
39660 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
39661 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h 2007-06-12 13:14:02.000000000 +0200 | |
39662 | @@ -0,0 +1,117 @@ | |
39663 | +#ifndef _ASM_SEGMENT_H | |
39664 | +#define _ASM_SEGMENT_H | |
39665 | + | |
39666 | +/* | |
39667 | + * The layout of the per-CPU GDT under Linux: | |
39668 | + * | |
39669 | + * 0 - null | |
39670 | + * 1 - reserved | |
39671 | + * 2 - reserved | |
39672 | + * 3 - reserved | |
39673 | + * | |
39674 | + * 4 - unused <==== new cacheline | |
39675 | + * 5 - unused | |
39676 | + * | |
39677 | + * ------- start of TLS (Thread-Local Storage) segments: | |
39678 | + * | |
39679 | + * 6 - TLS segment #1 [ glibc's TLS segment ] | |
39680 | + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] | |
39681 | + * 8 - TLS segment #3 | |
39682 | + * 9 - reserved | |
39683 | + * 10 - reserved | |
39684 | + * 11 - reserved | |
39685 | + * | |
39686 | + * ------- start of kernel segments: | |
39687 | + * | |
39688 | + * 12 - kernel code segment <==== new cacheline | |
39689 | + * 13 - kernel data segment | |
39690 | + * 14 - default user CS | |
39691 | + * 15 - default user DS | |
39692 | + * 16 - TSS | |
39693 | + * 17 - LDT | |
39694 | + * 18 - PNPBIOS support (16->32 gate) | |
39695 | + * 19 - PNPBIOS support | |
39696 | + * 20 - PNPBIOS support | |
39697 | + * 21 - PNPBIOS support | |
39698 | + * 22 - PNPBIOS support | |
39699 | + * 23 - APM BIOS support | |
39700 | + * 24 - APM BIOS support | |
39701 | + * 25 - APM BIOS support | |
39702 | + * | |
39703 | + * 26 - ESPFIX small SS | |
39704 | + * 27 - unused | |
39705 | + * 28 - unused | |
39706 | + * 29 - unused | |
39707 | + * 30 - unused | |
39708 | + * 31 - TSS for double fault handler | |
39709 | + */ | |
39710 | +#define GDT_ENTRY_TLS_ENTRIES 3 | |
39711 | +#define GDT_ENTRY_TLS_MIN 6 | |
39712 | +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) | |
39713 | + | |
39714 | +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) | |
39715 | + | |
39716 | +#define GDT_ENTRY_DEFAULT_USER_CS 14 | |
39717 | +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) | |
39718 | + | |
39719 | +#define GDT_ENTRY_DEFAULT_USER_DS 15 | |
39720 | +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) | |
39721 | + | |
39722 | +#define GDT_ENTRY_KERNEL_BASE 12 | |
39723 | + | |
39724 | +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) | |
39725 | +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) | |
39726 | +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) ) | |
39727 | + | |
39728 | +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) | |
39729 | +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) | |
39730 | +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) ) | |
39731 | + | |
39732 | +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) | |
39733 | +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) | |
39734 | + | |
39735 | +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) | |
39736 | +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) | |
39737 | + | |
39738 | +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) | |
39739 | +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) | |
39740 | + | |
39741 | +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 | |
39742 | + | |
39743 | +/* | |
39744 | + * The GDT has 32 entries | |
39745 | + */ | |
39746 | +#define GDT_ENTRIES 32 | |
39747 | + | |
39748 | +#define GDT_SIZE (GDT_ENTRIES * 8) | |
39749 | + | |
39750 | +/* Simple and small GDT entries for booting only */ | |
39751 | + | |
39752 | +#define GDT_ENTRY_BOOT_CS 2 | |
39753 | +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) | |
39754 | + | |
39755 | +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) | |
39756 | +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) | |
39757 | + | |
39758 | +/* The PnP BIOS entries in the GDT */ | |
39759 | +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) | |
39760 | +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) | |
39761 | +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) | |
39762 | +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) | |
39763 | +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) | |
39764 | + | |
39765 | +/* The PnP BIOS selectors */ | |
39766 | +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ | |
39767 | +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ | |
39768 | +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ | |
39769 | +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ | |
39770 | +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ | |
39771 | + | |
39772 | +/* | |
39773 | + * The interrupt descriptor table has room for 256 idt's, | |
39774 | + * the global descriptor table is dependent on the number | |
39775 | + * of tasks we can have.. | |
39776 | + */ | |
39777 | +#define IDT_ENTRIES 256 | |
39778 | + | |
39779 | +#endif | |
39780 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h | |
39781 | =================================================================== | |
39782 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
39783 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200 | |
39784 | @@ -0,0 +1,103 @@ | |
39785 | +#ifndef __ASM_SMP_H | |
39786 | +#define __ASM_SMP_H | |
39787 | + | |
39788 | +/* | |
39789 | + * We need the APIC definitions automatically as part of 'smp.h' | |
39790 | + */ | |
39791 | +#ifndef __ASSEMBLY__ | |
39792 | +#include <linux/kernel.h> | |
39793 | +#include <linux/threads.h> | |
39794 | +#include <linux/cpumask.h> | |
39795 | +#endif | |
39796 | + | |
39797 | +#ifdef CONFIG_X86_LOCAL_APIC | |
39798 | +#ifndef __ASSEMBLY__ | |
39799 | +#include <asm/fixmap.h> | |
39800 | +#include <asm/bitops.h> | |
39801 | +#include <asm/mpspec.h> | |
39802 | +#ifdef CONFIG_X86_IO_APIC | |
39803 | +#include <asm/io_apic.h> | |
39804 | +#endif | |
39805 | +#include <asm/apic.h> | |
39806 | +#endif | |
39807 | +#endif | |
39808 | + | |
39809 | +#define BAD_APICID 0xFFu | |
39810 | +#ifdef CONFIG_SMP | |
39811 | +#ifndef __ASSEMBLY__ | |
39812 | + | |
39813 | +/* | |
39814 | + * Private routines/data | |
39815 | + */ | |
39816 | + | |
39817 | +extern void smp_alloc_memory(void); | |
39818 | +extern int pic_mode; | |
39819 | +extern int smp_num_siblings; | |
39820 | +extern cpumask_t cpu_sibling_map[]; | |
39821 | +extern cpumask_t cpu_core_map[]; | |
39822 | + | |
39823 | +extern void (*mtrr_hook) (void); | |
39824 | +extern void zap_low_mappings (void); | |
39825 | +extern void lock_ipi_call_lock(void); | |
39826 | +extern void unlock_ipi_call_lock(void); | |
39827 | + | |
39828 | +#define MAX_APICID 256 | |
39829 | +extern u8 x86_cpu_to_apicid[]; | |
39830 | + | |
39831 | +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] | |
39832 | + | |
39833 | +#ifdef CONFIG_HOTPLUG_CPU | |
39834 | +extern void cpu_exit_clear(void); | |
39835 | +extern void cpu_uninit(void); | |
39836 | +#endif | |
39837 | + | |
39838 | +/* | |
39839 | + * This function is needed by all SMP systems. It must _always_ be valid | |
39840 | + * from the initial startup. We map APIC_BASE very early in page_setup(), | |
39841 | + * so this is correct in the x86 case. | |
39842 | + */ | |
39843 | +#define raw_smp_processor_id() (current_thread_info()->cpu) | |
39844 | + | |
39845 | +extern cpumask_t cpu_possible_map; | |
39846 | +#define cpu_callin_map cpu_possible_map | |
39847 | + | |
39848 | +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ | |
39849 | +static inline int num_booting_cpus(void) | |
39850 | +{ | |
39851 | + return cpus_weight(cpu_possible_map); | |
39852 | +} | |
39853 | + | |
39854 | +#ifdef CONFIG_X86_LOCAL_APIC | |
39855 | + | |
39856 | +#ifdef APIC_DEFINITION | |
39857 | +extern int hard_smp_processor_id(void); | |
39858 | +#else | |
39859 | +#include <mach_apicdef.h> | |
39860 | +static inline int hard_smp_processor_id(void) | |
39861 | +{ | |
39862 | + /* we don't want to mark this access volatile - bad code generation */ | |
39863 | + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); | |
39864 | +} | |
39865 | +#endif | |
39866 | + | |
39867 | +static __inline int logical_smp_processor_id(void) | |
39868 | +{ | |
39869 | + /* we don't want to mark this access volatile - bad code generation */ | |
39870 | + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); | |
39871 | +} | |
39872 | + | |
39873 | +#endif | |
39874 | + | |
39875 | +extern int __cpu_disable(void); | |
39876 | +extern void __cpu_die(unsigned int cpu); | |
39877 | +extern void prefill_possible_map(void); | |
39878 | +#endif /* !__ASSEMBLY__ */ | |
39879 | + | |
39880 | +#else /* CONFIG_SMP */ | |
39881 | + | |
39882 | +#define cpu_physical_id(cpu) boot_cpu_physical_apicid | |
39883 | + | |
39884 | +#define NO_PROC_ID 0xFF /* No processor magic marker */ | |
39885 | + | |
39886 | +#endif | |
39887 | +#endif | |
39888 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h | |
39889 | =================================================================== | |
39890 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
39891 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h 2007-06-12 13:14:02.000000000 +0200 | |
39892 | @@ -0,0 +1,43 @@ | |
39893 | +#ifndef _ASM_SWIOTLB_H | |
39894 | +#define _ASM_SWIOTLB_H 1 | |
39895 | + | |
39896 | +/* SWIOTLB interface */ | |
39897 | + | |
39898 | +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, | |
39899 | + int dir); | |
39900 | +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, | |
39901 | + size_t size, int dir); | |
39902 | +extern void swiotlb_sync_single_for_cpu(struct device *hwdev, | |
39903 | + dma_addr_t dev_addr, | |
39904 | + size_t size, int dir); | |
39905 | +extern void swiotlb_sync_single_for_device(struct device *hwdev, | |
39906 | + dma_addr_t dev_addr, | |
39907 | + size_t size, int dir); | |
39908 | +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev, | |
39909 | + struct scatterlist *sg, int nelems, | |
39910 | + int dir); | |
39911 | +extern void swiotlb_sync_sg_for_device(struct device *hwdev, | |
39912 | + struct scatterlist *sg, int nelems, | |
39913 | + int dir); | |
39914 | +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, | |
39915 | + int nents, int direction); | |
39916 | +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, | |
39917 | + int nents, int direction); | |
39918 | +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr); | |
39919 | +#ifdef CONFIG_HIGHMEM | |
39920 | +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page, | |
39921 | + unsigned long offset, size_t size, | |
39922 | + enum dma_data_direction direction); | |
39923 | +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, | |
39924 | + size_t size, enum dma_data_direction direction); | |
39925 | +#endif | |
39926 | +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); | |
39927 | +extern void swiotlb_init(void); | |
39928 | + | |
39929 | +#ifdef CONFIG_SWIOTLB | |
39930 | +extern int swiotlb; | |
39931 | +#else | |
39932 | +#define swiotlb 0 | |
39933 | +#endif | |
39934 | + | |
39935 | +#endif | |
39936 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h | |
39937 | =================================================================== | |
39938 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
39939 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200 | |
39940 | @@ -0,0 +1,126 @@ | |
39941 | +#ifndef __XEN_SYNCH_BITOPS_H__ | |
39942 | +#define __XEN_SYNCH_BITOPS_H__ | |
39943 | + | |
39944 | +/* | |
39945 | + * Copyright 1992, Linus Torvalds. | |
39946 | + * Heavily modified to provide guaranteed strong synchronisation | |
39947 | + * when communicating with Xen or other guest OSes running on other CPUs. | |
39948 | + */ | |
39949 | + | |
39950 | +#ifdef HAVE_XEN_PLATFORM_COMPAT_H | |
39951 | +#include <xen/platform-compat.h> | |
39952 | +#endif | |
39953 | + | |
39954 | +#define ADDR (*(volatile long *) addr) | |
39955 | + | |
39956 | +static __inline__ void synch_set_bit(int nr, volatile void * addr) | |
39957 | +{ | |
39958 | + __asm__ __volatile__ ( | |
39959 | + "lock btsl %1,%0" | |
39960 | + : "+m" (ADDR) : "Ir" (nr) : "memory" ); | |
39961 | +} | |
39962 | + | |
39963 | +static __inline__ void synch_clear_bit(int nr, volatile void * addr) | |
39964 | +{ | |
39965 | + __asm__ __volatile__ ( | |
39966 | + "lock btrl %1,%0" | |
39967 | + : "+m" (ADDR) : "Ir" (nr) : "memory" ); | |
39968 | +} | |
39969 | + | |
39970 | +static __inline__ void synch_change_bit(int nr, volatile void * addr) | |
39971 | +{ | |
39972 | + __asm__ __volatile__ ( | |
39973 | + "lock btcl %1,%0" | |
39974 | + : "+m" (ADDR) : "Ir" (nr) : "memory" ); | |
39975 | +} | |
39976 | + | |
39977 | +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) | |
39978 | +{ | |
39979 | + int oldbit; | |
39980 | + __asm__ __volatile__ ( | |
39981 | + "lock btsl %2,%1\n\tsbbl %0,%0" | |
39982 | + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); | |
39983 | + return oldbit; | |
39984 | +} | |
39985 | + | |
39986 | +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) | |
39987 | +{ | |
39988 | + int oldbit; | |
39989 | + __asm__ __volatile__ ( | |
39990 | + "lock btrl %2,%1\n\tsbbl %0,%0" | |
39991 | + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); | |
39992 | + return oldbit; | |
39993 | +} | |
39994 | + | |
39995 | +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) | |
39996 | +{ | |
39997 | + int oldbit; | |
39998 | + | |
39999 | + __asm__ __volatile__ ( | |
40000 | + "lock btcl %2,%1\n\tsbbl %0,%0" | |
40001 | + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); | |
40002 | + return oldbit; | |
40003 | +} | |
40004 | + | |
40005 | +struct __synch_xchg_dummy { unsigned long a[100]; }; | |
40006 | +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) | |
40007 | + | |
40008 | +#define synch_cmpxchg(ptr, old, new) \ | |
40009 | +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ | |
40010 | + (unsigned long)(old), \ | |
40011 | + (unsigned long)(new), \ | |
40012 | + sizeof(*(ptr)))) | |
40013 | + | |
40014 | +static inline unsigned long __synch_cmpxchg(volatile void *ptr, | |
40015 | + unsigned long old, | |
40016 | + unsigned long new, int size) | |
40017 | +{ | |
40018 | + unsigned long prev; | |
40019 | + switch (size) { | |
40020 | + case 1: | |
40021 | + __asm__ __volatile__("lock; cmpxchgb %b1,%2" | |
40022 | + : "=a"(prev) | |
40023 | + : "q"(new), "m"(*__synch_xg(ptr)), | |
40024 | + "0"(old) | |
40025 | + : "memory"); | |
40026 | + return prev; | |
40027 | + case 2: | |
40028 | + __asm__ __volatile__("lock; cmpxchgw %w1,%2" | |
40029 | + : "=a"(prev) | |
40030 | + : "r"(new), "m"(*__synch_xg(ptr)), | |
40031 | + "0"(old) | |
40032 | + : "memory"); | |
40033 | + return prev; | |
40034 | +#ifdef CONFIG_X86_64 | |
40035 | + case 4: | |
40036 | + __asm__ __volatile__("lock; cmpxchgl %k1,%2" | |
40037 | + : "=a"(prev) | |
40038 | + : "r"(new), "m"(*__synch_xg(ptr)), | |
40039 | + "0"(old) | |
40040 | + : "memory"); | |
40041 | + return prev; | |
40042 | + case 8: | |
40043 | + __asm__ __volatile__("lock; cmpxchgq %1,%2" | |
40044 | + : "=a"(prev) | |
40045 | + : "r"(new), "m"(*__synch_xg(ptr)), | |
40046 | + "0"(old) | |
40047 | + : "memory"); | |
40048 | + return prev; | |
40049 | +#else | |
40050 | + case 4: | |
40051 | + __asm__ __volatile__("lock; cmpxchgl %1,%2" | |
40052 | + : "=a"(prev) | |
40053 | + : "r"(new), "m"(*__synch_xg(ptr)), | |
40054 | + "0"(old) | |
40055 | + : "memory"); | |
40056 | + return prev; | |
40057 | +#endif | |
40058 | + } | |
40059 | + return old; | |
40060 | +} | |
40061 | + | |
40062 | +#define synch_test_bit test_bit | |
40063 | + | |
40064 | +#define synch_cmpxchg_subword synch_cmpxchg | |
40065 | + | |
40066 | +#endif /* __XEN_SYNCH_BITOPS_H__ */ | |
40067 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h | |
40068 | =================================================================== | |
40069 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40070 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200 | |
40071 | @@ -0,0 +1,488 @@ | |
40072 | +#ifndef __ASM_SYSTEM_H | |
40073 | +#define __ASM_SYSTEM_H | |
40074 | + | |
40075 | +#include <linux/kernel.h> | |
40076 | +#include <asm/segment.h> | |
40077 | +#include <asm/cpufeature.h> | |
40078 | +#include <linux/bitops.h> /* for LOCK_PREFIX */ | |
40079 | +#include <asm/synch_bitops.h> | |
40080 | +#include <asm/hypervisor.h> | |
40081 | + | |
40082 | +#ifdef __KERNEL__ | |
40083 | + | |
40084 | +struct task_struct; /* one of the stranger aspects of C forward declarations.. */ | |
40085 | +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); | |
40086 | + | |
40087 | +/* | |
40088 | + * Saving eflags is important. It switches not only IOPL between tasks, | |
40089 | + * it also protects other tasks from NT leaking through sysenter etc. | |
40090 | + */ | |
40091 | +#define switch_to(prev,next,last) do { \ | |
40092 | + unsigned long esi,edi; \ | |
40093 | + asm volatile("pushfl\n\t" /* Save flags */ \ | |
40094 | + "pushl %%ebp\n\t" \ | |
40095 | + "movl %%esp,%0\n\t" /* save ESP */ \ | |
40096 | + "movl %5,%%esp\n\t" /* restore ESP */ \ | |
40097 | + "movl $1f,%1\n\t" /* save EIP */ \ | |
40098 | + "pushl %6\n\t" /* restore EIP */ \ | |
40099 | + "jmp __switch_to\n" \ | |
40100 | + "1:\t" \ | |
40101 | + "popl %%ebp\n\t" \ | |
40102 | + "popfl" \ | |
40103 | + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ | |
40104 | + "=a" (last),"=S" (esi),"=D" (edi) \ | |
40105 | + :"m" (next->thread.esp),"m" (next->thread.eip), \ | |
40106 | + "2" (prev), "d" (next)); \ | |
40107 | +} while (0) | |
40108 | + | |
40109 | +#define _set_base(addr,base) do { unsigned long __pr; \ | |
40110 | +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
40111 | + "rorl $16,%%edx\n\t" \ | |
40112 | + "movb %%dl,%2\n\t" \ | |
40113 | + "movb %%dh,%3" \ | |
40114 | + :"=&d" (__pr) \ | |
40115 | + :"m" (*((addr)+2)), \ | |
40116 | + "m" (*((addr)+4)), \ | |
40117 | + "m" (*((addr)+7)), \ | |
40118 | + "0" (base) \ | |
40119 | + ); } while(0) | |
40120 | + | |
40121 | +#define _set_limit(addr,limit) do { unsigned long __lr; \ | |
40122 | +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ | |
40123 | + "rorl $16,%%edx\n\t" \ | |
40124 | + "movb %2,%%dh\n\t" \ | |
40125 | + "andb $0xf0,%%dh\n\t" \ | |
40126 | + "orb %%dh,%%dl\n\t" \ | |
40127 | + "movb %%dl,%2" \ | |
40128 | + :"=&d" (__lr) \ | |
40129 | + :"m" (*(addr)), \ | |
40130 | + "m" (*((addr)+6)), \ | |
40131 | + "0" (limit) \ | |
40132 | + ); } while(0) | |
40133 | + | |
40134 | +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) | |
40135 | +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) ) | |
40136 | + | |
40137 | +/* | |
40138 | + * Load a segment. Fall back on loading the zero | |
40139 | + * segment if something goes wrong.. | |
40140 | + */ | |
40141 | +#define loadsegment(seg,value) \ | |
40142 | + asm volatile("\n" \ | |
40143 | + "1:\t" \ | |
40144 | + "mov %0,%%" #seg "\n" \ | |
40145 | + "2:\n" \ | |
40146 | + ".section .fixup,\"ax\"\n" \ | |
40147 | + "3:\t" \ | |
40148 | + "pushl $0\n\t" \ | |
40149 | + "popl %%" #seg "\n\t" \ | |
40150 | + "jmp 2b\n" \ | |
40151 | + ".previous\n" \ | |
40152 | + ".section __ex_table,\"a\"\n\t" \ | |
40153 | + ".align 4\n\t" \ | |
40154 | + ".long 1b,3b\n" \ | |
40155 | + ".previous" \ | |
40156 | + : :"rm" (value)) | |
40157 | + | |
40158 | +/* | |
40159 | + * Save a segment register away | |
40160 | + */ | |
40161 | +#define savesegment(seg, value) \ | |
40162 | + asm volatile("mov %%" #seg ",%0":"=rm" (value)) | |
40163 | + | |
40164 | +#define read_cr0() ({ \ | |
40165 | + unsigned int __dummy; \ | |
40166 | + __asm__ __volatile__( \ | |
40167 | + "movl %%cr0,%0\n\t" \ | |
40168 | + :"=r" (__dummy)); \ | |
40169 | + __dummy; \ | |
40170 | +}) | |
40171 | +#define write_cr0(x) \ | |
40172 | + __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) | |
40173 | + | |
40174 | +#define read_cr2() (current_vcpu_info()->arch.cr2) | |
40175 | +#define write_cr2(x) \ | |
40176 | + __asm__ __volatile__("movl %0,%%cr2": :"r" (x)) | |
40177 | + | |
40178 | +#define read_cr3() ({ \ | |
40179 | + unsigned int __dummy; \ | |
40180 | + __asm__ ( \ | |
40181 | + "movl %%cr3,%0\n\t" \ | |
40182 | + :"=r" (__dummy)); \ | |
40183 | + __dummy = xen_cr3_to_pfn(__dummy); \ | |
40184 | + mfn_to_pfn(__dummy) << PAGE_SHIFT; \ | |
40185 | +}) | |
40186 | +#define write_cr3(x) ({ \ | |
40187 | + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \ | |
40188 | + __dummy = xen_pfn_to_cr3(__dummy); \ | |
40189 | + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \ | |
40190 | +}) | |
40191 | +#define read_cr4() ({ \ | |
40192 | + unsigned int __dummy; \ | |
40193 | + __asm__( \ | |
40194 | + "movl %%cr4,%0\n\t" \ | |
40195 | + :"=r" (__dummy)); \ | |
40196 | + __dummy; \ | |
40197 | +}) | |
40198 | +#define read_cr4_safe() ({ \ | |
40199 | + unsigned int __dummy; \ | |
40200 | + /* This could fault if %cr4 does not exist */ \ | |
40201 | + __asm__("1: movl %%cr4, %0 \n" \ | |
40202 | + "2: \n" \ | |
40203 | + ".section __ex_table,\"a\" \n" \ | |
40204 | + ".long 1b,2b \n" \ | |
40205 | + ".previous \n" \ | |
40206 | + : "=r" (__dummy): "0" (0)); \ | |
40207 | + __dummy; \ | |
40208 | +}) | |
40209 | + | |
40210 | +#define write_cr4(x) \ | |
40211 | + __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) | |
40212 | + | |
40213 | +/* | |
40214 | + * Clear and set 'TS' bit respectively | |
40215 | + */ | |
40216 | +#define clts() (HYPERVISOR_fpu_taskswitch(0)) | |
40217 | +#define stts() (HYPERVISOR_fpu_taskswitch(1)) | |
40218 | + | |
40219 | +#endif /* __KERNEL__ */ | |
40220 | + | |
40221 | +#define wbinvd() \ | |
40222 | + __asm__ __volatile__ ("wbinvd": : :"memory") | |
40223 | + | |
40224 | +static inline unsigned long get_limit(unsigned long segment) | |
40225 | +{ | |
40226 | + unsigned long __limit; | |
40227 | + __asm__("lsll %1,%0" | |
40228 | + :"=r" (__limit):"r" (segment)); | |
40229 | + return __limit+1; | |
40230 | +} | |
40231 | + | |
40232 | +#define nop() __asm__ __volatile__ ("nop") | |
40233 | + | |
40234 | +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) | |
40235 | + | |
40236 | +#define tas(ptr) (xchg((ptr),1)) | |
40237 | + | |
40238 | +struct __xchg_dummy { unsigned long a[100]; }; | |
40239 | +#define __xg(x) ((struct __xchg_dummy *)(x)) | |
40240 | + | |
40241 | + | |
40242 | +#ifdef CONFIG_X86_CMPXCHG64 | |
40243 | + | |
40244 | +/* | |
40245 | + * The semantics of XCHGCMP8B are a bit strange, this is why | |
40246 | + * there is a loop and the loading of %%eax and %%edx has to | |
40247 | + * be inside. This inlines well in most cases, the cached | |
40248 | + * cost is around ~38 cycles. (in the future we might want | |
40249 | + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that | |
40250 | + * might have an implicit FPU-save as a cost, so it's not | |
40251 | + * clear which path to go.) | |
40252 | + * | |
40253 | + * cmpxchg8b must be used with the lock prefix here to allow | |
40254 | + * the instruction to be executed atomically, see page 3-102 | |
40255 | + * of the instruction set reference 24319102.pdf. We need | |
40256 | + * the reader side to see the coherent 64bit value. | |
40257 | + */ | |
40258 | +static inline void __set_64bit (unsigned long long * ptr, | |
40259 | + unsigned int low, unsigned int high) | |
40260 | +{ | |
40261 | + __asm__ __volatile__ ( | |
40262 | + "\n1:\t" | |
40263 | + "movl (%0), %%eax\n\t" | |
40264 | + "movl 4(%0), %%edx\n\t" | |
40265 | + "lock cmpxchg8b (%0)\n\t" | |
40266 | + "jnz 1b" | |
40267 | + : /* no outputs */ | |
40268 | + : "D"(ptr), | |
40269 | + "b"(low), | |
40270 | + "c"(high) | |
40271 | + : "ax","dx","memory"); | |
40272 | +} | |
40273 | + | |
40274 | +static inline void __set_64bit_constant (unsigned long long *ptr, | |
40275 | + unsigned long long value) | |
40276 | +{ | |
40277 | + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); | |
40278 | +} | |
40279 | +#define ll_low(x) *(((unsigned int*)&(x))+0) | |
40280 | +#define ll_high(x) *(((unsigned int*)&(x))+1) | |
40281 | + | |
40282 | +static inline void __set_64bit_var (unsigned long long *ptr, | |
40283 | + unsigned long long value) | |
40284 | +{ | |
40285 | + __set_64bit(ptr,ll_low(value), ll_high(value)); | |
40286 | +} | |
40287 | + | |
40288 | +#define set_64bit(ptr,value) \ | |
40289 | +(__builtin_constant_p(value) ? \ | |
40290 | + __set_64bit_constant(ptr, value) : \ | |
40291 | + __set_64bit_var(ptr, value) ) | |
40292 | + | |
40293 | +#define _set_64bit(ptr,value) \ | |
40294 | +(__builtin_constant_p(value) ? \ | |
40295 | + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ | |
40296 | + __set_64bit(ptr, ll_low(value), ll_high(value)) ) | |
40297 | + | |
40298 | +#endif | |
40299 | + | |
40300 | +/* | |
40301 | + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway | |
40302 | + * Note 2: xchg has side effect, so that attribute volatile is necessary, | |
40303 | + * but generally the primitive is invalid, *ptr is output argument. --ANK | |
40304 | + */ | |
40305 | +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) | |
40306 | +{ | |
40307 | + switch (size) { | |
40308 | + case 1: | |
40309 | + __asm__ __volatile__("xchgb %b0,%1" | |
40310 | + :"=q" (x) | |
40311 | + :"m" (*__xg(ptr)), "0" (x) | |
40312 | + :"memory"); | |
40313 | + break; | |
40314 | + case 2: | |
40315 | + __asm__ __volatile__("xchgw %w0,%1" | |
40316 | + :"=r" (x) | |
40317 | + :"m" (*__xg(ptr)), "0" (x) | |
40318 | + :"memory"); | |
40319 | + break; | |
40320 | + case 4: | |
40321 | + __asm__ __volatile__("xchgl %0,%1" | |
40322 | + :"=r" (x) | |
40323 | + :"m" (*__xg(ptr)), "0" (x) | |
40324 | + :"memory"); | |
40325 | + break; | |
40326 | + } | |
40327 | + return x; | |
40328 | +} | |
40329 | + | |
40330 | +/* | |
40331 | + * Atomic compare and exchange. Compare OLD with MEM, if identical, | |
40332 | + * store NEW in MEM. Return the initial value in MEM. Success is | |
40333 | + * indicated by comparing RETURN with OLD. | |
40334 | + */ | |
40335 | + | |
40336 | +#ifdef CONFIG_X86_CMPXCHG | |
40337 | +#define __HAVE_ARCH_CMPXCHG 1 | |
40338 | +#define cmpxchg(ptr,o,n)\ | |
40339 | + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ | |
40340 | + (unsigned long)(n),sizeof(*(ptr)))) | |
40341 | +#endif | |
40342 | + | |
40343 | +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, | |
40344 | + unsigned long new, int size) | |
40345 | +{ | |
40346 | + unsigned long prev; | |
40347 | + switch (size) { | |
40348 | + case 1: | |
40349 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" | |
40350 | + : "=a"(prev) | |
40351 | + : "q"(new), "m"(*__xg(ptr)), "0"(old) | |
40352 | + : "memory"); | |
40353 | + return prev; | |
40354 | + case 2: | |
40355 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" | |
40356 | + : "=a"(prev) | |
40357 | + : "r"(new), "m"(*__xg(ptr)), "0"(old) | |
40358 | + : "memory"); | |
40359 | + return prev; | |
40360 | + case 4: | |
40361 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" | |
40362 | + : "=a"(prev) | |
40363 | + : "r"(new), "m"(*__xg(ptr)), "0"(old) | |
40364 | + : "memory"); | |
40365 | + return prev; | |
40366 | + } | |
40367 | + return old; | |
40368 | +} | |
40369 | + | |
40370 | +#ifndef CONFIG_X86_CMPXCHG | |
40371 | +/* | |
40372 | + * Building a kernel capable running on 80386. It may be necessary to | |
40373 | + * simulate the cmpxchg on the 80386 CPU. For that purpose we define | |
40374 | + * a function for each of the sizes we support. | |
40375 | + */ | |
40376 | + | |
40377 | +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); | |
40378 | +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); | |
40379 | +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); | |
40380 | + | |
40381 | +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, | |
40382 | + unsigned long new, int size) | |
40383 | +{ | |
40384 | + switch (size) { | |
40385 | + case 1: | |
40386 | + return cmpxchg_386_u8(ptr, old, new); | |
40387 | + case 2: | |
40388 | + return cmpxchg_386_u16(ptr, old, new); | |
40389 | + case 4: | |
40390 | + return cmpxchg_386_u32(ptr, old, new); | |
40391 | + } | |
40392 | + return old; | |
40393 | +} | |
40394 | + | |
40395 | +#define cmpxchg(ptr,o,n) \ | |
40396 | +({ \ | |
40397 | + __typeof__(*(ptr)) __ret; \ | |
40398 | + if (likely(boot_cpu_data.x86 > 3)) \ | |
40399 | + __ret = __cmpxchg((ptr), (unsigned long)(o), \ | |
40400 | + (unsigned long)(n), sizeof(*(ptr))); \ | |
40401 | + else \ | |
40402 | + __ret = cmpxchg_386((ptr), (unsigned long)(o), \ | |
40403 | + (unsigned long)(n), sizeof(*(ptr))); \ | |
40404 | + __ret; \ | |
40405 | +}) | |
40406 | +#endif | |
40407 | + | |
40408 | +#ifdef CONFIG_X86_CMPXCHG64 | |
40409 | + | |
40410 | +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, | |
40411 | + unsigned long long new) | |
40412 | +{ | |
40413 | + unsigned long long prev; | |
40414 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" | |
40415 | + : "=A"(prev) | |
40416 | + : "b"((unsigned long)new), | |
40417 | + "c"((unsigned long)(new >> 32)), | |
40418 | + "m"(*__xg(ptr)), | |
40419 | + "0"(old) | |
40420 | + : "memory"); | |
40421 | + return prev; | |
40422 | +} | |
40423 | + | |
40424 | +#define cmpxchg64(ptr,o,n)\ | |
40425 | + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ | |
40426 | + (unsigned long long)(n))) | |
40427 | + | |
40428 | +#endif | |
40429 | + | |
40430 | +/* | |
40431 | + * Force strict CPU ordering. | |
40432 | + * And yes, this is required on UP too when we're talking | |
40433 | + * to devices. | |
40434 | + * | |
40435 | + * For now, "wmb()" doesn't actually do anything, as all | |
40436 | + * Intel CPU's follow what Intel calls a *Processor Order*, | |
40437 | + * in which all writes are seen in the program order even | |
40438 | + * outside the CPU. | |
40439 | + * | |
40440 | + * I expect future Intel CPU's to have a weaker ordering, | |
40441 | + * but I'd also expect them to finally get their act together | |
40442 | + * and add some real memory barriers if so. | |
40443 | + * | |
40444 | + * Some non intel clones support out of order store. wmb() ceases to be a | |
40445 | + * nop for these. | |
40446 | + */ | |
40447 | + | |
40448 | + | |
40449 | +/* | |
40450 | + * Actually only lfence would be needed for mb() because all stores done | |
40451 | + * by the kernel should be already ordered. But keep a full barrier for now. | |
40452 | + */ | |
40453 | + | |
40454 | +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) | |
40455 | +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) | |
40456 | + | |
40457 | +/** | |
40458 | + * read_barrier_depends - Flush all pending reads that subsequents reads | |
40459 | + * depend on. | |
40460 | + * | |
40461 | + * No data-dependent reads from memory-like regions are ever reordered | |
40462 | + * over this barrier. All reads preceding this primitive are guaranteed | |
40463 | + * to access memory (but not necessarily other CPUs' caches) before any | |
40464 | + * reads following this primitive that depend on the data return by | |
40465 | + * any of the preceding reads. This primitive is much lighter weight than | |
40466 | + * rmb() on most CPUs, and is never heavier weight than is | |
40467 | + * rmb(). | |
40468 | + * | |
40469 | + * These ordering constraints are respected by both the local CPU | |
40470 | + * and the compiler. | |
40471 | + * | |
40472 | + * Ordering is not guaranteed by anything other than these primitives, | |
40473 | + * not even by data dependencies. See the documentation for | |
40474 | + * memory_barrier() for examples and URLs to more information. | |
40475 | + * | |
40476 | + * For example, the following code would force ordering (the initial | |
40477 | + * value of "a" is zero, "b" is one, and "p" is "&a"): | |
40478 | + * | |
40479 | + * <programlisting> | |
40480 | + * CPU 0 CPU 1 | |
40481 | + * | |
40482 | + * b = 2; | |
40483 | + * memory_barrier(); | |
40484 | + * p = &b; q = p; | |
40485 | + * read_barrier_depends(); | |
40486 | + * d = *q; | |
40487 | + * </programlisting> | |
40488 | + * | |
40489 | + * because the read of "*q" depends on the read of "p" and these | |
40490 | + * two reads are separated by a read_barrier_depends(). However, | |
40491 | + * the following code, with the same initial values for "a" and "b": | |
40492 | + * | |
40493 | + * <programlisting> | |
40494 | + * CPU 0 CPU 1 | |
40495 | + * | |
40496 | + * a = 2; | |
40497 | + * memory_barrier(); | |
40498 | + * b = 3; y = b; | |
40499 | + * read_barrier_depends(); | |
40500 | + * x = a; | |
40501 | + * </programlisting> | |
40502 | + * | |
40503 | + * does not enforce ordering, since there is no data dependency between | |
40504 | + * the read of "a" and the read of "b". Therefore, on some CPUs, such | |
40505 | + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() | |
40506 | + * in cases like this where there are no data dependencies. | |
40507 | + **/ | |
40508 | + | |
40509 | +#define read_barrier_depends() do { } while(0) | |
40510 | + | |
40511 | +#ifdef CONFIG_X86_OOSTORE | |
40512 | +/* Actually there are no OOO store capable CPUs for now that do SSE, | |
40513 | + but make it already an possibility. */ | |
40514 | +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) | |
40515 | +#else | |
40516 | +#define wmb() __asm__ __volatile__ ("": : :"memory") | |
40517 | +#endif | |
40518 | + | |
40519 | +#ifdef CONFIG_SMP | |
40520 | +#define smp_mb() mb() | |
40521 | +#define smp_rmb() rmb() | |
40522 | +#define smp_wmb() wmb() | |
40523 | +#define smp_read_barrier_depends() read_barrier_depends() | |
40524 | +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) | |
40525 | +#else | |
40526 | +#define smp_mb() barrier() | |
40527 | +#define smp_rmb() barrier() | |
40528 | +#define smp_wmb() barrier() | |
40529 | +#define smp_read_barrier_depends() do { } while(0) | |
40530 | +#define set_mb(var, value) do { var = value; barrier(); } while (0) | |
40531 | +#endif | |
40532 | + | |
40533 | +#include <linux/irqflags.h> | |
40534 | + | |
40535 | +/* | |
40536 | + * disable hlt during certain critical i/o operations | |
40537 | + */ | |
40538 | +#define HAVE_DISABLE_HLT | |
40539 | +void disable_hlt(void); | |
40540 | +void enable_hlt(void); | |
40541 | + | |
40542 | +extern int es7000_plat; | |
40543 | +void cpu_idle_wait(void); | |
40544 | + | |
40545 | +/* | |
40546 | + * On SMP systems, when the scheduler does migration-cost autodetection, | |
40547 | + * it needs a way to flush as much of the CPU's caches as possible: | |
40548 | + */ | |
40549 | +static inline void sched_cacheflush(void) | |
40550 | +{ | |
40551 | + wbinvd(); | |
40552 | +} | |
40553 | + | |
40554 | +extern unsigned long arch_align_stack(unsigned long sp); | |
40555 | +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); | |
40556 | + | |
40557 | +void default_idle(void); | |
40558 | + | |
40559 | +#endif | |
40560 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h | |
40561 | =================================================================== | |
40562 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40563 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100 | |
40564 | @@ -0,0 +1,101 @@ | |
40565 | +#ifndef _I386_TLBFLUSH_H | |
40566 | +#define _I386_TLBFLUSH_H | |
40567 | + | |
40568 | +#include <linux/mm.h> | |
40569 | +#include <asm/processor.h> | |
40570 | + | |
40571 | +#define __flush_tlb() xen_tlb_flush() | |
40572 | +#define __flush_tlb_global() xen_tlb_flush() | |
40573 | +#define __flush_tlb_all() xen_tlb_flush() | |
40574 | + | |
40575 | +extern unsigned long pgkern_mask; | |
40576 | + | |
40577 | +#define cpu_has_invlpg (boot_cpu_data.x86 > 3) | |
40578 | + | |
40579 | +#define __flush_tlb_single(addr) xen_invlpg(addr) | |
40580 | + | |
40581 | +#define __flush_tlb_one(addr) __flush_tlb_single(addr) | |
40582 | + | |
40583 | +/* | |
40584 | + * TLB flushing: | |
40585 | + * | |
40586 | + * - flush_tlb() flushes the current mm struct TLBs | |
40587 | + * - flush_tlb_all() flushes all processes TLBs | |
40588 | + * - flush_tlb_mm(mm) flushes the specified mm context TLB's | |
40589 | + * - flush_tlb_page(vma, vmaddr) flushes one page | |
40590 | + * - flush_tlb_range(vma, start, end) flushes a range of pages | |
40591 | + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | |
40592 | + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables | |
40593 | + * | |
40594 | + * ..but the i386 has somewhat limited tlb flushing capabilities, | |
40595 | + * and page-granular flushes are available only on i486 and up. | |
40596 | + */ | |
40597 | + | |
40598 | +#ifndef CONFIG_SMP | |
40599 | + | |
40600 | +#define flush_tlb() __flush_tlb() | |
40601 | +#define flush_tlb_all() __flush_tlb_all() | |
40602 | +#define local_flush_tlb() __flush_tlb() | |
40603 | + | |
40604 | +static inline void flush_tlb_mm(struct mm_struct *mm) | |
40605 | +{ | |
40606 | + if (mm == current->active_mm) | |
40607 | + __flush_tlb(); | |
40608 | +} | |
40609 | + | |
40610 | +static inline void flush_tlb_page(struct vm_area_struct *vma, | |
40611 | + unsigned long addr) | |
40612 | +{ | |
40613 | + if (vma->vm_mm == current->active_mm) | |
40614 | + __flush_tlb_one(addr); | |
40615 | +} | |
40616 | + | |
40617 | +static inline void flush_tlb_range(struct vm_area_struct *vma, | |
40618 | + unsigned long start, unsigned long end) | |
40619 | +{ | |
40620 | + if (vma->vm_mm == current->active_mm) | |
40621 | + __flush_tlb(); | |
40622 | +} | |
40623 | + | |
40624 | +#else | |
40625 | + | |
40626 | +#include <asm/smp.h> | |
40627 | + | |
40628 | +#define local_flush_tlb() \ | |
40629 | + __flush_tlb() | |
40630 | + | |
40631 | +#define flush_tlb_all xen_tlb_flush_all | |
40632 | +#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask) | |
40633 | +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) | |
40634 | +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) | |
40635 | + | |
40636 | +#define flush_tlb() flush_tlb_current_task() | |
40637 | + | |
40638 | +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) | |
40639 | +{ | |
40640 | + flush_tlb_mm(vma->vm_mm); | |
40641 | +} | |
40642 | + | |
40643 | +#define TLBSTATE_OK 1 | |
40644 | +#define TLBSTATE_LAZY 2 | |
40645 | + | |
40646 | +struct tlb_state | |
40647 | +{ | |
40648 | + struct mm_struct *active_mm; | |
40649 | + int state; | |
40650 | + char __cacheline_padding[L1_CACHE_BYTES-8]; | |
40651 | +}; | |
40652 | +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); | |
40653 | + | |
40654 | + | |
40655 | +#endif | |
40656 | + | |
40657 | +#define flush_tlb_kernel_range(start, end) flush_tlb_all() | |
40658 | + | |
40659 | +static inline void flush_tlb_pgtables(struct mm_struct *mm, | |
40660 | + unsigned long start, unsigned long end) | |
40661 | +{ | |
40662 | + /* i386 does not keep any page table caches in TLB */ | |
40663 | +} | |
40664 | + | |
40665 | +#endif /* _I386_TLBFLUSH_H */ | |
40666 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h | |
40667 | =================================================================== | |
40668 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40669 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200 | |
40670 | @@ -0,0 +1,20 @@ | |
40671 | +/* | |
40672 | + * Access to VGA videoram | |
40673 | + * | |
40674 | + * (c) 1998 Martin Mares <mj@ucw.cz> | |
40675 | + */ | |
40676 | + | |
40677 | +#ifndef _LINUX_ASM_VGA_H_ | |
40678 | +#define _LINUX_ASM_VGA_H_ | |
40679 | + | |
40680 | +/* | |
40681 | + * On the PC, we can just recalculate addresses and then | |
40682 | + * access the videoram directly without any black magic. | |
40683 | + */ | |
40684 | + | |
40685 | +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x) | |
40686 | + | |
40687 | +#define vga_readb(x) (*(x)) | |
40688 | +#define vga_writeb(x,y) (*(y) = (x)) | |
40689 | + | |
40690 | +#endif | |
40691 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h | |
40692 | =================================================================== | |
40693 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40694 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200 | |
40695 | @@ -0,0 +1,48 @@ | |
40696 | +/****************************************************************************** | |
40697 | + * asm-i386/mach-xen/asm/xenoprof.h | |
40698 | + * | |
40699 | + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> | |
40700 | + * VA Linux Systems Japan K.K. | |
40701 | + * | |
40702 | + * This program is free software; you can redistribute it and/or modify | |
40703 | + * it under the terms of the GNU General Public License as published by | |
40704 | + * the Free Software Foundation; either version 2 of the License, or | |
40705 | + * (at your option) any later version. | |
40706 | + * | |
40707 | + * This program is distributed in the hope that it will be useful, | |
40708 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
40709 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
40710 | + * GNU General Public License for more details. | |
40711 | + * | |
40712 | + * You should have received a copy of the GNU General Public License | |
40713 | + * along with this program; if not, write to the Free Software | |
40714 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
40715 | + * | |
40716 | + */ | |
40717 | +#ifndef __ASM_XENOPROF_H__ | |
40718 | +#define __ASM_XENOPROF_H__ | |
40719 | +#ifdef CONFIG_XEN | |
40720 | + | |
40721 | +struct super_block; | |
40722 | +struct dentry; | |
40723 | +int xenoprof_create_files(struct super_block * sb, struct dentry * root); | |
40724 | +#define HAVE_XENOPROF_CREATE_FILES | |
40725 | + | |
40726 | +struct xenoprof_init; | |
40727 | +void xenoprof_arch_init_counter(struct xenoprof_init *init); | |
40728 | +void xenoprof_arch_counter(void); | |
40729 | +void xenoprof_arch_start(void); | |
40730 | +void xenoprof_arch_stop(void); | |
40731 | + | |
40732 | +struct xenoprof_arch_shared_buffer { | |
40733 | + /* nothing */ | |
40734 | +}; | |
40735 | +struct xenoprof_shared_buffer; | |
40736 | +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); | |
40737 | +struct xenoprof_get_buffer; | |
40738 | +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); | |
40739 | +struct xenoprof_passive; | |
40740 | +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); | |
40741 | + | |
40742 | +#endif /* CONFIG_XEN */ | |
40743 | +#endif /* __ASM_XENOPROF_H__ */ | |
40744 | Index: head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h | |
40745 | =================================================================== | |
40746 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40747 | +++ head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200 | |
40748 | @@ -0,0 +1,125 @@ | |
40749 | +/* | |
40750 | + * This file should contain #defines for all of the interrupt vector | |
40751 | + * numbers used by this architecture. | |
40752 | + * | |
40753 | + * In addition, there are some standard defines: | |
40754 | + * | |
40755 | + * FIRST_EXTERNAL_VECTOR: | |
40756 | + * The first free place for external interrupts | |
40757 | + * | |
40758 | + * SYSCALL_VECTOR: | |
40759 | + * The IRQ vector a syscall makes the user to kernel transition | |
40760 | + * under. | |
40761 | + * | |
40762 | + * TIMER_IRQ: | |
40763 | + * The IRQ number the timer interrupt comes in at. | |
40764 | + * | |
40765 | + * NR_IRQS: | |
40766 | + * The total number of interrupt vectors (including all the | |
40767 | + * architecture specific interrupts) needed. | |
40768 | + * | |
40769 | + */ | |
40770 | +#ifndef _ASM_IRQ_VECTORS_H | |
40771 | +#define _ASM_IRQ_VECTORS_H | |
40772 | + | |
40773 | +/* | |
40774 | + * IDT vectors usable for external interrupt sources start | |
40775 | + * at 0x20: | |
40776 | + */ | |
40777 | +#define FIRST_EXTERNAL_VECTOR 0x20 | |
40778 | + | |
40779 | +#define SYSCALL_VECTOR 0x80 | |
40780 | + | |
40781 | +/* | |
40782 | + * Vectors 0x20-0x2f are used for ISA interrupts. | |
40783 | + */ | |
40784 | + | |
40785 | +#if 0 | |
40786 | +/* | |
40787 | + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff | |
40788 | + * | |
40789 | + * some of the following vectors are 'rare', they are merged | |
40790 | + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. | |
40791 | + * TLB, reschedule and local APIC vectors are performance-critical. | |
40792 | + * | |
40793 | + * Vectors 0xf0-0xfa are free (reserved for future Linux use). | |
40794 | + */ | |
40795 | +#define SPURIOUS_APIC_VECTOR 0xff | |
40796 | +#define ERROR_APIC_VECTOR 0xfe | |
40797 | +#define INVALIDATE_TLB_VECTOR 0xfd | |
40798 | +#define RESCHEDULE_VECTOR 0xfc | |
40799 | +#define CALL_FUNCTION_VECTOR 0xfb | |
40800 | + | |
40801 | +#define THERMAL_APIC_VECTOR 0xf0 | |
40802 | +/* | |
40803 | + * Local APIC timer IRQ vector is on a different priority level, | |
40804 | + * to work around the 'lost local interrupt if more than 2 IRQ | |
40805 | + * sources per level' errata. | |
40806 | + */ | |
40807 | +#define LOCAL_TIMER_VECTOR 0xef | |
40808 | +#endif | |
40809 | + | |
40810 | +#define SPURIOUS_APIC_VECTOR 0xff | |
40811 | +#define ERROR_APIC_VECTOR 0xfe | |
40812 | + | |
40813 | +/* | |
40814 | + * First APIC vector available to drivers: (vectors 0x30-0xee) | |
40815 | + * we start at 0x31 to spread out vectors evenly between priority | |
40816 | + * levels. (0x80 is the syscall vector) | |
40817 | + */ | |
40818 | +#define FIRST_DEVICE_VECTOR 0x31 | |
40819 | +#define FIRST_SYSTEM_VECTOR 0xef | |
40820 | + | |
40821 | +/* | |
40822 | + * 16 8259A IRQ's, 208 potential APIC interrupt sources. | |
40823 | + * Right now the APIC is mostly only used for SMP. | |
40824 | + * 256 vectors is an architectural limit. (we can have | |
40825 | + * more than 256 devices theoretically, but they will | |
40826 | + * have to use shared interrupts) | |
40827 | + * Since vectors 0x00-0x1f are used/reserved for the CPU, | |
40828 | + * the usable vector space is 0x20-0xff (224 vectors) | |
40829 | + */ | |
40830 | + | |
40831 | +#define RESCHEDULE_VECTOR 0 | |
40832 | +#define CALL_FUNCTION_VECTOR 1 | |
40833 | +#define NR_IPIS 2 | |
40834 | + | |
40835 | +/* | |
40836 | + * The maximum number of vectors supported by i386 processors | |
40837 | + * is limited to 256. For processors other than i386, NR_VECTORS | |
40838 | + * should be changed accordingly. | |
40839 | + */ | |
40840 | +#define NR_VECTORS 256 | |
40841 | + | |
40842 | +#define FPU_IRQ 13 | |
40843 | + | |
40844 | +#define FIRST_VM86_IRQ 3 | |
40845 | +#define LAST_VM86_IRQ 15 | |
40846 | +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) | |
40847 | + | |
40848 | +/* | |
40849 | + * The flat IRQ space is divided into two regions: | |
40850 | + * 1. A one-to-one mapping of real physical IRQs. This space is only used | |
40851 | + * if we have physical device-access privilege. This region is at the | |
40852 | + * start of the IRQ space so that existing device drivers do not need | |
40853 | + * to be modified to translate physical IRQ numbers into our IRQ space. | |
40854 | + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These | |
40855 | + * are bound using the provided bind/unbind functions. | |
40856 | + */ | |
40857 | + | |
40858 | +#define PIRQ_BASE 0 | |
40859 | +#if !defined(MAX_IO_APICS) | |
40860 | +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) | |
40861 | +#elif NR_CPUS < MAX_IO_APICS | |
40862 | +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) | |
40863 | +#else | |
40864 | +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS) | |
40865 | +#endif | |
40866 | + | |
40867 | +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) | |
40868 | +#define NR_DYNIRQS 256 | |
40869 | + | |
40870 | +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) | |
40871 | +#define NR_IRQ_VECTORS NR_IRQS | |
40872 | + | |
40873 | +#endif /* _ASM_IRQ_VECTORS_H */ | |
40874 | Index: head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h | |
40875 | =================================================================== | |
40876 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40877 | +++ head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h 2007-06-12 13:14:02.000000000 +0200 | |
40878 | @@ -0,0 +1,33 @@ | |
40879 | +/* | |
40880 | + * include/asm-xen/asm-i386/mach-xen/mach_traps.h | |
40881 | + * | |
40882 | + * Machine specific NMI handling for Xen | |
40883 | + */ | |
40884 | +#ifndef _MACH_TRAPS_H | |
40885 | +#define _MACH_TRAPS_H | |
40886 | + | |
40887 | +#include <linux/bitops.h> | |
40888 | +#include <xen/interface/nmi.h> | |
40889 | + | |
40890 | +static inline void clear_mem_error(unsigned char reason) {} | |
40891 | +static inline void clear_io_check_error(unsigned char reason) {} | |
40892 | + | |
40893 | +static inline unsigned char get_nmi_reason(void) | |
40894 | +{ | |
40895 | + shared_info_t *s = HYPERVISOR_shared_info; | |
40896 | + unsigned char reason = 0; | |
40897 | + | |
40898 | + /* construct a value which looks like it came from | |
40899 | + * port 0x61. | |
40900 | + */ | |
40901 | + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) | |
40902 | + reason |= 0x40; | |
40903 | + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) | |
40904 | + reason |= 0x80; | |
40905 | + | |
40906 | + return reason; | |
40907 | +} | |
40908 | + | |
40909 | +static inline void reassert_nmi(void) {} | |
40910 | + | |
40911 | +#endif /* !_MACH_TRAPS_H */ | |
40912 | Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h | |
40913 | =================================================================== | |
40914 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40915 | +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h 2007-06-12 13:14:02.000000000 +0200 | |
40916 | @@ -0,0 +1,5 @@ | |
40917 | +/* Hook to call BIOS initialisation function */ | |
40918 | + | |
40919 | +#define ARCH_SETUP machine_specific_arch_setup(); | |
40920 | + | |
40921 | +void __init machine_specific_arch_setup(void); | |
40922 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h | |
40923 | =================================================================== | |
40924 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
40925 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100 | |
40926 | @@ -0,0 +1,265 @@ | |
40927 | +/* Written 2000 by Andi Kleen */ | |
40928 | +#ifndef __ARCH_DESC_H | |
40929 | +#define __ARCH_DESC_H | |
40930 | + | |
40931 | +#include <linux/threads.h> | |
40932 | +#include <asm/ldt.h> | |
40933 | + | |
40934 | +#ifndef __ASSEMBLY__ | |
40935 | + | |
40936 | +#include <linux/string.h> | |
40937 | +#include <linux/smp.h> | |
40938 | + | |
40939 | +#include <asm/segment.h> | |
40940 | +#include <asm/mmu.h> | |
40941 | + | |
40942 | +// 8 byte segment descriptor | |
40943 | +struct desc_struct { | |
40944 | + u16 limit0; | |
40945 | + u16 base0; | |
40946 | + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1; | |
40947 | + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8; | |
40948 | +} __attribute__((packed)); | |
40949 | + | |
40950 | +struct n_desc_struct { | |
40951 | + unsigned int a,b; | |
40952 | +}; | |
40953 | + | |
40954 | +enum { | |
40955 | + GATE_INTERRUPT = 0xE, | |
40956 | + GATE_TRAP = 0xF, | |
40957 | + GATE_CALL = 0xC, | |
40958 | +}; | |
40959 | + | |
40960 | +// 16byte gate | |
40961 | +struct gate_struct { | |
40962 | + u16 offset_low; | |
40963 | + u16 segment; | |
40964 | + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; | |
40965 | + u16 offset_middle; | |
40966 | + u32 offset_high; | |
40967 | + u32 zero1; | |
40968 | +} __attribute__((packed)); | |
40969 | + | |
40970 | +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) | |
40971 | +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF) | |
40972 | +#define PTR_HIGH(x) ((unsigned long)(x) >> 32) | |
40973 | + | |
40974 | +enum { | |
40975 | + DESC_TSS = 0x9, | |
40976 | + DESC_LDT = 0x2, | |
40977 | +}; | |
40978 | + | |
40979 | +// LDT or TSS descriptor in the GDT. 16 bytes. | |
40980 | +struct ldttss_desc { | |
40981 | + u16 limit0; | |
40982 | + u16 base0; | |
40983 | + unsigned base1 : 8, type : 5, dpl : 2, p : 1; | |
40984 | + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; | |
40985 | + u32 base3; | |
40986 | + u32 zero1; | |
40987 | +} __attribute__((packed)); | |
40988 | + | |
40989 | +struct desc_ptr { | |
40990 | + unsigned short size; | |
40991 | + unsigned long address; | |
40992 | +} __attribute__((packed)) ; | |
40993 | + | |
40994 | +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS]; | |
40995 | + | |
40996 | +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; | |
40997 | + | |
40998 | +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8)) | |
40999 | +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8)) | |
41000 | + | |
41001 | +static inline void clear_LDT(void) | |
41002 | +{ | |
41003 | + int cpu = get_cpu(); | |
41004 | + | |
41005 | + /* | |
41006 | + * NB. We load the default_ldt for lcall7/27 handling on demand, as | |
41007 | + * it slows down context switching. Noone uses it anyway. | |
41008 | + */ | |
41009 | + cpu = cpu; /* XXX avoid compiler warning */ | |
41010 | + xen_set_ldt(NULL, 0); | |
41011 | + put_cpu(); | |
41012 | +} | |
41013 | + | |
41014 | +/* | |
41015 | + * This is the ldt that every process will get unless we need | |
41016 | + * something other than this. | |
41017 | + */ | |
41018 | +extern struct desc_struct default_ldt[]; | |
41019 | +#ifndef CONFIG_X86_NO_IDT | |
41020 | +extern struct gate_struct idt_table[]; | |
41021 | +#endif | |
41022 | +extern struct desc_ptr cpu_gdt_descr[]; | |
41023 | + | |
41024 | +/* the cpu gdt accessor */ | |
41025 | +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) | |
41026 | + | |
41027 | +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) | |
41028 | +{ | |
41029 | + struct gate_struct s; | |
41030 | + s.offset_low = PTR_LOW(func); | |
41031 | + s.segment = __KERNEL_CS; | |
41032 | + s.ist = ist; | |
41033 | + s.p = 1; | |
41034 | + s.dpl = dpl; | |
41035 | + s.zero0 = 0; | |
41036 | + s.zero1 = 0; | |
41037 | + s.type = type; | |
41038 | + s.offset_middle = PTR_MIDDLE(func); | |
41039 | + s.offset_high = PTR_HIGH(func); | |
41040 | + /* does not need to be atomic because it is only done once at setup time */ | |
41041 | + memcpy(adr, &s, 16); | |
41042 | +} | |
41043 | + | |
41044 | +#ifndef CONFIG_X86_NO_IDT | |
41045 | +static inline void set_intr_gate(int nr, void *func) | |
41046 | +{ | |
41047 | + BUG_ON((unsigned)nr > 0xFF); | |
41048 | + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); | |
41049 | +} | |
41050 | + | |
41051 | +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) | |
41052 | +{ | |
41053 | + BUG_ON((unsigned)nr > 0xFF); | |
41054 | + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); | |
41055 | +} | |
41056 | + | |
41057 | +static inline void set_system_gate(int nr, void *func) | |
41058 | +{ | |
41059 | + BUG_ON((unsigned)nr > 0xFF); | |
41060 | + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); | |
41061 | +} | |
41062 | + | |
41063 | +static inline void set_system_gate_ist(int nr, void *func, unsigned ist) | |
41064 | +{ | |
41065 | + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); | |
41066 | +} | |
41067 | +#endif | |
41068 | + | |
41069 | +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, | |
41070 | + unsigned size) | |
41071 | +{ | |
41072 | + struct ldttss_desc d; | |
41073 | + memset(&d,0,sizeof(d)); | |
41074 | + d.limit0 = size & 0xFFFF; | |
41075 | + d.base0 = PTR_LOW(tss); | |
41076 | + d.base1 = PTR_MIDDLE(tss) & 0xFF; | |
41077 | + d.type = type; | |
41078 | + d.p = 1; | |
41079 | + d.limit1 = (size >> 16) & 0xF; | |
41080 | + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; | |
41081 | + d.base3 = PTR_HIGH(tss); | |
41082 | + memcpy(ptr, &d, 16); | |
41083 | +} | |
41084 | + | |
41085 | +#ifndef CONFIG_X86_NO_TSS | |
41086 | +static inline void set_tss_desc(unsigned cpu, void *addr) | |
41087 | +{ | |
41088 | + /* | |
41089 | + * sizeof(unsigned long) coming from an extra "long" at the end | |
41090 | + * of the iobitmap. See tss_struct definition in processor.h | |
41091 | + * | |
41092 | + * -1? seg base+limit should be pointing to the address of the | |
41093 | + * last valid byte | |
41094 | + */ | |
41095 | + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], | |
41096 | + (unsigned long)addr, DESC_TSS, | |
41097 | + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); | |
41098 | +} | |
41099 | +#endif | |
41100 | + | |
41101 | +static inline void set_ldt_desc(unsigned cpu, void *addr, int size) | |
41102 | +{ | |
41103 | + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr, | |
41104 | + DESC_LDT, size * 8 - 1); | |
41105 | +} | |
41106 | + | |
41107 | +static inline void set_seg_base(unsigned cpu, int entry, void *base) | |
41108 | +{ | |
41109 | + struct desc_struct *d = &cpu_gdt(cpu)[entry]; | |
41110 | + u32 addr = (u32)(u64)base; | |
41111 | + BUG_ON((u64)base >> 32); | |
41112 | + d->base0 = addr & 0xffff; | |
41113 | + d->base1 = (addr >> 16) & 0xff; | |
41114 | + d->base2 = (addr >> 24) & 0xff; | |
41115 | +} | |
41116 | + | |
41117 | +#define LDT_entry_a(info) \ | |
41118 | + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) | |
41119 | +/* Don't allow setting of the lm bit. It is useless anyways because | |
41120 | + 64bit system calls require __USER_CS. */ | |
41121 | +#define LDT_entry_b(info) \ | |
41122 | + (((info)->base_addr & 0xff000000) | \ | |
41123 | + (((info)->base_addr & 0x00ff0000) >> 16) | \ | |
41124 | + ((info)->limit & 0xf0000) | \ | |
41125 | + (((info)->read_exec_only ^ 1) << 9) | \ | |
41126 | + ((info)->contents << 10) | \ | |
41127 | + (((info)->seg_not_present ^ 1) << 15) | \ | |
41128 | + ((info)->seg_32bit << 22) | \ | |
41129 | + ((info)->limit_in_pages << 23) | \ | |
41130 | + ((info)->useable << 20) | \ | |
41131 | + /* ((info)->lm << 21) | */ \ | |
41132 | + 0x7000) | |
41133 | + | |
41134 | +#define LDT_empty(info) (\ | |
41135 | + (info)->base_addr == 0 && \ | |
41136 | + (info)->limit == 0 && \ | |
41137 | + (info)->contents == 0 && \ | |
41138 | + (info)->read_exec_only == 1 && \ | |
41139 | + (info)->seg_32bit == 0 && \ | |
41140 | + (info)->limit_in_pages == 0 && \ | |
41141 | + (info)->seg_not_present == 1 && \ | |
41142 | + (info)->useable == 0 && \ | |
41143 | + (info)->lm == 0) | |
41144 | + | |
41145 | +#if TLS_SIZE != 24 | |
41146 | +# error update this code. | |
41147 | +#endif | |
41148 | + | |
41149 | +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) | |
41150 | +{ | |
41151 | +#if 0 | |
41152 | + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); | |
41153 | + gdt[0] = t->tls_array[0]; | |
41154 | + gdt[1] = t->tls_array[1]; | |
41155 | + gdt[2] = t->tls_array[2]; | |
41156 | +#endif | |
41157 | +#define C(i) \ | |
41158 | + if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \ | |
41159 | + t->tls_array[i])) \ | |
41160 | + BUG(); | |
41161 | + | |
41162 | + C(0); C(1); C(2); | |
41163 | +#undef C | |
41164 | +} | |
41165 | + | |
41166 | +/* | |
41167 | + * load one particular LDT into the current CPU | |
41168 | + */ | |
41169 | +static inline void load_LDT_nolock (mm_context_t *pc, int cpu) | |
41170 | +{ | |
41171 | + void *segments = pc->ldt; | |
41172 | + int count = pc->size; | |
41173 | + | |
41174 | + if (likely(!count)) | |
41175 | + segments = NULL; | |
41176 | + | |
41177 | + xen_set_ldt(segments, count); | |
41178 | +} | |
41179 | + | |
41180 | +static inline void load_LDT(mm_context_t *pc) | |
41181 | +{ | |
41182 | + int cpu = get_cpu(); | |
41183 | + load_LDT_nolock(pc, cpu); | |
41184 | + put_cpu(); | |
41185 | +} | |
41186 | + | |
41187 | +extern struct desc_ptr idt_descr; | |
41188 | + | |
41189 | +#endif /* !__ASSEMBLY__ */ | |
41190 | + | |
41191 | +#endif | |
41192 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h | |
41193 | =================================================================== | |
41194 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
41195 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2007-06-12 13:14:13.000000000 +0200 | |
41196 | @@ -0,0 +1,207 @@ | |
41197 | +#ifndef _X8664_DMA_MAPPING_H | |
41198 | +#define _X8664_DMA_MAPPING_H 1 | |
41199 | + | |
41200 | +/* | |
41201 | + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for | |
41202 | + * documentation. | |
41203 | + */ | |
41204 | + | |
41205 | + | |
41206 | +#include <asm/scatterlist.h> | |
41207 | +#include <asm/io.h> | |
41208 | +#include <asm/swiotlb.h> | |
41209 | + | |
41210 | +struct dma_mapping_ops { | |
41211 | + int (*mapping_error)(dma_addr_t dma_addr); | |
41212 | + void* (*alloc_coherent)(struct device *dev, size_t size, | |
41213 | + dma_addr_t *dma_handle, gfp_t gfp); | |
41214 | + void (*free_coherent)(struct device *dev, size_t size, | |
41215 | + void *vaddr, dma_addr_t dma_handle); | |
41216 | + dma_addr_t (*map_single)(struct device *hwdev, void *ptr, | |
41217 | + size_t size, int direction); | |
41218 | + /* like map_single, but doesn't check the device mask */ | |
41219 | + dma_addr_t (*map_simple)(struct device *hwdev, char *ptr, | |
41220 | + size_t size, int direction); | |
41221 | + void (*unmap_single)(struct device *dev, dma_addr_t addr, | |
41222 | + size_t size, int direction); | |
41223 | + void (*sync_single_for_cpu)(struct device *hwdev, | |
41224 | + dma_addr_t dma_handle, size_t size, | |
41225 | + int direction); | |
41226 | + void (*sync_single_for_device)(struct device *hwdev, | |
41227 | + dma_addr_t dma_handle, size_t size, | |
41228 | + int direction); | |
41229 | + void (*sync_single_range_for_cpu)(struct device *hwdev, | |
41230 | + dma_addr_t dma_handle, unsigned long offset, | |
41231 | + size_t size, int direction); | |
41232 | + void (*sync_single_range_for_device)(struct device *hwdev, | |
41233 | + dma_addr_t dma_handle, unsigned long offset, | |
41234 | + size_t size, int direction); | |
41235 | + void (*sync_sg_for_cpu)(struct device *hwdev, | |
41236 | + struct scatterlist *sg, int nelems, | |
41237 | + int direction); | |
41238 | + void (*sync_sg_for_device)(struct device *hwdev, | |
41239 | + struct scatterlist *sg, int nelems, | |
41240 | + int direction); | |
41241 | + int (*map_sg)(struct device *hwdev, struct scatterlist *sg, | |
41242 | + int nents, int direction); | |
41243 | + void (*unmap_sg)(struct device *hwdev, | |
41244 | + struct scatterlist *sg, int nents, | |
41245 | + int direction); | |
41246 | + int (*dma_supported)(struct device *hwdev, u64 mask); | |
41247 | + int is_phys; | |
41248 | +}; | |
41249 | + | |
41250 | +extern dma_addr_t bad_dma_address; | |
41251 | +extern struct dma_mapping_ops* dma_ops; | |
41252 | +extern int iommu_merge; | |
41253 | + | |
41254 | +static inline int valid_dma_direction(int dma_direction) | |
41255 | +{ | |
41256 | + return ((dma_direction == DMA_BIDIRECTIONAL) || | |
41257 | + (dma_direction == DMA_TO_DEVICE) || | |
41258 | + (dma_direction == DMA_FROM_DEVICE)); | |
41259 | +} | |
41260 | + | |
41261 | +#if 0 | |
41262 | +static inline int dma_mapping_error(dma_addr_t dma_addr) | |
41263 | +{ | |
41264 | + if (dma_ops->mapping_error) | |
41265 | + return dma_ops->mapping_error(dma_addr); | |
41266 | + | |
41267 | + return (dma_addr == bad_dma_address); | |
41268 | +} | |
41269 | + | |
41270 | +extern void *dma_alloc_coherent(struct device *dev, size_t size, | |
41271 | + dma_addr_t *dma_handle, gfp_t gfp); | |
41272 | +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr, | |
41273 | + dma_addr_t dma_handle); | |
41274 | + | |
41275 | +static inline dma_addr_t | |
41276 | +dma_map_single(struct device *hwdev, void *ptr, size_t size, | |
41277 | + int direction) | |
41278 | +{ | |
41279 | + BUG_ON(!valid_dma_direction(direction)); | |
41280 | + return dma_ops->map_single(hwdev, ptr, size, direction); | |
41281 | +} | |
41282 | + | |
41283 | +static inline void | |
41284 | +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size, | |
41285 | + int direction) | |
41286 | +{ | |
41287 | + BUG_ON(!valid_dma_direction(direction)); | |
41288 | + dma_ops->unmap_single(dev, addr, size, direction); | |
41289 | +} | |
41290 | + | |
41291 | +#define dma_map_page(dev,page,offset,size,dir) \ | |
41292 | + dma_map_single((dev), page_address(page)+(offset), (size), (dir)) | |
41293 | + | |
41294 | +#define dma_unmap_page dma_unmap_single | |
41295 | + | |
41296 | +static inline void | |
41297 | +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, | |
41298 | + size_t size, int direction) | |
41299 | +{ | |
41300 | + BUG_ON(!valid_dma_direction(direction)); | |
41301 | + if (dma_ops->sync_single_for_cpu) | |
41302 | + dma_ops->sync_single_for_cpu(hwdev, dma_handle, size, | |
41303 | + direction); | |
41304 | + flush_write_buffers(); | |
41305 | +} | |
41306 | + | |
41307 | +static inline void | |
41308 | +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, | |
41309 | + size_t size, int direction) | |
41310 | +{ | |
41311 | + BUG_ON(!valid_dma_direction(direction)); | |
41312 | + if (dma_ops->sync_single_for_device) | |
41313 | + dma_ops->sync_single_for_device(hwdev, dma_handle, size, | |
41314 | + direction); | |
41315 | + flush_write_buffers(); | |
41316 | +} | |
41317 | + | |
41318 | +static inline void | |
41319 | +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, | |
41320 | + unsigned long offset, size_t size, int direction) | |
41321 | +{ | |
41322 | + BUG_ON(!valid_dma_direction(direction)); | |
41323 | + if (dma_ops->sync_single_range_for_cpu) { | |
41324 | + dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction); | |
41325 | + } | |
41326 | + | |
41327 | + flush_write_buffers(); | |
41328 | +} | |
41329 | + | |
41330 | +static inline void | |
41331 | +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, | |
41332 | + unsigned long offset, size_t size, int direction) | |
41333 | +{ | |
41334 | + BUG_ON(!valid_dma_direction(direction)); | |
41335 | + if (dma_ops->sync_single_range_for_device) | |
41336 | + dma_ops->sync_single_range_for_device(hwdev, dma_handle, | |
41337 | + offset, size, direction); | |
41338 | + | |
41339 | + flush_write_buffers(); | |
41340 | +} | |
41341 | + | |
41342 | +static inline void | |
41343 | +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, | |
41344 | + int nelems, int direction) | |
41345 | +{ | |
41346 | + BUG_ON(!valid_dma_direction(direction)); | |
41347 | + if (dma_ops->sync_sg_for_cpu) | |
41348 | + dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction); | |
41349 | + flush_write_buffers(); | |
41350 | +} | |
41351 | + | |
41352 | +static inline void | |
41353 | +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, | |
41354 | + int nelems, int direction) | |
41355 | +{ | |
41356 | + BUG_ON(!valid_dma_direction(direction)); | |
41357 | + if (dma_ops->sync_sg_for_device) { | |
41358 | + dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction); | |
41359 | + } | |
41360 | + | |
41361 | + flush_write_buffers(); | |
41362 | +} | |
41363 | + | |
41364 | +static inline int | |
41365 | +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) | |
41366 | +{ | |
41367 | + BUG_ON(!valid_dma_direction(direction)); | |
41368 | + return dma_ops->map_sg(hwdev, sg, nents, direction); | |
41369 | +} | |
41370 | + | |
41371 | +static inline void | |
41372 | +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, | |
41373 | + int direction) | |
41374 | +{ | |
41375 | + BUG_ON(!valid_dma_direction(direction)); | |
41376 | + dma_ops->unmap_sg(hwdev, sg, nents, direction); | |
41377 | +} | |
41378 | + | |
41379 | +extern int dma_supported(struct device *hwdev, u64 mask); | |
41380 | + | |
41381 | +/* same for gart, swiotlb, and nommu */ | |
41382 | +static inline int dma_get_cache_alignment(void) | |
41383 | +{ | |
41384 | + return boot_cpu_data.x86_clflush_size; | |
41385 | +} | |
41386 | + | |
41387 | +#define dma_is_consistent(h) 1 | |
41388 | + | |
41389 | +extern int dma_set_mask(struct device *dev, u64 mask); | |
41390 | + | |
41391 | +static inline void | |
41392 | +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir) | |
41393 | +{ | |
41394 | + flush_write_buffers(); | |
41395 | +} | |
41396 | + | |
41397 | +extern struct device fallback_dev; | |
41398 | +extern int panic_on_overflow; | |
41399 | +#endif | |
41400 | + | |
41401 | +#endif /* _X8664_DMA_MAPPING_H */ | |
41402 | + | |
41403 | +#include <asm-i386/mach-xen/asm/dma-mapping.h> | |
41404 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h | |
41405 | =================================================================== | |
41406 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
41407 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200 | |
41408 | @@ -0,0 +1,112 @@ | |
41409 | +/* | |
41410 | + * fixmap.h: compile-time virtual memory allocation | |
41411 | + * | |
41412 | + * This file is subject to the terms and conditions of the GNU General Public | |
41413 | + * License. See the file "COPYING" in the main directory of this archive | |
41414 | + * for more details. | |
41415 | + * | |
41416 | + * Copyright (C) 1998 Ingo Molnar | |
41417 | + */ | |
41418 | + | |
41419 | +#ifndef _ASM_FIXMAP_H | |
41420 | +#define _ASM_FIXMAP_H | |
41421 | + | |
41422 | +#include <linux/kernel.h> | |
41423 | +#include <asm/apicdef.h> | |
41424 | +#include <asm/page.h> | |
41425 | +#include <asm/vsyscall.h> | |
41426 | +#include <asm/vsyscall32.h> | |
41427 | +#include <asm/acpi.h> | |
41428 | + | |
41429 | +/* | |
41430 | + * Here we define all the compile-time 'special' virtual | |
41431 | + * addresses. The point is to have a constant address at | |
41432 | + * compile time, but to set the physical address only | |
41433 | + * in the boot process. | |
41434 | + * | |
41435 | + * these 'compile-time allocated' memory buffers are | |
41436 | + * fixed-size 4k pages. (or larger if used with an increment | |
41437 | + * highger than 1) use fixmap_set(idx,phys) to associate | |
41438 | + * physical memory with fixmap indices. | |
41439 | + * | |
41440 | + * TLB entries of such buffers will not be flushed across | |
41441 | + * task switches. | |
41442 | + */ | |
41443 | + | |
41444 | +enum fixed_addresses { | |
41445 | + VSYSCALL_LAST_PAGE, | |
41446 | + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, | |
41447 | + VSYSCALL_HPET, | |
41448 | + FIX_HPET_BASE, | |
41449 | +#ifdef CONFIG_X86_LOCAL_APIC | |
41450 | + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ | |
41451 | +#endif | |
41452 | +#ifdef CONFIG_X86_IO_APIC | |
41453 | + FIX_IO_APIC_BASE_0, | |
41454 | + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, | |
41455 | +#endif | |
41456 | +#ifdef CONFIG_ACPI | |
41457 | + FIX_ACPI_BEGIN, | |
41458 | + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, | |
41459 | +#endif | |
41460 | + FIX_SHARED_INFO, | |
41461 | +#define NR_FIX_ISAMAPS 256 | |
41462 | + FIX_ISAMAP_END, | |
41463 | + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, | |
41464 | + __end_of_permanent_fixed_addresses, | |
41465 | + /* temporary boot-time mappings, used before ioremap() is functional */ | |
41466 | +#define NR_FIX_BTMAPS 16 | |
41467 | + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, | |
41468 | + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, | |
41469 | + __end_of_fixed_addresses | |
41470 | +}; | |
41471 | + | |
41472 | +extern void __set_fixmap (enum fixed_addresses idx, | |
41473 | + unsigned long phys, pgprot_t flags); | |
41474 | + | |
41475 | +#define set_fixmap(idx, phys) \ | |
41476 | + __set_fixmap(idx, phys, PAGE_KERNEL) | |
41477 | +/* | |
41478 | + * Some hardware wants to get fixmapped without caching. | |
41479 | + */ | |
41480 | +#define set_fixmap_nocache(idx, phys) \ | |
41481 | + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) | |
41482 | + | |
41483 | +#define clear_fixmap(idx) \ | |
41484 | + __set_fixmap(idx, 0, __pgprot(0)) | |
41485 | + | |
41486 | +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) | |
41487 | +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) | |
41488 | +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) | |
41489 | + | |
41490 | +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ | |
41491 | +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) | |
41492 | +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) | |
41493 | + | |
41494 | +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) | |
41495 | + | |
41496 | +extern void __this_fixmap_does_not_exist(void); | |
41497 | + | |
41498 | +/* | |
41499 | + * 'index to address' translation. If anyone tries to use the idx | |
41500 | + * directly without translation, we catch the bug with a NULL-deference | |
41501 | + * kernel oops. Illegal ranges of incoming indices are caught too. | |
41502 | + */ | |
41503 | +static __always_inline unsigned long fix_to_virt(const unsigned int idx) | |
41504 | +{ | |
41505 | + /* | |
41506 | + * this branch gets completely eliminated after inlining, | |
41507 | + * except when someone tries to use fixaddr indices in an | |
41508 | + * illegal way. (such as mixing up address types or using | |
41509 | + * out-of-range indices). | |
41510 | + * | |
41511 | + * If it doesn't get removed, the linker will complain | |
41512 | + * loudly with a reasonably clear error message.. | |
41513 | + */ | |
41514 | + if (idx >= __end_of_fixed_addresses) | |
41515 | + __this_fixmap_does_not_exist(); | |
41516 | + | |
41517 | + return __fix_to_virt(idx); | |
41518 | +} | |
41519 | + | |
41520 | +#endif | |
41521 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h | |
41522 | =================================================================== | |
41523 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
41524 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h 2008-11-25 12:22:34.000000000 +0100 | |
41525 | @@ -0,0 +1,408 @@ | |
41526 | +/****************************************************************************** | |
41527 | + * hypercall.h | |
41528 | + * | |
41529 | + * Linux-specific hypervisor handling. | |
41530 | + * | |
41531 | + * Copyright (c) 2002-2004, K A Fraser | |
41532 | + * | |
41533 | + * 64-bit updates: | |
41534 | + * Benjamin Liu <benjamin.liu@intel.com> | |
41535 | + * Jun Nakajima <jun.nakajima@intel.com> | |
41536 | + * | |
41537 | + * This program is free software; you can redistribute it and/or | |
41538 | + * modify it under the terms of the GNU General Public License version 2 | |
41539 | + * as published by the Free Software Foundation; or, when distributed | |
41540 | + * separately from the Linux kernel or incorporated into other | |
41541 | + * software packages, subject to the following license: | |
41542 | + * | |
41543 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
41544 | + * of this source file (the "Software"), to deal in the Software without | |
41545 | + * restriction, including without limitation the rights to use, copy, modify, | |
41546 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
41547 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
41548 | + * the following conditions: | |
41549 | + * | |
41550 | + * The above copyright notice and this permission notice shall be included in | |
41551 | + * all copies or substantial portions of the Software. | |
41552 | + * | |
41553 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
41554 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
41555 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
41556 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
41557 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
41558 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
41559 | + * IN THE SOFTWARE. | |
41560 | + */ | |
41561 | + | |
41562 | +#ifndef __HYPERCALL_H__ | |
41563 | +#define __HYPERCALL_H__ | |
41564 | + | |
41565 | +#include <linux/string.h> /* memcpy() */ | |
41566 | +#include <linux/stringify.h> | |
41567 | + | |
41568 | +#ifndef __HYPERVISOR_H__ | |
41569 | +# error "please don't include this file directly" | |
41570 | +#endif | |
41571 | + | |
41572 | +#ifdef CONFIG_XEN | |
41573 | +#define HYPERCALL_STR(name) \ | |
41574 | + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)" | |
41575 | +#else | |
41576 | +#define HYPERCALL_STR(name) \ | |
41577 | + "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\ | |
41578 | + "add hypercall_stubs(%%rip),%%rax; " \ | |
41579 | + "call *%%rax" | |
41580 | +#endif | |
41581 | + | |
41582 | +#define _hypercall0(type, name) \ | |
41583 | +({ \ | |
41584 | + type __res; \ | |
41585 | + asm volatile ( \ | |
41586 | + HYPERCALL_STR(name) \ | |
41587 | + : "=a" (__res) \ | |
41588 | + : \ | |
41589 | + : "memory" ); \ | |
41590 | + __res; \ | |
41591 | +}) | |
41592 | + | |
41593 | +#define _hypercall1(type, name, a1) \ | |
41594 | +({ \ | |
41595 | + type __res; \ | |
41596 | + long __ign1; \ | |
41597 | + asm volatile ( \ | |
41598 | + HYPERCALL_STR(name) \ | |
41599 | + : "=a" (__res), "=D" (__ign1) \ | |
41600 | + : "1" ((long)(a1)) \ | |
41601 | + : "memory" ); \ | |
41602 | + __res; \ | |
41603 | +}) | |
41604 | + | |
41605 | +#define _hypercall2(type, name, a1, a2) \ | |
41606 | +({ \ | |
41607 | + type __res; \ | |
41608 | + long __ign1, __ign2; \ | |
41609 | + asm volatile ( \ | |
41610 | + HYPERCALL_STR(name) \ | |
41611 | + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \ | |
41612 | + : "1" ((long)(a1)), "2" ((long)(a2)) \ | |
41613 | + : "memory" ); \ | |
41614 | + __res; \ | |
41615 | +}) | |
41616 | + | |
41617 | +#define _hypercall3(type, name, a1, a2, a3) \ | |
41618 | +({ \ | |
41619 | + type __res; \ | |
41620 | + long __ign1, __ign2, __ign3; \ | |
41621 | + asm volatile ( \ | |
41622 | + HYPERCALL_STR(name) \ | |
41623 | + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ | |
41624 | + "=d" (__ign3) \ | |
41625 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
41626 | + "3" ((long)(a3)) \ | |
41627 | + : "memory" ); \ | |
41628 | + __res; \ | |
41629 | +}) | |
41630 | + | |
41631 | +#define _hypercall4(type, name, a1, a2, a3, a4) \ | |
41632 | +({ \ | |
41633 | + type __res; \ | |
41634 | + long __ign1, __ign2, __ign3; \ | |
41635 | + register long __arg4 asm("r10") = (long)(a4); \ | |
41636 | + asm volatile ( \ | |
41637 | + HYPERCALL_STR(name) \ | |
41638 | + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ | |
41639 | + "=d" (__ign3), "+r" (__arg4) \ | |
41640 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
41641 | + "3" ((long)(a3)) \ | |
41642 | + : "memory" ); \ | |
41643 | + __res; \ | |
41644 | +}) | |
41645 | + | |
41646 | +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ | |
41647 | +({ \ | |
41648 | + type __res; \ | |
41649 | + long __ign1, __ign2, __ign3; \ | |
41650 | + register long __arg4 asm("r10") = (long)(a4); \ | |
41651 | + register long __arg5 asm("r8") = (long)(a5); \ | |
41652 | + asm volatile ( \ | |
41653 | + HYPERCALL_STR(name) \ | |
41654 | + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ | |
41655 | + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \ | |
41656 | + : "1" ((long)(a1)), "2" ((long)(a2)), \ | |
41657 | + "3" ((long)(a3)) \ | |
41658 | + : "memory" ); \ | |
41659 | + __res; \ | |
41660 | +}) | |
41661 | + | |
41662 | +static inline int __must_check | |
41663 | +HYPERVISOR_set_trap_table( | |
41664 | + const trap_info_t *table) | |
41665 | +{ | |
41666 | + return _hypercall1(int, set_trap_table, table); | |
41667 | +} | |
41668 | + | |
41669 | +static inline int __must_check | |
41670 | +HYPERVISOR_mmu_update( | |
41671 | + mmu_update_t *req, unsigned int count, unsigned int *success_count, | |
41672 | + domid_t domid) | |
41673 | +{ | |
41674 | + return _hypercall4(int, mmu_update, req, count, success_count, domid); | |
41675 | +} | |
41676 | + | |
41677 | +static inline int __must_check | |
41678 | +HYPERVISOR_mmuext_op( | |
41679 | + struct mmuext_op *op, unsigned int count, unsigned int *success_count, | |
41680 | + domid_t domid) | |
41681 | +{ | |
41682 | + return _hypercall4(int, mmuext_op, op, count, success_count, domid); | |
41683 | +} | |
41684 | + | |
41685 | +static inline int __must_check | |
41686 | +HYPERVISOR_set_gdt( | |
41687 | + unsigned long *frame_list, unsigned int entries) | |
41688 | +{ | |
41689 | + return _hypercall2(int, set_gdt, frame_list, entries); | |
41690 | +} | |
41691 | + | |
41692 | +static inline int __must_check | |
41693 | +HYPERVISOR_stack_switch( | |
41694 | + unsigned long ss, unsigned long esp) | |
41695 | +{ | |
41696 | + return _hypercall2(int, stack_switch, ss, esp); | |
41697 | +} | |
41698 | + | |
41699 | +static inline int __must_check | |
41700 | +HYPERVISOR_set_callbacks( | |
41701 | + unsigned long event_address, unsigned long failsafe_address, | |
41702 | + unsigned long syscall_address) | |
41703 | +{ | |
41704 | + return _hypercall3(int, set_callbacks, | |
41705 | + event_address, failsafe_address, syscall_address); | |
41706 | +} | |
41707 | + | |
41708 | +static inline int | |
41709 | +HYPERVISOR_fpu_taskswitch( | |
41710 | + int set) | |
41711 | +{ | |
41712 | + return _hypercall1(int, fpu_taskswitch, set); | |
41713 | +} | |
41714 | + | |
41715 | +static inline int __must_check | |
41716 | +HYPERVISOR_sched_op_compat( | |
41717 | + int cmd, unsigned long arg) | |
41718 | +{ | |
41719 | + return _hypercall2(int, sched_op_compat, cmd, arg); | |
41720 | +} | |
41721 | + | |
41722 | +static inline int __must_check | |
41723 | +HYPERVISOR_sched_op( | |
41724 | + int cmd, void *arg) | |
41725 | +{ | |
41726 | + return _hypercall2(int, sched_op, cmd, arg); | |
41727 | +} | |
41728 | + | |
41729 | +static inline long __must_check | |
41730 | +HYPERVISOR_set_timer_op( | |
41731 | + u64 timeout) | |
41732 | +{ | |
41733 | + return _hypercall1(long, set_timer_op, timeout); | |
41734 | +} | |
41735 | + | |
41736 | +static inline int __must_check | |
41737 | +HYPERVISOR_platform_op( | |
41738 | + struct xen_platform_op *platform_op) | |
41739 | +{ | |
41740 | + platform_op->interface_version = XENPF_INTERFACE_VERSION; | |
41741 | + return _hypercall1(int, platform_op, platform_op); | |
41742 | +} | |
41743 | + | |
41744 | +static inline int __must_check | |
41745 | +HYPERVISOR_set_debugreg( | |
41746 | + unsigned int reg, unsigned long value) | |
41747 | +{ | |
41748 | + return _hypercall2(int, set_debugreg, reg, value); | |
41749 | +} | |
41750 | + | |
41751 | +static inline unsigned long __must_check | |
41752 | +HYPERVISOR_get_debugreg( | |
41753 | + unsigned int reg) | |
41754 | +{ | |
41755 | + return _hypercall1(unsigned long, get_debugreg, reg); | |
41756 | +} | |
41757 | + | |
41758 | +static inline int __must_check | |
41759 | +HYPERVISOR_update_descriptor( | |
41760 | + unsigned long ma, unsigned long word) | |
41761 | +{ | |
41762 | + return _hypercall2(int, update_descriptor, ma, word); | |
41763 | +} | |
41764 | + | |
41765 | +static inline int __must_check | |
41766 | +HYPERVISOR_memory_op( | |
41767 | + unsigned int cmd, void *arg) | |
41768 | +{ | |
41769 | + return _hypercall2(int, memory_op, cmd, arg); | |
41770 | +} | |
41771 | + | |
41772 | +static inline int __must_check | |
41773 | +HYPERVISOR_multicall( | |
41774 | + multicall_entry_t *call_list, unsigned int nr_calls) | |
41775 | +{ | |
41776 | + return _hypercall2(int, multicall, call_list, nr_calls); | |
41777 | +} | |
41778 | + | |
41779 | +static inline int __must_check | |
41780 | +HYPERVISOR_update_va_mapping( | |
41781 | + unsigned long va, pte_t new_val, unsigned long flags) | |
41782 | +{ | |
41783 | + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); | |
41784 | +} | |
41785 | + | |
41786 | +static inline int __must_check | |
41787 | +HYPERVISOR_event_channel_op( | |
41788 | + int cmd, void *arg) | |
41789 | +{ | |
41790 | + int rc = _hypercall2(int, event_channel_op, cmd, arg); | |
41791 | + | |
41792 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
41793 | + if (unlikely(rc == -ENOSYS)) { | |
41794 | + struct evtchn_op op; | |
41795 | + op.cmd = cmd; | |
41796 | + memcpy(&op.u, arg, sizeof(op.u)); | |
41797 | + rc = _hypercall1(int, event_channel_op_compat, &op); | |
41798 | + memcpy(arg, &op.u, sizeof(op.u)); | |
41799 | + } | |
41800 | +#endif | |
41801 | + | |
41802 | + return rc; | |
41803 | +} | |
41804 | + | |
41805 | +static inline int __must_check | |
41806 | +HYPERVISOR_xen_version( | |
41807 | + int cmd, void *arg) | |
41808 | +{ | |
41809 | + return _hypercall2(int, xen_version, cmd, arg); | |
41810 | +} | |
41811 | + | |
41812 | +static inline int __must_check | |
41813 | +HYPERVISOR_console_io( | |
41814 | + int cmd, unsigned int count, char *str) | |
41815 | +{ | |
41816 | + return _hypercall3(int, console_io, cmd, count, str); | |
41817 | +} | |
41818 | + | |
41819 | +static inline int __must_check | |
41820 | +HYPERVISOR_physdev_op( | |
41821 | + int cmd, void *arg) | |
41822 | +{ | |
41823 | + int rc = _hypercall2(int, physdev_op, cmd, arg); | |
41824 | + | |
41825 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
41826 | + if (unlikely(rc == -ENOSYS)) { | |
41827 | + struct physdev_op op; | |
41828 | + op.cmd = cmd; | |
41829 | + memcpy(&op.u, arg, sizeof(op.u)); | |
41830 | + rc = _hypercall1(int, physdev_op_compat, &op); | |
41831 | + memcpy(arg, &op.u, sizeof(op.u)); | |
41832 | + } | |
41833 | +#endif | |
41834 | + | |
41835 | + return rc; | |
41836 | +} | |
41837 | + | |
41838 | +static inline int __must_check | |
41839 | +HYPERVISOR_grant_table_op( | |
41840 | + unsigned int cmd, void *uop, unsigned int count) | |
41841 | +{ | |
41842 | + return _hypercall3(int, grant_table_op, cmd, uop, count); | |
41843 | +} | |
41844 | + | |
41845 | +static inline int __must_check | |
41846 | +HYPERVISOR_update_va_mapping_otherdomain( | |
41847 | + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) | |
41848 | +{ | |
41849 | + return _hypercall4(int, update_va_mapping_otherdomain, va, | |
41850 | + new_val.pte, flags, domid); | |
41851 | +} | |
41852 | + | |
41853 | +static inline int __must_check | |
41854 | +HYPERVISOR_vm_assist( | |
41855 | + unsigned int cmd, unsigned int type) | |
41856 | +{ | |
41857 | + return _hypercall2(int, vm_assist, cmd, type); | |
41858 | +} | |
41859 | + | |
41860 | +static inline int __must_check | |
41861 | +HYPERVISOR_vcpu_op( | |
41862 | + int cmd, unsigned int vcpuid, void *extra_args) | |
41863 | +{ | |
41864 | + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); | |
41865 | +} | |
41866 | + | |
41867 | +static inline int __must_check | |
41868 | +HYPERVISOR_set_segment_base( | |
41869 | + int reg, unsigned long value) | |
41870 | +{ | |
41871 | + return _hypercall2(int, set_segment_base, reg, value); | |
41872 | +} | |
41873 | + | |
41874 | +static inline int __must_check | |
41875 | +HYPERVISOR_suspend( | |
41876 | + unsigned long srec) | |
41877 | +{ | |
41878 | + struct sched_shutdown sched_shutdown = { | |
41879 | + .reason = SHUTDOWN_suspend | |
41880 | + }; | |
41881 | + | |
41882 | + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, | |
41883 | + &sched_shutdown, srec); | |
41884 | + | |
41885 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
41886 | + if (rc == -ENOSYS) | |
41887 | + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, | |
41888 | + SHUTDOWN_suspend, srec); | |
41889 | +#endif | |
41890 | + | |
41891 | + return rc; | |
41892 | +} | |
41893 | + | |
41894 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
41895 | +static inline int | |
41896 | +HYPERVISOR_nmi_op( | |
41897 | + unsigned long op, void *arg) | |
41898 | +{ | |
41899 | + return _hypercall2(int, nmi_op, op, arg); | |
41900 | +} | |
41901 | +#endif | |
41902 | + | |
41903 | +#ifndef CONFIG_XEN | |
41904 | +static inline unsigned long __must_check | |
41905 | +HYPERVISOR_hvm_op( | |
41906 | + int op, void *arg) | |
41907 | +{ | |
41908 | + return _hypercall2(unsigned long, hvm_op, op, arg); | |
41909 | +} | |
41910 | +#endif | |
41911 | + | |
41912 | +static inline int __must_check | |
41913 | +HYPERVISOR_callback_op( | |
41914 | + int cmd, const void *arg) | |
41915 | +{ | |
41916 | + return _hypercall2(int, callback_op, cmd, arg); | |
41917 | +} | |
41918 | + | |
41919 | +static inline int __must_check | |
41920 | +HYPERVISOR_xenoprof_op( | |
41921 | + int op, void *arg) | |
41922 | +{ | |
41923 | + return _hypercall2(int, xenoprof_op, op, arg); | |
41924 | +} | |
41925 | + | |
41926 | +static inline int __must_check | |
41927 | +HYPERVISOR_kexec_op( | |
41928 | + unsigned long op, void *args) | |
41929 | +{ | |
41930 | + return _hypercall2(int, kexec_op, op, args); | |
41931 | +} | |
41932 | + | |
41933 | +#endif /* __HYPERCALL_H__ */ | |
41934 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h | |
41935 | =================================================================== | |
41936 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
41937 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h 2007-08-16 18:07:01.000000000 +0200 | |
41938 | @@ -0,0 +1,329 @@ | |
41939 | +#ifndef _ASM_IO_H | |
41940 | +#define _ASM_IO_H | |
41941 | + | |
41942 | +#include <asm/fixmap.h> | |
41943 | + | |
41944 | +/* | |
41945 | + * This file contains the definitions for the x86 IO instructions | |
41946 | + * inb/inw/inl/outb/outw/outl and the "string versions" of the same | |
41947 | + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" | |
41948 | + * versions of the single-IO instructions (inb_p/inw_p/..). | |
41949 | + * | |
41950 | + * This file is not meant to be obfuscating: it's just complicated | |
41951 | + * to (a) handle it all in a way that makes gcc able to optimize it | |
41952 | + * as well as possible and (b) trying to avoid writing the same thing | |
41953 | + * over and over again with slight variations and possibly making a | |
41954 | + * mistake somewhere. | |
41955 | + */ | |
41956 | + | |
41957 | +/* | |
41958 | + * Thanks to James van Artsdalen for a better timing-fix than | |
41959 | + * the two short jumps: using outb's to a nonexistent port seems | |
41960 | + * to guarantee better timings even on fast machines. | |
41961 | + * | |
41962 | + * On the other hand, I'd like to be sure of a non-existent port: | |
41963 | + * I feel a bit unsafe about using 0x80 (should be safe, though) | |
41964 | + * | |
41965 | + * Linus | |
41966 | + */ | |
41967 | + | |
41968 | + /* | |
41969 | + * Bit simplified and optimized by Jan Hubicka | |
41970 | + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. | |
41971 | + * | |
41972 | + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, | |
41973 | + * isa_read[wl] and isa_write[wl] fixed | |
41974 | + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> | |
41975 | + */ | |
41976 | + | |
41977 | +#define __SLOW_DOWN_IO "\noutb %%al,$0x80" | |
41978 | + | |
41979 | +#ifdef REALLY_SLOW_IO | |
41980 | +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO | |
41981 | +#else | |
41982 | +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO | |
41983 | +#endif | |
41984 | + | |
41985 | +/* | |
41986 | + * Talk about misusing macros.. | |
41987 | + */ | |
41988 | +#define __OUT1(s,x) \ | |
41989 | +static inline void out##s(unsigned x value, unsigned short port) { | |
41990 | + | |
41991 | +#define __OUT2(s,s1,s2) \ | |
41992 | +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" | |
41993 | + | |
41994 | +#define __OUT(s,s1,x) \ | |
41995 | +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ | |
41996 | +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ | |
41997 | + | |
41998 | +#define __IN1(s) \ | |
41999 | +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; | |
42000 | + | |
42001 | +#define __IN2(s,s1,s2) \ | |
42002 | +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" | |
42003 | + | |
42004 | +#define __IN(s,s1,i...) \ | |
42005 | +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ | |
42006 | +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ | |
42007 | + | |
42008 | +#define __INS(s) \ | |
42009 | +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \ | |
42010 | +{ __asm__ __volatile__ ("rep ; ins" #s \ | |
42011 | +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } | |
42012 | + | |
42013 | +#define __OUTS(s) \ | |
42014 | +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \ | |
42015 | +{ __asm__ __volatile__ ("rep ; outs" #s \ | |
42016 | +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } | |
42017 | + | |
42018 | +#define RETURN_TYPE unsigned char | |
42019 | +__IN(b,"") | |
42020 | +#undef RETURN_TYPE | |
42021 | +#define RETURN_TYPE unsigned short | |
42022 | +__IN(w,"") | |
42023 | +#undef RETURN_TYPE | |
42024 | +#define RETURN_TYPE unsigned int | |
42025 | +__IN(l,"") | |
42026 | +#undef RETURN_TYPE | |
42027 | + | |
42028 | +__OUT(b,"b",char) | |
42029 | +__OUT(w,"w",short) | |
42030 | +__OUT(l,,int) | |
42031 | + | |
42032 | +__INS(b) | |
42033 | +__INS(w) | |
42034 | +__INS(l) | |
42035 | + | |
42036 | +__OUTS(b) | |
42037 | +__OUTS(w) | |
42038 | +__OUTS(l) | |
42039 | + | |
42040 | +#define IO_SPACE_LIMIT 0xffff | |
42041 | + | |
42042 | +#if defined(__KERNEL__) && __x86_64__ | |
42043 | + | |
42044 | +#include <linux/vmalloc.h> | |
42045 | + | |
42046 | +#ifndef __i386__ | |
42047 | +/* | |
42048 | + * Change virtual addresses to physical addresses and vv. | |
42049 | + * These are pretty trivial | |
42050 | + */ | |
42051 | +static inline unsigned long virt_to_phys(volatile void * address) | |
42052 | +{ | |
42053 | + return __pa(address); | |
42054 | +} | |
42055 | + | |
42056 | +static inline void * phys_to_virt(unsigned long address) | |
42057 | +{ | |
42058 | + return __va(address); | |
42059 | +} | |
42060 | + | |
42061 | +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) | |
42062 | +#define bus_to_virt(_x) __va(machine_to_phys(_x)) | |
42063 | +#endif | |
42064 | + | |
42065 | +/* | |
42066 | + * Change "struct page" to physical address. | |
42067 | + */ | |
42068 | +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) | |
42069 | +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) | |
42070 | +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) | |
42071 | + | |
42072 | +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \ | |
42073 | + (unsigned long) bio_offset((bio))) | |
42074 | +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ | |
42075 | + (unsigned long) (bv)->bv_offset) | |
42076 | + | |
42077 | +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ | |
42078 | + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \ | |
42079 | + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ | |
42080 | + bvec_to_pseudophys((vec2)))) | |
42081 | + | |
42082 | +#include <asm-generic/iomap.h> | |
42083 | + | |
42084 | +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); | |
42085 | + | |
42086 | +static inline void __iomem * ioremap (unsigned long offset, unsigned long size) | |
42087 | +{ | |
42088 | + return __ioremap(offset, size, 0); | |
42089 | +} | |
42090 | + | |
42091 | +extern void *bt_ioremap(unsigned long addr, unsigned long size); | |
42092 | +extern void bt_iounmap(void *addr, unsigned long size); | |
42093 | +#define early_ioremap bt_ioremap | |
42094 | +#define early_iounmap bt_iounmap | |
42095 | + | |
42096 | +/* | |
42097 | + * This one maps high address device memory and turns off caching for that area. | |
42098 | + * it's useful if some control registers are in such an area and write combining | |
42099 | + * or read caching is not desirable: | |
42100 | + */ | |
42101 | +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); | |
42102 | +extern void iounmap(volatile void __iomem *addr); | |
42103 | + | |
42104 | +/* | |
42105 | + * ISA I/O bus memory addresses are 1:1 with the physical address. | |
42106 | + */ | |
42107 | + | |
42108 | +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); }) | |
42109 | +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x | |
42110 | +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) | |
42111 | + | |
42112 | +/* | |
42113 | + * However PCI ones are not necessarily 1:1 and therefore these interfaces | |
42114 | + * are forbidden in portable PCI drivers. | |
42115 | + * | |
42116 | + * Allow them on x86 for legacy drivers, though. | |
42117 | + */ | |
42118 | +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) | |
42119 | +#define bus_to_virt(_x) __va(machine_to_phys(_x)) | |
42120 | + | |
42121 | +/* | |
42122 | + * readX/writeX() are used to access memory mapped devices. On some | |
42123 | + * architectures the memory mapped IO stuff needs to be accessed | |
42124 | + * differently. On the x86 architecture, we just read/write the | |
42125 | + * memory location directly. | |
42126 | + */ | |
42127 | + | |
42128 | +static inline __u8 __readb(const volatile void __iomem *addr) | |
42129 | +{ | |
42130 | + return *(__force volatile __u8 *)addr; | |
42131 | +} | |
42132 | +static inline __u16 __readw(const volatile void __iomem *addr) | |
42133 | +{ | |
42134 | + return *(__force volatile __u16 *)addr; | |
42135 | +} | |
42136 | +static __always_inline __u32 __readl(const volatile void __iomem *addr) | |
42137 | +{ | |
42138 | + return *(__force volatile __u32 *)addr; | |
42139 | +} | |
42140 | +static inline __u64 __readq(const volatile void __iomem *addr) | |
42141 | +{ | |
42142 | + return *(__force volatile __u64 *)addr; | |
42143 | +} | |
42144 | +#define readb(x) __readb(x) | |
42145 | +#define readw(x) __readw(x) | |
42146 | +#define readl(x) __readl(x) | |
42147 | +#define readq(x) __readq(x) | |
42148 | +#define readb_relaxed(a) readb(a) | |
42149 | +#define readw_relaxed(a) readw(a) | |
42150 | +#define readl_relaxed(a) readl(a) | |
42151 | +#define readq_relaxed(a) readq(a) | |
42152 | +#define __raw_readb readb | |
42153 | +#define __raw_readw readw | |
42154 | +#define __raw_readl readl | |
42155 | +#define __raw_readq readq | |
42156 | + | |
42157 | +#define mmiowb() | |
42158 | + | |
42159 | +static inline void __writel(__u32 b, volatile void __iomem *addr) | |
42160 | +{ | |
42161 | + *(__force volatile __u32 *)addr = b; | |
42162 | +} | |
42163 | +static inline void __writeq(__u64 b, volatile void __iomem *addr) | |
42164 | +{ | |
42165 | + *(__force volatile __u64 *)addr = b; | |
42166 | +} | |
42167 | +static inline void __writeb(__u8 b, volatile void __iomem *addr) | |
42168 | +{ | |
42169 | + *(__force volatile __u8 *)addr = b; | |
42170 | +} | |
42171 | +static inline void __writew(__u16 b, volatile void __iomem *addr) | |
42172 | +{ | |
42173 | + *(__force volatile __u16 *)addr = b; | |
42174 | +} | |
42175 | +#define writeq(val,addr) __writeq((val),(addr)) | |
42176 | +#define writel(val,addr) __writel((val),(addr)) | |
42177 | +#define writew(val,addr) __writew((val),(addr)) | |
42178 | +#define writeb(val,addr) __writeb((val),(addr)) | |
42179 | +#define __raw_writeb writeb | |
42180 | +#define __raw_writew writew | |
42181 | +#define __raw_writel writel | |
42182 | +#define __raw_writeq writeq | |
42183 | + | |
42184 | +void __memcpy_fromio(void*,unsigned long,unsigned); | |
42185 | +void __memcpy_toio(unsigned long,const void*,unsigned); | |
42186 | + | |
42187 | +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len) | |
42188 | +{ | |
42189 | + __memcpy_fromio(to,(unsigned long)from,len); | |
42190 | +} | |
42191 | +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len) | |
42192 | +{ | |
42193 | + __memcpy_toio((unsigned long)to,from,len); | |
42194 | +} | |
42195 | + | |
42196 | +void memset_io(volatile void __iomem *a, int b, size_t c); | |
42197 | + | |
42198 | +/* | |
42199 | + * ISA space is 'always mapped' on a typical x86 system, no need to | |
42200 | + * explicitly ioremap() it. The fact that the ISA IO space is mapped | |
42201 | + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values | |
42202 | + * are physical addresses. The following constant pointer can be | |
42203 | + * used as the IO-area pointer (it can be iounmapped as well, so the | |
42204 | + * analogy with PCI is quite large): | |
42205 | + */ | |
42206 | +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN))) | |
42207 | + | |
42208 | +/* | |
42209 | + * Again, x86-64 does not require mem IO specific function. | |
42210 | + */ | |
42211 | + | |
42212 | +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d)) | |
42213 | + | |
42214 | +/** | |
42215 | + * check_signature - find BIOS signatures | |
42216 | + * @io_addr: mmio address to check | |
42217 | + * @signature: signature block | |
42218 | + * @length: length of signature | |
42219 | + * | |
42220 | + * Perform a signature comparison with the mmio address io_addr. This | |
42221 | + * address should have been obtained by ioremap. | |
42222 | + * Returns 1 on a match. | |
42223 | + */ | |
42224 | + | |
42225 | +static inline int check_signature(void __iomem *io_addr, | |
42226 | + const unsigned char *signature, int length) | |
42227 | +{ | |
42228 | + int retval = 0; | |
42229 | + do { | |
42230 | + if (readb(io_addr) != *signature) | |
42231 | + goto out; | |
42232 | + io_addr++; | |
42233 | + signature++; | |
42234 | + length--; | |
42235 | + } while (length); | |
42236 | + retval = 1; | |
42237 | +out: | |
42238 | + return retval; | |
42239 | +} | |
42240 | + | |
42241 | +/* Nothing to do */ | |
42242 | + | |
42243 | +#define dma_cache_inv(_start,_size) do { } while (0) | |
42244 | +#define dma_cache_wback(_start,_size) do { } while (0) | |
42245 | +#define dma_cache_wback_inv(_start,_size) do { } while (0) | |
42246 | + | |
42247 | +#define flush_write_buffers() | |
42248 | + | |
42249 | +extern int iommu_bio_merge; | |
42250 | +#define BIO_VMERGE_BOUNDARY iommu_bio_merge | |
42251 | + | |
42252 | +/* | |
42253 | + * Convert a physical pointer to a virtual kernel pointer for /dev/mem | |
42254 | + * access | |
42255 | + */ | |
42256 | +#define xlate_dev_mem_ptr(p) __va(p) | |
42257 | + | |
42258 | +/* | |
42259 | + * Convert a virtual cached pointer to an uncached pointer | |
42260 | + */ | |
42261 | +#define xlate_dev_kmem_ptr(p) p | |
42262 | + | |
42263 | +#endif /* __KERNEL__ */ | |
42264 | + | |
42265 | +#define ARCH_HAS_DEV_MEM | |
42266 | + | |
42267 | +#endif | |
42268 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h | |
42269 | =================================================================== | |
42270 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
42271 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200 | |
42272 | @@ -0,0 +1,139 @@ | |
42273 | +/* | |
42274 | + * include/asm-x86_64/irqflags.h | |
42275 | + * | |
42276 | + * IRQ flags handling | |
42277 | + * | |
42278 | + * This file gets included from lowlevel asm headers too, to provide | |
42279 | + * wrapped versions of the local_irq_*() APIs, based on the | |
42280 | + * raw_local_irq_*() functions from the lowlevel headers. | |
42281 | + */ | |
42282 | +#ifndef _ASM_IRQFLAGS_H | |
42283 | +#define _ASM_IRQFLAGS_H | |
42284 | + | |
42285 | +#ifndef __ASSEMBLY__ | |
42286 | +/* | |
42287 | + * Interrupt control: | |
42288 | + */ | |
42289 | + | |
42290 | +/* | |
42291 | + * The use of 'barrier' in the following reflects their use as local-lock | |
42292 | + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following | |
42293 | + * critical operations are executed. All critical operations must complete | |
42294 | + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also | |
42295 | + * includes these barriers, for example. | |
42296 | + */ | |
42297 | + | |
42298 | +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) | |
42299 | + | |
42300 | +#define raw_local_save_flags(flags) \ | |
42301 | + do { (flags) = __raw_local_save_flags(); } while (0) | |
42302 | + | |
42303 | +#define raw_local_irq_restore(x) \ | |
42304 | +do { \ | |
42305 | + vcpu_info_t *_vcpu; \ | |
42306 | + barrier(); \ | |
42307 | + _vcpu = current_vcpu_info(); \ | |
42308 | + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ | |
42309 | + barrier(); /* unmask then check (avoid races) */ \ | |
42310 | + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ | |
42311 | + force_evtchn_callback(); \ | |
42312 | + } \ | |
42313 | +} while (0) | |
42314 | + | |
42315 | +#ifdef CONFIG_X86_VSMP | |
42316 | + | |
42317 | +/* | |
42318 | + * Interrupt control for the VSMP architecture: | |
42319 | + */ | |
42320 | + | |
42321 | +static inline void raw_local_irq_disable(void) | |
42322 | +{ | |
42323 | + unsigned long flags = __raw_local_save_flags(); | |
42324 | + | |
42325 | + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); | |
42326 | +} | |
42327 | + | |
42328 | +static inline void raw_local_irq_enable(void) | |
42329 | +{ | |
42330 | + unsigned long flags = __raw_local_save_flags(); | |
42331 | + | |
42332 | + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); | |
42333 | +} | |
42334 | + | |
42335 | +static inline int raw_irqs_disabled_flags(unsigned long flags) | |
42336 | +{ | |
42337 | + return !(flags & (1<<9)) || (flags & (1 << 18)); | |
42338 | +} | |
42339 | + | |
42340 | +#else /* CONFIG_X86_VSMP */ | |
42341 | + | |
42342 | +#define raw_local_irq_disable() \ | |
42343 | +do { \ | |
42344 | + current_vcpu_info()->evtchn_upcall_mask = 1; \ | |
42345 | + barrier(); \ | |
42346 | +} while (0) | |
42347 | + | |
42348 | +#define raw_local_irq_enable() \ | |
42349 | +do { \ | |
42350 | + vcpu_info_t *_vcpu; \ | |
42351 | + barrier(); \ | |
42352 | + _vcpu = current_vcpu_info(); \ | |
42353 | + _vcpu->evtchn_upcall_mask = 0; \ | |
42354 | + barrier(); /* unmask then check (avoid races) */ \ | |
42355 | + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ | |
42356 | + force_evtchn_callback(); \ | |
42357 | +} while (0) | |
42358 | + | |
42359 | +static inline int raw_irqs_disabled_flags(unsigned long flags) | |
42360 | +{ | |
42361 | + return (flags != 0); | |
42362 | +} | |
42363 | + | |
42364 | +#endif | |
42365 | + | |
42366 | +/* | |
42367 | + * For spinlocks, etc.: | |
42368 | + */ | |
42369 | + | |
42370 | +#define __raw_local_irq_save() \ | |
42371 | +({ \ | |
42372 | + unsigned long flags = __raw_local_save_flags(); \ | |
42373 | + \ | |
42374 | + raw_local_irq_disable(); \ | |
42375 | + \ | |
42376 | + flags; \ | |
42377 | +}) | |
42378 | + | |
42379 | +#define raw_local_irq_save(flags) \ | |
42380 | + do { (flags) = __raw_local_irq_save(); } while (0) | |
42381 | + | |
42382 | +#define raw_irqs_disabled() \ | |
42383 | +({ \ | |
42384 | + unsigned long flags = __raw_local_save_flags(); \ | |
42385 | + \ | |
42386 | + raw_irqs_disabled_flags(flags); \ | |
42387 | +}) | |
42388 | + | |
42389 | +/* | |
42390 | + * Used in the idle loop; sti takes one instruction cycle | |
42391 | + * to complete: | |
42392 | + */ | |
42393 | +void raw_safe_halt(void); | |
42394 | + | |
42395 | +/* | |
42396 | + * Used when interrupts are already enabled or to | |
42397 | + * shutdown the processor: | |
42398 | + */ | |
42399 | +void halt(void); | |
42400 | + | |
42401 | +#else /* __ASSEMBLY__: */ | |
42402 | +# ifdef CONFIG_TRACE_IRQFLAGS | |
42403 | +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk | |
42404 | +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk | |
42405 | +# else | |
42406 | +# define TRACE_IRQS_ON | |
42407 | +# define TRACE_IRQS_OFF | |
42408 | +# endif | |
42409 | +#endif | |
42410 | + | |
42411 | +#endif | |
42412 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h | |
42413 | =================================================================== | |
42414 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
42415 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200 | |
42416 | @@ -0,0 +1,161 @@ | |
42417 | +#ifndef _X86_64_MADDR_H | |
42418 | +#define _X86_64_MADDR_H | |
42419 | + | |
42420 | +#include <xen/features.h> | |
42421 | +#include <xen/interface/xen.h> | |
42422 | + | |
42423 | +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ | |
42424 | +#define INVALID_P2M_ENTRY (~0UL) | |
42425 | +#define FOREIGN_FRAME_BIT (1UL<<63) | |
42426 | +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) | |
42427 | + | |
42428 | +/* Definitions for machine and pseudophysical addresses. */ | |
42429 | +typedef unsigned long paddr_t; | |
42430 | +typedef unsigned long maddr_t; | |
42431 | + | |
42432 | +#ifdef CONFIG_XEN | |
42433 | + | |
42434 | +extern unsigned long *phys_to_machine_mapping; | |
42435 | + | |
42436 | +#undef machine_to_phys_mapping | |
42437 | +extern unsigned long *machine_to_phys_mapping; | |
42438 | +extern unsigned int machine_to_phys_order; | |
42439 | + | |
42440 | +static inline unsigned long pfn_to_mfn(unsigned long pfn) | |
42441 | +{ | |
42442 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
42443 | + return pfn; | |
42444 | + BUG_ON(end_pfn && pfn >= end_pfn); | |
42445 | + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; | |
42446 | +} | |
42447 | + | |
42448 | +static inline int phys_to_machine_mapping_valid(unsigned long pfn) | |
42449 | +{ | |
42450 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
42451 | + return 1; | |
42452 | + BUG_ON(end_pfn && pfn >= end_pfn); | |
42453 | + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); | |
42454 | +} | |
42455 | + | |
42456 | +static inline unsigned long mfn_to_pfn(unsigned long mfn) | |
42457 | +{ | |
42458 | + unsigned long pfn; | |
42459 | + | |
42460 | + if (xen_feature(XENFEAT_auto_translated_physmap)) | |
42461 | + return mfn; | |
42462 | + | |
42463 | + if (unlikely((mfn >> machine_to_phys_order) != 0)) | |
42464 | + return end_pfn; | |
42465 | + | |
42466 | + /* The array access can fail (e.g., device space beyond end of RAM). */ | |
42467 | + asm ( | |
42468 | + "1: movq %1,%0\n" | |
42469 | + "2:\n" | |
42470 | + ".section .fixup,\"ax\"\n" | |
42471 | + "3: movq %2,%0\n" | |
42472 | + " jmp 2b\n" | |
42473 | + ".previous\n" | |
42474 | + ".section __ex_table,\"a\"\n" | |
42475 | + " .align 8\n" | |
42476 | + " .quad 1b,3b\n" | |
42477 | + ".previous" | |
42478 | + : "=r" (pfn) | |
42479 | + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) ); | |
42480 | + | |
42481 | + return pfn; | |
42482 | +} | |
42483 | + | |
42484 | +/* | |
42485 | + * We detect special mappings in one of two ways: | |
42486 | + * 1. If the MFN is an I/O page then Xen will set the m2p entry | |
42487 | + * to be outside our maximum possible pseudophys range. | |
42488 | + * 2. If the MFN belongs to a different domain then we will certainly | |
42489 | + * not have MFN in our p2m table. Conversely, if the page is ours, | |
42490 | + * then we'll have p2m(m2p(MFN))==MFN. | |
42491 | + * If we detect a special mapping then it doesn't have a 'struct page'. | |
42492 | + * We force !pfn_valid() by returning an out-of-range pointer. | |
42493 | + * | |
42494 | + * NB. These checks require that, for any MFN that is not in our reservation, | |
42495 | + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if | |
42496 | + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. | |
42497 | + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. | |
42498 | + * | |
42499 | + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* | |
42500 | + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we | |
42501 | + * require. In all the cases we care about, the FOREIGN_FRAME bit is | |
42502 | + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. | |
42503 | + */ | |
42504 | +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | |
42505 | +{ | |
42506 | + unsigned long pfn = mfn_to_pfn(mfn); | |
42507 | + if ((pfn < end_pfn) | |
42508 | + && !xen_feature(XENFEAT_auto_translated_physmap) | |
42509 | + && (phys_to_machine_mapping[pfn] != mfn)) | |
42510 | + return end_pfn; /* force !pfn_valid() */ | |
42511 | + return pfn; | |
42512 | +} | |
42513 | + | |
42514 | +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | |
42515 | +{ | |
42516 | + BUG_ON(end_pfn && pfn >= end_pfn); | |
42517 | + if (xen_feature(XENFEAT_auto_translated_physmap)) { | |
42518 | + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | |
42519 | + return; | |
42520 | + } | |
42521 | + phys_to_machine_mapping[pfn] = mfn; | |
42522 | +} | |
42523 | + | |
42524 | +static inline maddr_t phys_to_machine(paddr_t phys) | |
42525 | +{ | |
42526 | + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); | |
42527 | + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); | |
42528 | + return machine; | |
42529 | +} | |
42530 | + | |
42531 | +static inline paddr_t machine_to_phys(maddr_t machine) | |
42532 | +{ | |
42533 | + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); | |
42534 | + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); | |
42535 | + return phys; | |
42536 | +} | |
42537 | + | |
42538 | +static inline paddr_t pte_phys_to_machine(paddr_t phys) | |
42539 | +{ | |
42540 | + maddr_t machine; | |
42541 | + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); | |
42542 | + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); | |
42543 | + return machine; | |
42544 | +} | |
42545 | + | |
42546 | +static inline paddr_t pte_machine_to_phys(maddr_t machine) | |
42547 | +{ | |
42548 | + paddr_t phys; | |
42549 | + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); | |
42550 | + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); | |
42551 | + return phys; | |
42552 | +} | |
42553 | + | |
42554 | +#define __pte_ma(x) ((pte_t) { (x) } ) | |
42555 | +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask) | |
42556 | + | |
42557 | +#else /* !CONFIG_XEN */ | |
42558 | + | |
42559 | +#define pfn_to_mfn(pfn) (pfn) | |
42560 | +#define mfn_to_pfn(mfn) (mfn) | |
42561 | +#define mfn_to_local_pfn(mfn) (mfn) | |
42562 | +#define set_phys_to_machine(pfn, mfn) ((void)0) | |
42563 | +#define phys_to_machine_mapping_valid(pfn) (1) | |
42564 | +#define phys_to_machine(phys) ((maddr_t)(phys)) | |
42565 | +#define machine_to_phys(mach) ((paddr_t)(mach)) | |
42566 | +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) | |
42567 | +#define __pte_ma(x) __pte(x) | |
42568 | + | |
42569 | +#endif /* !CONFIG_XEN */ | |
42570 | + | |
42571 | +/* VIRT <-> MACHINE conversion */ | |
42572 | +#define virt_to_machine(v) (phys_to_machine(__pa(v))) | |
42573 | +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) | |
42574 | +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) | |
42575 | + | |
42576 | +#endif /* _X86_64_MADDR_H */ | |
42577 | + | |
42578 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h | |
42579 | =================================================================== | |
42580 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
42581 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200 | |
42582 | @@ -0,0 +1,136 @@ | |
42583 | +#ifndef __X86_64_MMU_CONTEXT_H | |
42584 | +#define __X86_64_MMU_CONTEXT_H | |
42585 | + | |
42586 | +#include <asm/desc.h> | |
42587 | +#include <asm/atomic.h> | |
42588 | +#include <asm/pgalloc.h> | |
42589 | +#include <asm/page.h> | |
42590 | +#include <asm/pda.h> | |
42591 | +#include <asm/pgtable.h> | |
42592 | +#include <asm/tlbflush.h> | |
42593 | + | |
42594 | +/* | |
42595 | + * possibly do the LDT unload here? | |
42596 | + */ | |
42597 | +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); | |
42598 | +void destroy_context(struct mm_struct *mm); | |
42599 | + | |
42600 | +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) | |
42601 | +{ | |
42602 | +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) | |
42603 | + if (read_pda(mmu_state) == TLBSTATE_OK) | |
42604 | + write_pda(mmu_state, TLBSTATE_LAZY); | |
42605 | +#endif | |
42606 | +} | |
42607 | + | |
42608 | +#define prepare_arch_switch(next) __prepare_arch_switch() | |
42609 | + | |
42610 | +static inline void __prepare_arch_switch(void) | |
42611 | +{ | |
42612 | + /* | |
42613 | + * Save away %es, %ds, %fs and %gs. Must happen before reload | |
42614 | + * of cr3/ldt (i.e., not in __switch_to). | |
42615 | + */ | |
42616 | + __asm__ __volatile__ ( | |
42617 | + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" | |
42618 | + : "=m" (current->thread.es), | |
42619 | + "=m" (current->thread.ds), | |
42620 | + "=m" (current->thread.fsindex), | |
42621 | + "=m" (current->thread.gsindex) ); | |
42622 | + | |
42623 | + if (current->thread.ds) | |
42624 | + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); | |
42625 | + | |
42626 | + if (current->thread.es) | |
42627 | + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); | |
42628 | + | |
42629 | + if (current->thread.fsindex) { | |
42630 | + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); | |
42631 | + current->thread.fs = 0; | |
42632 | + } | |
42633 | + | |
42634 | + if (current->thread.gsindex) { | |
42635 | + load_gs_index(0); | |
42636 | + current->thread.gs = 0; | |
42637 | + } | |
42638 | +} | |
42639 | + | |
42640 | +extern void mm_pin(struct mm_struct *mm); | |
42641 | +extern void mm_unpin(struct mm_struct *mm); | |
42642 | +void mm_pin_all(void); | |
42643 | + | |
42644 | +static inline void load_cr3(pgd_t *pgd) | |
42645 | +{ | |
42646 | + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) : | |
42647 | + "memory"); | |
42648 | +} | |
42649 | + | |
42650 | +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |
42651 | + struct task_struct *tsk) | |
42652 | +{ | |
42653 | + unsigned cpu = smp_processor_id(); | |
42654 | + struct mmuext_op _op[3], *op = _op; | |
42655 | + | |
42656 | + if (likely(prev != next)) { | |
42657 | + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && | |
42658 | + !next->context.pinned); | |
42659 | + | |
42660 | + /* stop flush ipis for the previous mm */ | |
42661 | + cpu_clear(cpu, prev->cpu_vm_mask); | |
42662 | +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) | |
42663 | + write_pda(mmu_state, TLBSTATE_OK); | |
42664 | + write_pda(active_mm, next); | |
42665 | +#endif | |
42666 | + cpu_set(cpu, next->cpu_vm_mask); | |
42667 | + | |
42668 | + /* load_cr3(next->pgd) */ | |
42669 | + op->cmd = MMUEXT_NEW_BASEPTR; | |
42670 | + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); | |
42671 | + op++; | |
42672 | + | |
42673 | + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ | |
42674 | + op->cmd = MMUEXT_NEW_USER_BASEPTR; | |
42675 | + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT); | |
42676 | + op++; | |
42677 | + | |
42678 | + if (unlikely(next->context.ldt != prev->context.ldt)) { | |
42679 | + /* load_LDT_nolock(&next->context, cpu) */ | |
42680 | + op->cmd = MMUEXT_SET_LDT; | |
42681 | + op->arg1.linear_addr = (unsigned long)next->context.ldt; | |
42682 | + op->arg2.nr_ents = next->context.size; | |
42683 | + op++; | |
42684 | + } | |
42685 | + | |
42686 | + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); | |
42687 | + } | |
42688 | +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) | |
42689 | + else { | |
42690 | + write_pda(mmu_state, TLBSTATE_OK); | |
42691 | + if (read_pda(active_mm) != next) | |
42692 | + out_of_line_bug(); | |
42693 | + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { | |
42694 | + /* We were in lazy tlb mode and leave_mm disabled | |
42695 | + * tlb flush IPI delivery. We must reload CR3 | |
42696 | + * to make sure to use no freed page tables. | |
42697 | + */ | |
42698 | + load_cr3(next->pgd); | |
42699 | + xen_new_user_pt(__pa(__user_pgd(next->pgd))); | |
42700 | + load_LDT_nolock(&next->context, cpu); | |
42701 | + } | |
42702 | + } | |
42703 | +#endif | |
42704 | +} | |
42705 | + | |
42706 | +#define deactivate_mm(tsk,mm) do { \ | |
42707 | + load_gs_index(0); \ | |
42708 | + asm volatile("movl %0,%%fs"::"r"(0)); \ | |
42709 | +} while(0) | |
42710 | + | |
42711 | +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) | |
42712 | +{ | |
42713 | + if (!next->context.pinned) | |
42714 | + mm_pin(next); | |
42715 | + switch_mm(prev, next, NULL); | |
42716 | +} | |
42717 | + | |
42718 | +#endif | |
42719 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h | |
42720 | =================================================================== | |
42721 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
42722 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h 2008-04-02 12:34:02.000000000 +0200 | |
42723 | @@ -0,0 +1,212 @@ | |
42724 | +#ifndef _X86_64_PAGE_H | |
42725 | +#define _X86_64_PAGE_H | |
42726 | + | |
42727 | +/* #include <linux/string.h> */ | |
42728 | +#ifndef __ASSEMBLY__ | |
42729 | +#include <linux/kernel.h> | |
42730 | +#include <linux/types.h> | |
42731 | +#include <asm/bug.h> | |
42732 | +#endif | |
42733 | +#include <xen/interface/xen.h> | |
42734 | + | |
42735 | +/* | |
42736 | + * Need to repeat this here in order to not include pgtable.h (which in turn | |
42737 | + * depends on definitions made here), but to be able to use the symbolic | |
42738 | + * below. The preprocessor will warn if the two definitions aren't identical. | |
42739 | + */ | |
42740 | +#define _PAGE_PRESENT 0x001 | |
42741 | +#define _PAGE_IO 0x200 | |
42742 | + | |
42743 | +/* PAGE_SHIFT determines the page size */ | |
42744 | +#define PAGE_SHIFT 12 | |
42745 | +#ifdef __ASSEMBLY__ | |
42746 | +#define PAGE_SIZE (0x1 << PAGE_SHIFT) | |
42747 | +#else | |
42748 | +#define PAGE_SIZE (1UL << PAGE_SHIFT) | |
42749 | +#endif | |
42750 | +#define PAGE_MASK (~(PAGE_SIZE-1)) | |
42751 | + | |
42752 | +/* See Documentation/x86_64/mm.txt for a description of the memory map. */ | |
42753 | +#define __PHYSICAL_MASK_SHIFT 46 | |
42754 | +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) | |
42755 | +#define __VIRTUAL_MASK_SHIFT 48 | |
42756 | +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) | |
42757 | + | |
42758 | +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) | |
42759 | + | |
42760 | +#define THREAD_ORDER 1 | |
42761 | +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) | |
42762 | +#define CURRENT_MASK (~(THREAD_SIZE-1)) | |
42763 | + | |
42764 | +#define EXCEPTION_STACK_ORDER 0 | |
42765 | +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) | |
42766 | + | |
42767 | +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) | |
42768 | +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) | |
42769 | + | |
42770 | +#define IRQSTACK_ORDER 2 | |
42771 | +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) | |
42772 | + | |
42773 | +#define STACKFAULT_STACK 1 | |
42774 | +#define DOUBLEFAULT_STACK 2 | |
42775 | +#define NMI_STACK 3 | |
42776 | +#define DEBUG_STACK 4 | |
42777 | +#define MCE_STACK 5 | |
42778 | +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ | |
42779 | + | |
42780 | +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) | |
42781 | +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) | |
42782 | + | |
42783 | +#define HPAGE_SHIFT PMD_SHIFT | |
42784 | +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) | |
42785 | +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) | |
42786 | +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) | |
42787 | + | |
42788 | +#ifdef __KERNEL__ | |
42789 | +#ifndef __ASSEMBLY__ | |
42790 | + | |
42791 | +extern unsigned long end_pfn; | |
42792 | + | |
42793 | +#include <asm/maddr.h> | |
42794 | + | |
42795 | +void clear_page(void *); | |
42796 | +void copy_page(void *, void *); | |
42797 | + | |
42798 | +#define clear_user_page(page, vaddr, pg) clear_page(page) | |
42799 | +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) | |
42800 | + | |
42801 | +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) | |
42802 | +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE | |
42803 | + | |
42804 | +/* | |
42805 | + * These are used to make use of C type-checking.. | |
42806 | + */ | |
42807 | +typedef struct { unsigned long pte; } pte_t; | |
42808 | +typedef struct { unsigned long pmd; } pmd_t; | |
42809 | +typedef struct { unsigned long pud; } pud_t; | |
42810 | +typedef struct { unsigned long pgd; } pgd_t; | |
42811 | +#define PTE_MASK PHYSICAL_PAGE_MASK | |
42812 | + | |
42813 | +typedef struct { unsigned long pgprot; } pgprot_t; | |
42814 | + | |
42815 | +#define __pte_val(x) ((x).pte) | |
42816 | +#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \ | |
42817 | + == _PAGE_PRESENT ? \ | |
42818 | + pte_machine_to_phys(__pte_val(x)) : \ | |
42819 | + __pte_val(x)) | |
42820 | + | |
42821 | +#define __pmd_val(x) ((x).pmd) | |
42822 | +static inline unsigned long pmd_val(pmd_t x) | |
42823 | +{ | |
42824 | + unsigned long ret = __pmd_val(x); | |
42825 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
42826 | + if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; | |
42827 | +#else | |
42828 | + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
42829 | +#endif | |
42830 | + return ret; | |
42831 | +} | |
42832 | + | |
42833 | +#define __pud_val(x) ((x).pud) | |
42834 | +static inline unsigned long pud_val(pud_t x) | |
42835 | +{ | |
42836 | + unsigned long ret = __pud_val(x); | |
42837 | + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
42838 | + return ret; | |
42839 | +} | |
42840 | + | |
42841 | +#define __pgd_val(x) ((x).pgd) | |
42842 | +static inline unsigned long pgd_val(pgd_t x) | |
42843 | +{ | |
42844 | + unsigned long ret = __pgd_val(x); | |
42845 | + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); | |
42846 | + return ret; | |
42847 | +} | |
42848 | + | |
42849 | +#define pgprot_val(x) ((x).pgprot) | |
42850 | + | |
42851 | +static inline pte_t __pte(unsigned long x) | |
42852 | +{ | |
42853 | + if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT) | |
42854 | + x = pte_phys_to_machine(x); | |
42855 | + return ((pte_t) { (x) }); | |
42856 | +} | |
42857 | + | |
42858 | +static inline pmd_t __pmd(unsigned long x) | |
42859 | +{ | |
42860 | + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
42861 | + return ((pmd_t) { (x) }); | |
42862 | +} | |
42863 | + | |
42864 | +static inline pud_t __pud(unsigned long x) | |
42865 | +{ | |
42866 | + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
42867 | + return ((pud_t) { (x) }); | |
42868 | +} | |
42869 | + | |
42870 | +static inline pgd_t __pgd(unsigned long x) | |
42871 | +{ | |
42872 | + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x); | |
42873 | + return ((pgd_t) { (x) }); | |
42874 | +} | |
42875 | + | |
42876 | +#define __pgprot(x) ((pgprot_t) { (x) } ) | |
42877 | + | |
42878 | +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) | |
42879 | +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) | |
42880 | +#define __START_KERNEL_map 0xffffffff80000000UL | |
42881 | +#define __PAGE_OFFSET 0xffff880000000000UL | |
42882 | + | |
42883 | +#else | |
42884 | +#define __PHYSICAL_START CONFIG_PHYSICAL_START | |
42885 | +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) | |
42886 | +#define __START_KERNEL_map 0xffffffff80000000 | |
42887 | +#define __PAGE_OFFSET 0xffff880000000000 | |
42888 | +#endif /* !__ASSEMBLY__ */ | |
42889 | + | |
42890 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
42891 | +#undef LOAD_OFFSET | |
42892 | +#define LOAD_OFFSET 0 | |
42893 | +#endif | |
42894 | + | |
42895 | +/* to align the pointer to the (next) page boundary */ | |
42896 | +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) | |
42897 | + | |
42898 | +#define KERNEL_TEXT_SIZE (40UL*1024*1024) | |
42899 | +#define KERNEL_TEXT_START 0xffffffff80000000UL | |
42900 | + | |
42901 | +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) | |
42902 | + | |
42903 | +/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. | |
42904 | + Otherwise you risk miscompilation. */ | |
42905 | +#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) | |
42906 | +/* __pa_symbol should be used for C visible symbols. | |
42907 | + This seems to be the official gcc blessed way to do such arithmetic. */ | |
42908 | +#define __pa_symbol(x) \ | |
42909 | + ({unsigned long v; \ | |
42910 | + asm("" : "=r" (v) : "0" (x)); \ | |
42911 | + __pa(v); }) | |
42912 | + | |
42913 | +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) | |
42914 | +#define __boot_va(x) __va(x) | |
42915 | +#define __boot_pa(x) __pa(x) | |
42916 | +#ifdef CONFIG_FLATMEM | |
42917 | +#define pfn_valid(pfn) ((pfn) < end_pfn) | |
42918 | +#endif | |
42919 | + | |
42920 | +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) | |
42921 | +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) | |
42922 | +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) | |
42923 | + | |
42924 | +#define VM_DATA_DEFAULT_FLAGS \ | |
42925 | + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ | |
42926 | + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) | |
42927 | + | |
42928 | +#define __HAVE_ARCH_GATE_AREA 1 | |
42929 | + | |
42930 | +#include <asm-generic/memory_model.h> | |
42931 | +#include <asm-generic/page.h> | |
42932 | + | |
42933 | +#endif /* __KERNEL__ */ | |
42934 | + | |
42935 | +#endif /* _X86_64_PAGE_H */ | |
42936 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h | |
42937 | =================================================================== | |
42938 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
42939 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h 2007-09-14 11:14:51.000000000 +0200 | |
42940 | @@ -0,0 +1,168 @@ | |
42941 | +#ifndef __x8664_PCI_H | |
42942 | +#define __x8664_PCI_H | |
42943 | + | |
42944 | +#include <asm/io.h> | |
42945 | + | |
42946 | +#ifdef __KERNEL__ | |
42947 | + | |
42948 | +#include <linux/mm.h> /* for struct page */ | |
42949 | + | |
42950 | +/* Can be used to override the logic in pci_scan_bus for skipping | |
42951 | + already-configured bus numbers - to be used for buggy BIOSes | |
42952 | + or architectures with incomplete PCI setup by the loader */ | |
42953 | + | |
42954 | +#ifdef CONFIG_PCI | |
42955 | +extern unsigned int pcibios_assign_all_busses(void); | |
42956 | +#else | |
42957 | +#define pcibios_assign_all_busses() 0 | |
42958 | +#endif | |
42959 | + | |
42960 | +#include <asm/hypervisor.h> | |
42961 | +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) | |
42962 | + | |
42963 | +extern unsigned long pci_mem_start; | |
42964 | +#define PCIBIOS_MIN_IO 0x1000 | |
42965 | +#define PCIBIOS_MIN_MEM (pci_mem_start) | |
42966 | + | |
42967 | +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 | |
42968 | + | |
42969 | +void pcibios_config_init(void); | |
42970 | +struct pci_bus * pcibios_scan_root(int bus); | |
42971 | +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); | |
42972 | +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); | |
42973 | + | |
42974 | +void pcibios_set_master(struct pci_dev *dev); | |
42975 | +void pcibios_penalize_isa_irq(int irq, int active); | |
42976 | +struct irq_routing_table *pcibios_get_irq_routing_table(void); | |
42977 | +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); | |
42978 | + | |
42979 | +#include <linux/types.h> | |
42980 | +#include <linux/slab.h> | |
42981 | +#include <asm/scatterlist.h> | |
42982 | +#include <linux/string.h> | |
42983 | +#include <asm/page.h> | |
42984 | + | |
42985 | +extern void pci_iommu_alloc(void); | |
42986 | +extern int iommu_setup(char *opt); | |
42987 | + | |
42988 | +/* The PCI address space does equal the physical memory | |
42989 | + * address space. The networking and block device layers use | |
42990 | + * this boolean for bounce buffer decisions | |
42991 | + * | |
42992 | + * On AMD64 it mostly equals, but we set it to zero if a hardware | |
42993 | + * IOMMU (gart) of sotware IOMMU (swiotlb) is available. | |
42994 | + */ | |
42995 | +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) | |
42996 | + | |
42997 | +#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU) | |
42998 | + | |
42999 | +/* | |
43000 | + * x86-64 always supports DAC, but sometimes it is useful to force | |
43001 | + * devices through the IOMMU to get automatic sg list merging. | |
43002 | + * Optional right now. | |
43003 | + */ | |
43004 | +extern int iommu_sac_force; | |
43005 | +#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force) | |
43006 | + | |
43007 | +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ | |
43008 | + dma_addr_t ADDR_NAME; | |
43009 | +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ | |
43010 | + __u32 LEN_NAME; | |
43011 | +#define pci_unmap_addr(PTR, ADDR_NAME) \ | |
43012 | + ((PTR)->ADDR_NAME) | |
43013 | +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ | |
43014 | + (((PTR)->ADDR_NAME) = (VAL)) | |
43015 | +#define pci_unmap_len(PTR, LEN_NAME) \ | |
43016 | + ((PTR)->LEN_NAME) | |
43017 | +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ | |
43018 | + (((PTR)->LEN_NAME) = (VAL)) | |
43019 | + | |
43020 | +#elif defined(CONFIG_SWIOTLB) | |
43021 | + | |
43022 | +#define pci_dac_dma_supported(pci_dev, mask) 1 | |
43023 | + | |
43024 | +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ | |
43025 | + dma_addr_t ADDR_NAME; | |
43026 | +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ | |
43027 | + __u32 LEN_NAME; | |
43028 | +#define pci_unmap_addr(PTR, ADDR_NAME) \ | |
43029 | + ((PTR)->ADDR_NAME) | |
43030 | +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ | |
43031 | + (((PTR)->ADDR_NAME) = (VAL)) | |
43032 | +#define pci_unmap_len(PTR, LEN_NAME) \ | |
43033 | + ((PTR)->LEN_NAME) | |
43034 | +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ | |
43035 | + (((PTR)->LEN_NAME) = (VAL)) | |
43036 | + | |
43037 | +#else | |
43038 | +/* No IOMMU */ | |
43039 | + | |
43040 | +#define pci_dac_dma_supported(pci_dev, mask) 1 | |
43041 | + | |
43042 | +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) | |
43043 | +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) | |
43044 | +#define pci_unmap_addr(PTR, ADDR_NAME) (0) | |
43045 | +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) | |
43046 | +#define pci_unmap_len(PTR, LEN_NAME) (0) | |
43047 | +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) | |
43048 | + | |
43049 | +#endif | |
43050 | + | |
43051 | +#include <asm-generic/pci-dma-compat.h> | |
43052 | + | |
43053 | +static inline dma64_addr_t | |
43054 | +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) | |
43055 | +{ | |
43056 | + return ((dma64_addr_t) page_to_phys(page) + | |
43057 | + (dma64_addr_t) offset); | |
43058 | +} | |
43059 | + | |
43060 | +static inline struct page * | |
43061 | +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) | |
43062 | +{ | |
43063 | + return virt_to_page(__va(dma_addr)); | |
43064 | +} | |
43065 | + | |
43066 | +static inline unsigned long | |
43067 | +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) | |
43068 | +{ | |
43069 | + return (dma_addr & ~PAGE_MASK); | |
43070 | +} | |
43071 | + | |
43072 | +static inline void | |
43073 | +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) | |
43074 | +{ | |
43075 | +} | |
43076 | + | |
43077 | +static inline void | |
43078 | +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) | |
43079 | +{ | |
43080 | + flush_write_buffers(); | |
43081 | +} | |
43082 | + | |
43083 | +#ifdef CONFIG_PCI | |
43084 | +static inline void pci_dma_burst_advice(struct pci_dev *pdev, | |
43085 | + enum pci_dma_burst_strategy *strat, | |
43086 | + unsigned long *strategy_parameter) | |
43087 | +{ | |
43088 | + *strat = PCI_DMA_BURST_INFINITY; | |
43089 | + *strategy_parameter = ~0UL; | |
43090 | +} | |
43091 | +#endif | |
43092 | + | |
43093 | +#define HAVE_PCI_MMAP | |
43094 | +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, | |
43095 | + enum pci_mmap_state mmap_state, int write_combine); | |
43096 | + | |
43097 | +static inline void pcibios_add_platform_entries(struct pci_dev *dev) | |
43098 | +{ | |
43099 | +} | |
43100 | + | |
43101 | +#endif /* __KERNEL__ */ | |
43102 | + | |
43103 | +/* generic pci stuff */ | |
43104 | +#ifdef CONFIG_PCI | |
43105 | +#include <asm-generic/pci.h> | |
43106 | +#endif | |
43107 | + | |
43108 | +#endif /* __x8664_PCI_H */ | |
43109 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h | |
43110 | =================================================================== | |
43111 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
43112 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200 | |
43113 | @@ -0,0 +1,204 @@ | |
43114 | +#ifndef _X86_64_PGALLOC_H | |
43115 | +#define _X86_64_PGALLOC_H | |
43116 | + | |
43117 | +#include <asm/fixmap.h> | |
43118 | +#include <asm/pda.h> | |
43119 | +#include <linux/threads.h> | |
43120 | +#include <linux/mm.h> | |
43121 | +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ | |
43122 | + | |
43123 | +#include <xen/features.h> | |
43124 | +void make_page_readonly(void *va, unsigned int feature); | |
43125 | +void make_page_writable(void *va, unsigned int feature); | |
43126 | +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); | |
43127 | +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); | |
43128 | + | |
43129 | +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) | |
43130 | + | |
43131 | +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) | |
43132 | +{ | |
43133 | + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); | |
43134 | +} | |
43135 | + | |
43136 | +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) | |
43137 | +{ | |
43138 | + if (unlikely((mm)->context.pinned)) { | |
43139 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
43140 | + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), | |
43141 | + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); | |
43142 | + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); | |
43143 | + } else { | |
43144 | + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); | |
43145 | + } | |
43146 | +} | |
43147 | + | |
43148 | +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | |
43149 | +{ | |
43150 | + if (unlikely((mm)->context.pinned)) { | |
43151 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
43152 | + (unsigned long)pmd, | |
43153 | + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, | |
43154 | + PAGE_KERNEL_RO), 0)); | |
43155 | + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); | |
43156 | + } else { | |
43157 | + *(pud) = __pud(_PAGE_TABLE | __pa(pmd)); | |
43158 | + } | |
43159 | +} | |
43160 | + | |
43161 | +/* | |
43162 | + * We need to use the batch mode here, but pgd_pupulate() won't be | |
43163 | + * be called frequently. | |
43164 | + */ | |
43165 | +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) | |
43166 | +{ | |
43167 | + if (unlikely((mm)->context.pinned)) { | |
43168 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
43169 | + (unsigned long)pud, | |
43170 | + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, | |
43171 | + PAGE_KERNEL_RO), 0)); | |
43172 | + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); | |
43173 | + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); | |
43174 | + } else { | |
43175 | + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); | |
43176 | + *(__user_pgd(pgd)) = *(pgd); | |
43177 | + } | |
43178 | +} | |
43179 | + | |
43180 | +extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); | |
43181 | +extern void pte_free(struct page *pte); | |
43182 | + | |
43183 | +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) | |
43184 | +{ | |
43185 | + struct page *pg; | |
43186 | + | |
43187 | + pg = pte_alloc_one(mm, addr); | |
43188 | + return pg ? page_address(pg) : NULL; | |
43189 | +} | |
43190 | + | |
43191 | +static inline void pmd_free(pmd_t *pmd) | |
43192 | +{ | |
43193 | + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); | |
43194 | + pte_free(virt_to_page(pmd)); | |
43195 | +} | |
43196 | + | |
43197 | +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) | |
43198 | +{ | |
43199 | + struct page *pg; | |
43200 | + | |
43201 | + pg = pte_alloc_one(mm, addr); | |
43202 | + return pg ? page_address(pg) : NULL; | |
43203 | +} | |
43204 | + | |
43205 | +static inline void pud_free(pud_t *pud) | |
43206 | +{ | |
43207 | + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); | |
43208 | + pte_free(virt_to_page(pud)); | |
43209 | +} | |
43210 | + | |
43211 | +static inline void pgd_list_add(pgd_t *pgd) | |
43212 | +{ | |
43213 | + struct page *page = virt_to_page(pgd); | |
43214 | + | |
43215 | + spin_lock(&pgd_lock); | |
43216 | + page->index = (pgoff_t)pgd_list; | |
43217 | + if (pgd_list) | |
43218 | + pgd_list->private = (unsigned long)&page->index; | |
43219 | + pgd_list = page; | |
43220 | + page->private = (unsigned long)&pgd_list; | |
43221 | + spin_unlock(&pgd_lock); | |
43222 | +} | |
43223 | + | |
43224 | +static inline void pgd_list_del(pgd_t *pgd) | |
43225 | +{ | |
43226 | + struct page *next, **pprev, *page = virt_to_page(pgd); | |
43227 | + | |
43228 | + spin_lock(&pgd_lock); | |
43229 | + next = (struct page *)page->index; | |
43230 | + pprev = (struct page **)page->private; | |
43231 | + *pprev = next; | |
43232 | + if (next) | |
43233 | + next->private = (unsigned long)pprev; | |
43234 | + spin_unlock(&pgd_lock); | |
43235 | +} | |
43236 | + | |
43237 | +static inline pgd_t *pgd_alloc(struct mm_struct *mm) | |
43238 | +{ | |
43239 | + /* | |
43240 | + * We allocate two contiguous pages for kernel and user. | |
43241 | + */ | |
43242 | + unsigned boundary; | |
43243 | + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); | |
43244 | + if (!pgd) | |
43245 | + return NULL; | |
43246 | + pgd_list_add(pgd); | |
43247 | + /* | |
43248 | + * Copy kernel pointers in from init. | |
43249 | + * Could keep a freelist or slab cache of those because the kernel | |
43250 | + * part never changes. | |
43251 | + */ | |
43252 | + boundary = pgd_index(__PAGE_OFFSET); | |
43253 | + memset(pgd, 0, boundary * sizeof(pgd_t)); | |
43254 | + memcpy(pgd + boundary, | |
43255 | + init_level4_pgt + boundary, | |
43256 | + (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); | |
43257 | + | |
43258 | + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ | |
43259 | + /* | |
43260 | + * Set level3_user_pgt for vsyscall area | |
43261 | + */ | |
43262 | + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] = | |
43263 | + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); | |
43264 | + return pgd; | |
43265 | +} | |
43266 | + | |
43267 | +static inline void pgd_free(pgd_t *pgd) | |
43268 | +{ | |
43269 | + pte_t *ptep = virt_to_ptep(pgd); | |
43270 | + | |
43271 | + if (!pte_write(*ptep)) { | |
43272 | + xen_pgd_unpin(__pa(pgd)); | |
43273 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
43274 | + (unsigned long)pgd, | |
43275 | + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), | |
43276 | + 0)); | |
43277 | + } | |
43278 | + | |
43279 | + ptep = virt_to_ptep(__user_pgd(pgd)); | |
43280 | + | |
43281 | + if (!pte_write(*ptep)) { | |
43282 | + xen_pgd_unpin(__pa(__user_pgd(pgd))); | |
43283 | + BUG_ON(HYPERVISOR_update_va_mapping( | |
43284 | + (unsigned long)__user_pgd(pgd), | |
43285 | + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, | |
43286 | + PAGE_KERNEL), | |
43287 | + 0)); | |
43288 | + } | |
43289 | + | |
43290 | + pgd_list_del(pgd); | |
43291 | + free_pages((unsigned long)pgd, 1); | |
43292 | +} | |
43293 | + | |
43294 | +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | |
43295 | +{ | |
43296 | + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); | |
43297 | + if (pte) | |
43298 | + make_page_readonly(pte, XENFEAT_writable_page_tables); | |
43299 | + | |
43300 | + return pte; | |
43301 | +} | |
43302 | + | |
43303 | +/* Should really implement gc for free page table pages. This could be | |
43304 | + done with a reference count in struct page. */ | |
43305 | + | |
43306 | +static inline void pte_free_kernel(pte_t *pte) | |
43307 | +{ | |
43308 | + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); | |
43309 | + make_page_writable(pte, XENFEAT_writable_page_tables); | |
43310 | + free_page((unsigned long)pte); | |
43311 | +} | |
43312 | + | |
43313 | +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) | |
43314 | +#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) | |
43315 | +#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) | |
43316 | + | |
43317 | +#endif /* _X86_64_PGALLOC_H */ | |
43318 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h | |
43319 | =================================================================== | |
43320 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
43321 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-07-21 11:00:33.000000000 +0200 | |
43322 | @@ -0,0 +1,583 @@ | |
43323 | +#ifndef _X86_64_PGTABLE_H | |
43324 | +#define _X86_64_PGTABLE_H | |
43325 | + | |
43326 | +/* | |
43327 | + * This file contains the functions and defines necessary to modify and use | |
43328 | + * the x86-64 page table tree. | |
43329 | + */ | |
43330 | +#include <asm/processor.h> | |
43331 | +#include <asm/fixmap.h> | |
43332 | +#include <asm/bitops.h> | |
43333 | +#include <linux/threads.h> | |
43334 | +#include <linux/sched.h> | |
43335 | +#include <asm/pda.h> | |
43336 | +#ifdef CONFIG_XEN | |
43337 | +#include <asm/hypervisor.h> | |
43338 | + | |
43339 | +extern pud_t level3_user_pgt[512]; | |
43340 | + | |
43341 | +extern void xen_init_pt(void); | |
43342 | + | |
43343 | +extern pte_t *lookup_address(unsigned long address); | |
43344 | + | |
43345 | +#define virt_to_ptep(va) \ | |
43346 | +({ \ | |
43347 | + pte_t *__ptep = lookup_address((unsigned long)(va)); \ | |
43348 | + BUG_ON(!__ptep || !pte_present(*__ptep)); \ | |
43349 | + __ptep; \ | |
43350 | +}) | |
43351 | + | |
43352 | +#define arbitrary_virt_to_machine(va) \ | |
43353 | + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ | |
43354 | + | ((unsigned long)(va) & (PAGE_SIZE - 1))) | |
43355 | +#endif | |
43356 | + | |
43357 | +extern pud_t level3_kernel_pgt[512]; | |
43358 | +extern pud_t level3_physmem_pgt[512]; | |
43359 | +extern pud_t level3_ident_pgt[512]; | |
43360 | +extern pmd_t level2_kernel_pgt[512]; | |
43361 | +extern pgd_t init_level4_pgt[]; | |
43362 | +extern pgd_t boot_level4_pgt[]; | |
43363 | +extern unsigned long __supported_pte_mask; | |
43364 | + | |
43365 | +#define swapper_pg_dir init_level4_pgt | |
43366 | + | |
43367 | +extern int nonx_setup(char *str); | |
43368 | +extern void paging_init(void); | |
43369 | +extern void clear_kernel_mapping(unsigned long addr, unsigned long size); | |
43370 | + | |
43371 | +extern unsigned long pgkern_mask; | |
43372 | + | |
43373 | +/* | |
43374 | + * ZERO_PAGE is a global shared page that is always zero: used | |
43375 | + * for zero-mapped memory areas etc.. | |
43376 | + */ | |
43377 | +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; | |
43378 | +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) | |
43379 | + | |
43380 | +/* | |
43381 | + * PGDIR_SHIFT determines what a top-level page table entry can map | |
43382 | + */ | |
43383 | +#define PGDIR_SHIFT 39 | |
43384 | +#define PTRS_PER_PGD 512 | |
43385 | + | |
43386 | +/* | |
43387 | + * 3rd level page | |
43388 | + */ | |
43389 | +#define PUD_SHIFT 30 | |
43390 | +#define PTRS_PER_PUD 512 | |
43391 | + | |
43392 | +/* | |
43393 | + * PMD_SHIFT determines the size of the area a middle-level | |
43394 | + * page table can map | |
43395 | + */ | |
43396 | +#define PMD_SHIFT 21 | |
43397 | +#define PTRS_PER_PMD 512 | |
43398 | + | |
43399 | +/* | |
43400 | + * entries per page directory level | |
43401 | + */ | |
43402 | +#define PTRS_PER_PTE 512 | |
43403 | + | |
43404 | +#define pte_ERROR(e) \ | |
43405 | + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ | |
43406 | + &(e), __pte_val(e), pte_pfn(e)) | |
43407 | +#define pmd_ERROR(e) \ | |
43408 | + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ | |
43409 | + &(e), __pmd_val(e), pmd_pfn(e)) | |
43410 | +#define pud_ERROR(e) \ | |
43411 | + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ | |
43412 | + &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) | |
43413 | +#define pgd_ERROR(e) \ | |
43414 | + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ | |
43415 | + &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) | |
43416 | + | |
43417 | +#define pgd_none(x) (!__pgd_val(x)) | |
43418 | +#define pud_none(x) (!__pud_val(x)) | |
43419 | + | |
43420 | +static inline void set_pte(pte_t *dst, pte_t val) | |
43421 | +{ | |
43422 | + *dst = val; | |
43423 | +} | |
43424 | + | |
43425 | +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) | |
43426 | +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) | |
43427 | +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval)) | |
43428 | + | |
43429 | +static inline void pud_clear (pud_t * pud) | |
43430 | +{ | |
43431 | + set_pud(pud, __pud(0)); | |
43432 | +} | |
43433 | + | |
43434 | +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) | |
43435 | + | |
43436 | +static inline void pgd_clear (pgd_t * pgd) | |
43437 | +{ | |
43438 | + set_pgd(pgd, __pgd(0)); | |
43439 | + set_pgd(__user_pgd(pgd), __pgd(0)); | |
43440 | +} | |
43441 | + | |
43442 | +#define pud_page(pud) \ | |
43443 | + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) | |
43444 | + | |
43445 | +#define pte_same(a, b) ((a).pte == (b).pte) | |
43446 | + | |
43447 | +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) | |
43448 | + | |
43449 | +#define PMD_SIZE (1UL << PMD_SHIFT) | |
43450 | +#define PMD_MASK (~(PMD_SIZE-1)) | |
43451 | +#define PUD_SIZE (1UL << PUD_SHIFT) | |
43452 | +#define PUD_MASK (~(PUD_SIZE-1)) | |
43453 | +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) | |
43454 | +#define PGDIR_MASK (~(PGDIR_SIZE-1)) | |
43455 | + | |
43456 | +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) | |
43457 | +#define FIRST_USER_ADDRESS 0 | |
43458 | + | |
43459 | +#ifndef __ASSEMBLY__ | |
43460 | +#define MAXMEM 0x3fffffffffffUL | |
43461 | +#define VMALLOC_START 0xffffc20000000000UL | |
43462 | +#define VMALLOC_END 0xffffe1ffffffffffUL | |
43463 | +#define MODULES_VADDR 0xffffffff88000000UL | |
43464 | +#define MODULES_END 0xfffffffffff00000UL | |
43465 | +#define MODULES_LEN (MODULES_END - MODULES_VADDR) | |
43466 | + | |
43467 | +#define _PAGE_BIT_PRESENT 0 | |
43468 | +#define _PAGE_BIT_RW 1 | |
43469 | +#define _PAGE_BIT_USER 2 | |
43470 | +#define _PAGE_BIT_PWT 3 | |
43471 | +#define _PAGE_BIT_PCD 4 | |
43472 | +#define _PAGE_BIT_ACCESSED 5 | |
43473 | +#define _PAGE_BIT_DIRTY 6 | |
43474 | +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | |
43475 | +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | |
43476 | +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | |
43477 | + | |
43478 | +#define _PAGE_PRESENT 0x001 | |
43479 | +#define _PAGE_RW 0x002 | |
43480 | +#define _PAGE_USER 0x004 | |
43481 | +#define _PAGE_PWT 0x008 | |
43482 | +#define _PAGE_PCD 0x010 | |
43483 | +#define _PAGE_ACCESSED 0x020 | |
43484 | +#define _PAGE_DIRTY 0x040 | |
43485 | +#define _PAGE_PSE 0x080 /* 2MB page */ | |
43486 | +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ | |
43487 | +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */ | |
43488 | + | |
43489 | +#define _PAGE_PROTNONE 0x080 /* If not present */ | |
43490 | +#define _PAGE_NX (1UL<<_PAGE_BIT_NX) | |
43491 | + | |
43492 | +/* Mapped page is I/O or foreign and has no associated page struct. */ | |
43493 | +#define _PAGE_IO 0x200 | |
43494 | + | |
43495 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
43496 | +extern unsigned int __kernel_page_user; | |
43497 | +#else | |
43498 | +#define __kernel_page_user 0 | |
43499 | +#endif | |
43500 | + | |
43501 | +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) | |
43502 | +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) | |
43503 | + | |
43504 | +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) | |
43505 | + | |
43506 | +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | |
43507 | +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
43508 | +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) | |
43509 | +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
43510 | +#define PAGE_COPY PAGE_COPY_NOEXEC | |
43511 | +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
43512 | +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) | |
43513 | +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
43514 | +#define __PAGE_KERNEL \ | |
43515 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
43516 | +#define __PAGE_KERNEL_EXEC \ | |
43517 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) | |
43518 | +#define __PAGE_KERNEL_NOCACHE \ | |
43519 | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
43520 | +#define __PAGE_KERNEL_RO \ | |
43521 | + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) | |
43522 | +#define __PAGE_KERNEL_VSYSCALL \ | |
43523 | + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) | |
43524 | +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \ | |
43525 | + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD) | |
43526 | +#define __PAGE_KERNEL_LARGE \ | |
43527 | + (__PAGE_KERNEL | _PAGE_PSE) | |
43528 | +#define __PAGE_KERNEL_LARGE_EXEC \ | |
43529 | + (__PAGE_KERNEL_EXEC | _PAGE_PSE) | |
43530 | + | |
43531 | +/* | |
43532 | + * We don't support GLOBAL page in xenolinux64 | |
43533 | + */ | |
43534 | +#define MAKE_GLOBAL(x) __pgprot((x)) | |
43535 | + | |
43536 | +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) | |
43537 | +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) | |
43538 | +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) | |
43539 | +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) | |
43540 | +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) | |
43541 | +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) | |
43542 | +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) | |
43543 | +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) | |
43544 | + | |
43545 | +/* xwr */ | |
43546 | +#define __P000 PAGE_NONE | |
43547 | +#define __P001 PAGE_READONLY | |
43548 | +#define __P010 PAGE_COPY | |
43549 | +#define __P011 PAGE_COPY | |
43550 | +#define __P100 PAGE_READONLY_EXEC | |
43551 | +#define __P101 PAGE_READONLY_EXEC | |
43552 | +#define __P110 PAGE_COPY_EXEC | |
43553 | +#define __P111 PAGE_COPY_EXEC | |
43554 | + | |
43555 | +#define __S000 PAGE_NONE | |
43556 | +#define __S001 PAGE_READONLY | |
43557 | +#define __S010 PAGE_SHARED | |
43558 | +#define __S011 PAGE_SHARED | |
43559 | +#define __S100 PAGE_READONLY_EXEC | |
43560 | +#define __S101 PAGE_READONLY_EXEC | |
43561 | +#define __S110 PAGE_SHARED_EXEC | |
43562 | +#define __S111 PAGE_SHARED_EXEC | |
43563 | + | |
43564 | +static inline unsigned long pgd_bad(pgd_t pgd) | |
43565 | +{ | |
43566 | + unsigned long val = __pgd_val(pgd); | |
43567 | + val &= ~PTE_MASK; | |
43568 | + val &= ~(_PAGE_USER | _PAGE_DIRTY); | |
43569 | + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); | |
43570 | +} | |
43571 | + | |
43572 | +static inline unsigned long pud_bad(pud_t pud) | |
43573 | +{ | |
43574 | + unsigned long val = __pud_val(pud); | |
43575 | + val &= ~PTE_MASK; | |
43576 | + val &= ~(_PAGE_USER | _PAGE_DIRTY); | |
43577 | + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); | |
43578 | +} | |
43579 | + | |
43580 | +#define set_pte_at(_mm,addr,ptep,pteval) do { \ | |
43581 | + if (((_mm) != current->mm && (_mm) != &init_mm) || \ | |
43582 | + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ | |
43583 | + set_pte((ptep), (pteval)); \ | |
43584 | +} while (0) | |
43585 | + | |
43586 | +#define pte_none(x) (!(x).pte) | |
43587 | +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) | |
43588 | +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) | |
43589 | + | |
43590 | +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) | |
43591 | + | |
43592 | +#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) | |
43593 | +#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ | |
43594 | + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) | |
43595 | +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \ | |
43596 | + (_pte).pte & _PAGE_PRESENT ? \ | |
43597 | + mfn_to_local_pfn(__pte_mfn(_pte)) : \ | |
43598 | + __pte_mfn(_pte)) | |
43599 | + | |
43600 | +#define pte_page(x) pfn_to_page(pte_pfn(x)) | |
43601 | + | |
43602 | +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) | |
43603 | +{ | |
43604 | + unsigned long pte = page_nr << PAGE_SHIFT; | |
43605 | + pte |= pgprot_val(pgprot); | |
43606 | + pte &= __supported_pte_mask; | |
43607 | + return __pte(pte); | |
43608 | +} | |
43609 | + | |
43610 | +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
43611 | +{ | |
43612 | + pte_t pte = *ptep; | |
43613 | + if (!pte_none(pte)) { | |
43614 | + if ((mm != &init_mm) || | |
43615 | + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) | |
43616 | + pte = __pte_ma(xchg(&ptep->pte, 0)); | |
43617 | + } | |
43618 | + return pte; | |
43619 | +} | |
43620 | + | |
43621 | +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) | |
43622 | +{ | |
43623 | + if (full) { | |
43624 | + pte_t pte = *ptep; | |
43625 | + if (mm->context.pinned) | |
43626 | + xen_l1_entry_update(ptep, __pte(0)); | |
43627 | + else | |
43628 | + *ptep = __pte(0); | |
43629 | + return pte; | |
43630 | + } | |
43631 | + return ptep_get_and_clear(mm, addr, ptep); | |
43632 | +} | |
43633 | + | |
43634 | +#define ptep_clear_flush(vma, addr, ptep) \ | |
43635 | +({ \ | |
43636 | + pte_t *__ptep = (ptep); \ | |
43637 | + pte_t __res = *__ptep; \ | |
43638 | + if (!pte_none(__res) && \ | |
43639 | + ((vma)->vm_mm != current->mm || \ | |
43640 | + HYPERVISOR_update_va_mapping(addr, __pte(0), \ | |
43641 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
43642 | + UVMF_INVLPG|UVMF_MULTI))) { \ | |
43643 | + __ptep->pte = 0; \ | |
43644 | + flush_tlb_page(vma, addr); \ | |
43645 | + } \ | |
43646 | + __res; \ | |
43647 | +}) | |
43648 | + | |
43649 | +/* | |
43650 | + * The following only work if pte_present() is true. | |
43651 | + * Undefined behaviour if not.. | |
43652 | + */ | |
43653 | +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) | |
43654 | +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } | |
43655 | +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } | |
43656 | +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } | |
43657 | +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } | |
43658 | +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } | |
43659 | +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } | |
43660 | +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } | |
43661 | +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } | |
43662 | + | |
43663 | +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } | |
43664 | +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } | |
43665 | +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; } | |
43666 | +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } | |
43667 | +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } | |
43668 | +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } | |
43669 | +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } | |
43670 | +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } | |
43671 | +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } | |
43672 | +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } | |
43673 | +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } | |
43674 | + | |
43675 | +#define ptep_test_and_clear_dirty(vma, addr, ptep) \ | |
43676 | +({ \ | |
43677 | + pte_t __pte = *(ptep); \ | |
43678 | + int __ret = pte_dirty(__pte); \ | |
43679 | + if (__ret) \ | |
43680 | + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \ | |
43681 | + __ret; \ | |
43682 | +}) | |
43683 | + | |
43684 | +#define ptep_test_and_clear_young(vma, addr, ptep) \ | |
43685 | +({ \ | |
43686 | + pte_t __pte = *(ptep); \ | |
43687 | + int __ret = pte_young(__pte); \ | |
43688 | + if (__ret) \ | |
43689 | + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \ | |
43690 | + __ret; \ | |
43691 | +}) | |
43692 | + | |
43693 | +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | |
43694 | +{ | |
43695 | + pte_t pte = *ptep; | |
43696 | + if (pte_write(pte)) | |
43697 | + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); | |
43698 | +} | |
43699 | + | |
43700 | +/* | |
43701 | + * Macro to mark a page protection value as "uncacheable". | |
43702 | + */ | |
43703 | +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) | |
43704 | + | |
43705 | +static inline int pmd_large(pmd_t pte) { | |
43706 | + return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; | |
43707 | +} | |
43708 | + | |
43709 | + | |
43710 | +/* | |
43711 | + * Conversion functions: convert a page and protection to a page entry, | |
43712 | + * and a page entry and page directory to the page they refer to. | |
43713 | + */ | |
43714 | + | |
43715 | +/* | |
43716 | + * Level 4 access. | |
43717 | + * Never use these in the common code. | |
43718 | + */ | |
43719 | +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) | |
43720 | +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) | |
43721 | +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) | |
43722 | +#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) | |
43723 | +#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) | |
43724 | +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) | |
43725 | + | |
43726 | +/* PUD - Level3 access */ | |
43727 | +/* to find an entry in a page-table-directory. */ | |
43728 | +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) | |
43729 | +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) | |
43730 | +#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) | |
43731 | + | |
43732 | +/* PMD - Level 2 access */ | |
43733 | +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) | |
43734 | +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) | |
43735 | + | |
43736 | +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) | |
43737 | +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ | |
43738 | + pmd_index(address)) | |
43739 | +#define pmd_none(x) (!__pmd_val(x)) | |
43740 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
43741 | +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. | |
43742 | + can temporarily clear it. */ | |
43743 | +#define pmd_present(x) (__pmd_val(x)) | |
43744 | +#else | |
43745 | +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) | |
43746 | +#endif | |
43747 | +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | |
43748 | +#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \ | |
43749 | + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT))) | |
43750 | +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) | |
43751 | +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) | |
43752 | + | |
43753 | +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) | |
43754 | +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) | |
43755 | +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT | |
43756 | + | |
43757 | +/* PTE - Level 1 access. */ | |
43758 | + | |
43759 | +/* page, protection -> pte */ | |
43760 | +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) | |
43761 | +#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) | |
43762 | + | |
43763 | +/* physical address -> PTE */ | |
43764 | +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) | |
43765 | +{ | |
43766 | + unsigned long pteval; | |
43767 | + pteval = physpage | pgprot_val(pgprot); | |
43768 | + return __pte(pteval); | |
43769 | +} | |
43770 | + | |
43771 | +/* Change flags of a PTE */ | |
43772 | +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |
43773 | +{ | |
43774 | + /* | |
43775 | + * Since this might change the present bit (which controls whether | |
43776 | + * a pte_t object has undergone p2m translation), we must use | |
43777 | + * pte_val() on the input pte and __pte() for the return value. | |
43778 | + */ | |
43779 | + unsigned long pteval = pte_val(pte); | |
43780 | + | |
43781 | + pteval &= _PAGE_CHG_MASK; | |
43782 | + pteval |= pgprot_val(newprot); | |
43783 | + pteval &= __supported_pte_mask; | |
43784 | + return __pte(pteval); | |
43785 | +} | |
43786 | + | |
43787 | +#define pte_index(address) \ | |
43788 | + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) | |
43789 | +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ | |
43790 | + pte_index(address)) | |
43791 | + | |
43792 | +/* x86-64 always has all page tables mapped. */ | |
43793 | +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address) | |
43794 | +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) | |
43795 | +#define pte_unmap(pte) /* NOP */ | |
43796 | +#define pte_unmap_nested(pte) /* NOP */ | |
43797 | + | |
43798 | +#define update_mmu_cache(vma,address,pte) do { } while (0) | |
43799 | + | |
43800 | +/* | |
43801 | + * Rules for using ptep_establish: the pte MUST be a user pte, and | |
43802 | + * must be a present->present transition. | |
43803 | + */ | |
43804 | +#define __HAVE_ARCH_PTEP_ESTABLISH | |
43805 | +#define ptep_establish(vma, address, ptep, pteval) \ | |
43806 | + do { \ | |
43807 | + if ( likely((vma)->vm_mm == current->mm) ) { \ | |
43808 | + BUG_ON(HYPERVISOR_update_va_mapping(address, \ | |
43809 | + pteval, \ | |
43810 | + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ | |
43811 | + UVMF_INVLPG|UVMF_MULTI)); \ | |
43812 | + } else { \ | |
43813 | + xen_l1_entry_update(ptep, pteval); \ | |
43814 | + flush_tlb_page(vma, address); \ | |
43815 | + } \ | |
43816 | + } while (0) | |
43817 | + | |
43818 | +/* We only update the dirty/accessed state if we set | |
43819 | + * the dirty bit by hand in the kernel, since the hardware | |
43820 | + * will do the accessed bit for us, and we don't want to | |
43821 | + * race with other CPU's that might be updating the dirty | |
43822 | + * bit at the same time. */ | |
43823 | +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | |
43824 | +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ | |
43825 | + do { \ | |
43826 | + if (dirty) \ | |
43827 | + ptep_establish(vma, address, ptep, entry); \ | |
43828 | + } while (0) | |
43829 | + | |
43830 | +/* Encode and de-code a swap entry */ | |
43831 | +#define __swp_type(x) (((x).val >> 1) & 0x3f) | |
43832 | +#define __swp_offset(x) ((x).val >> 8) | |
43833 | +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) | |
43834 | +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) | |
43835 | +#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) | |
43836 | + | |
43837 | +extern spinlock_t pgd_lock; | |
43838 | +extern struct page *pgd_list; | |
43839 | +void vmalloc_sync_all(void); | |
43840 | + | |
43841 | +#endif /* !__ASSEMBLY__ */ | |
43842 | + | |
43843 | +extern int kern_addr_valid(unsigned long addr); | |
43844 | + | |
43845 | +#define DOMID_LOCAL (0xFFFFU) | |
43846 | + | |
43847 | +struct vm_area_struct; | |
43848 | + | |
43849 | +int direct_remap_pfn_range(struct vm_area_struct *vma, | |
43850 | + unsigned long address, | |
43851 | + unsigned long mfn, | |
43852 | + unsigned long size, | |
43853 | + pgprot_t prot, | |
43854 | + domid_t domid); | |
43855 | + | |
43856 | +int direct_kernel_remap_pfn_range(unsigned long address, | |
43857 | + unsigned long mfn, | |
43858 | + unsigned long size, | |
43859 | + pgprot_t prot, | |
43860 | + domid_t domid); | |
43861 | + | |
43862 | +int create_lookup_pte_addr(struct mm_struct *mm, | |
43863 | + unsigned long address, | |
43864 | + uint64_t *ptep); | |
43865 | + | |
43866 | +int touch_pte_range(struct mm_struct *mm, | |
43867 | + unsigned long address, | |
43868 | + unsigned long size); | |
43869 | + | |
43870 | +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |
43871 | + unsigned long addr, unsigned long end, pgprot_t newprot); | |
43872 | + | |
43873 | +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ | |
43874 | + xen_change_pte_range(mm, pmd, addr, end, newprot) | |
43875 | + | |
43876 | +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ | |
43877 | + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) | |
43878 | + | |
43879 | +#define MK_IOSPACE_PFN(space, pfn) (pfn) | |
43880 | +#define GET_IOSPACE(pfn) 0 | |
43881 | +#define GET_PFN(pfn) (pfn) | |
43882 | + | |
43883 | +#define HAVE_ARCH_UNMAPPED_AREA | |
43884 | + | |
43885 | +#define pgtable_cache_init() do { } while (0) | |
43886 | +#define check_pgt_cache() do { } while (0) | |
43887 | + | |
43888 | +#define PAGE_AGP PAGE_KERNEL_NOCACHE | |
43889 | +#define HAVE_PAGE_AGP 1 | |
43890 | + | |
43891 | +/* fs/proc/kcore.c */ | |
43892 | +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) | |
43893 | +#define kc_offset_to_vaddr(o) \ | |
43894 | + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) | |
43895 | + | |
43896 | +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG | |
43897 | +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY | |
43898 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR | |
43899 | +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL | |
43900 | +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH | |
43901 | +#define __HAVE_ARCH_PTEP_SET_WRPROTECT | |
43902 | +#define __HAVE_ARCH_PTE_SAME | |
43903 | +#include <asm-generic/pgtable.h> | |
43904 | + | |
43905 | +#endif /* _X86_64_PGTABLE_H */ | |
43906 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h | |
43907 | =================================================================== | |
43908 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
43909 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100 | |
43910 | @@ -0,0 +1,502 @@ | |
43911 | +/* | |
43912 | + * include/asm-x86_64/processor.h | |
43913 | + * | |
43914 | + * Copyright (C) 1994 Linus Torvalds | |
43915 | + */ | |
43916 | + | |
43917 | +#ifndef __ASM_X86_64_PROCESSOR_H | |
43918 | +#define __ASM_X86_64_PROCESSOR_H | |
43919 | + | |
43920 | +#include <asm/segment.h> | |
43921 | +#include <asm/page.h> | |
43922 | +#include <asm/types.h> | |
43923 | +#include <asm/sigcontext.h> | |
43924 | +#include <asm/cpufeature.h> | |
43925 | +#include <linux/threads.h> | |
43926 | +#include <asm/msr.h> | |
43927 | +#include <asm/current.h> | |
43928 | +#include <asm/system.h> | |
43929 | +#include <asm/mmsegment.h> | |
43930 | +#include <asm/percpu.h> | |
43931 | +#include <linux/personality.h> | |
43932 | +#include <linux/cpumask.h> | |
43933 | + | |
43934 | +#define TF_MASK 0x00000100 | |
43935 | +#define IF_MASK 0x00000200 | |
43936 | +#define IOPL_MASK 0x00003000 | |
43937 | +#define NT_MASK 0x00004000 | |
43938 | +#define VM_MASK 0x00020000 | |
43939 | +#define AC_MASK 0x00040000 | |
43940 | +#define VIF_MASK 0x00080000 /* virtual interrupt flag */ | |
43941 | +#define VIP_MASK 0x00100000 /* virtual interrupt pending */ | |
43942 | +#define ID_MASK 0x00200000 | |
43943 | + | |
43944 | +#define desc_empty(desc) \ | |
43945 | + (!((desc)->a | (desc)->b)) | |
43946 | + | |
43947 | +#define desc_equal(desc1, desc2) \ | |
43948 | + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) | |
43949 | + | |
43950 | +/* | |
43951 | + * Default implementation of macro that returns current | |
43952 | + * instruction pointer ("program counter"). | |
43953 | + */ | |
43954 | +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; }) | |
43955 | + | |
43956 | +/* | |
43957 | + * CPU type and hardware bug flags. Kept separately for each CPU. | |
43958 | + */ | |
43959 | + | |
43960 | +struct cpuinfo_x86 { | |
43961 | + __u8 x86; /* CPU family */ | |
43962 | + __u8 x86_vendor; /* CPU vendor */ | |
43963 | + __u8 x86_model; | |
43964 | + __u8 x86_mask; | |
43965 | + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ | |
43966 | + __u32 x86_capability[NCAPINTS]; | |
43967 | + char x86_vendor_id[16]; | |
43968 | + char x86_model_id[64]; | |
43969 | + int x86_cache_size; /* in KB */ | |
43970 | + int x86_clflush_size; | |
43971 | + int x86_cache_alignment; | |
43972 | + int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ | |
43973 | + __u8 x86_virt_bits, x86_phys_bits; | |
43974 | + __u8 x86_max_cores; /* cpuid returned max cores value */ | |
43975 | + __u32 x86_power; | |
43976 | + __u32 extended_cpuid_level; /* Max extended CPUID function supported */ | |
43977 | + unsigned long loops_per_jiffy; | |
43978 | +#ifdef CONFIG_SMP | |
43979 | + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ | |
43980 | +#endif | |
43981 | + __u8 apicid; | |
43982 | +#ifdef CONFIG_SMP | |
43983 | + __u8 booted_cores; /* number of cores as seen by OS */ | |
43984 | + __u8 phys_proc_id; /* Physical Processor id. */ | |
43985 | + __u8 cpu_core_id; /* Core id. */ | |
43986 | +#endif | |
43987 | +} ____cacheline_aligned; | |
43988 | + | |
43989 | +#define X86_VENDOR_INTEL 0 | |
43990 | +#define X86_VENDOR_CYRIX 1 | |
43991 | +#define X86_VENDOR_AMD 2 | |
43992 | +#define X86_VENDOR_UMC 3 | |
43993 | +#define X86_VENDOR_NEXGEN 4 | |
43994 | +#define X86_VENDOR_CENTAUR 5 | |
43995 | +#define X86_VENDOR_RISE 6 | |
43996 | +#define X86_VENDOR_TRANSMETA 7 | |
43997 | +#define X86_VENDOR_NUM 8 | |
43998 | +#define X86_VENDOR_UNKNOWN 0xff | |
43999 | + | |
44000 | +#ifdef CONFIG_SMP | |
44001 | +extern struct cpuinfo_x86 cpu_data[]; | |
44002 | +#define current_cpu_data cpu_data[smp_processor_id()] | |
44003 | +#else | |
44004 | +#define cpu_data (&boot_cpu_data) | |
44005 | +#define current_cpu_data boot_cpu_data | |
44006 | +#endif | |
44007 | + | |
44008 | +extern char ignore_irq13; | |
44009 | + | |
44010 | +extern void identify_cpu(struct cpuinfo_x86 *); | |
44011 | +extern void print_cpu_info(struct cpuinfo_x86 *); | |
44012 | +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); | |
44013 | +extern unsigned short num_cache_leaves; | |
44014 | + | |
44015 | +/* | |
44016 | + * EFLAGS bits | |
44017 | + */ | |
44018 | +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ | |
44019 | +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ | |
44020 | +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ | |
44021 | +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ | |
44022 | +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ | |
44023 | +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ | |
44024 | +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ | |
44025 | +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ | |
44026 | +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ | |
44027 | +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ | |
44028 | +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ | |
44029 | +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ | |
44030 | +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ | |
44031 | +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ | |
44032 | +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ | |
44033 | +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ | |
44034 | +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ | |
44035 | + | |
44036 | +/* | |
44037 | + * Intel CPU features in CR4 | |
44038 | + */ | |
44039 | +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ | |
44040 | +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ | |
44041 | +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ | |
44042 | +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ | |
44043 | +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ | |
44044 | +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ | |
44045 | +#define X86_CR4_MCE 0x0040 /* Machine check enable */ | |
44046 | +#define X86_CR4_PGE 0x0080 /* enable global pages */ | |
44047 | +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ | |
44048 | +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ | |
44049 | +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ | |
44050 | + | |
44051 | +/* | |
44052 | + * Save the cr4 feature set we're using (ie | |
44053 | + * Pentium 4MB enable and PPro Global page | |
44054 | + * enable), so that any CPU's that boot up | |
44055 | + * after us can get the correct flags. | |
44056 | + */ | |
44057 | +extern unsigned long mmu_cr4_features; | |
44058 | + | |
44059 | +static inline void set_in_cr4 (unsigned long mask) | |
44060 | +{ | |
44061 | + mmu_cr4_features |= mask; | |
44062 | + __asm__("movq %%cr4,%%rax\n\t" | |
44063 | + "orq %0,%%rax\n\t" | |
44064 | + "movq %%rax,%%cr4\n" | |
44065 | + : : "irg" (mask) | |
44066 | + :"ax"); | |
44067 | +} | |
44068 | + | |
44069 | +static inline void clear_in_cr4 (unsigned long mask) | |
44070 | +{ | |
44071 | + mmu_cr4_features &= ~mask; | |
44072 | + __asm__("movq %%cr4,%%rax\n\t" | |
44073 | + "andq %0,%%rax\n\t" | |
44074 | + "movq %%rax,%%cr4\n" | |
44075 | + : : "irg" (~mask) | |
44076 | + :"ax"); | |
44077 | +} | |
44078 | + | |
44079 | + | |
44080 | +/* | |
44081 | + * User space process size. 47bits minus one guard page. | |
44082 | + */ | |
44083 | +#define TASK_SIZE64 (0x800000000000UL - 4096) | |
44084 | + | |
44085 | +/* This decides where the kernel will search for a free chunk of vm | |
44086 | + * space during mmap's. | |
44087 | + */ | |
44088 | +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) | |
44089 | + | |
44090 | +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) | |
44091 | +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) | |
44092 | + | |
44093 | +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) | |
44094 | + | |
44095 | +/* | |
44096 | + * Size of io_bitmap. | |
44097 | + */ | |
44098 | +#define IO_BITMAP_BITS 65536 | |
44099 | +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) | |
44100 | +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) | |
44101 | +#ifndef CONFIG_X86_NO_TSS | |
44102 | +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) | |
44103 | +#endif | |
44104 | +#define INVALID_IO_BITMAP_OFFSET 0x8000 | |
44105 | + | |
44106 | +struct i387_fxsave_struct { | |
44107 | + u16 cwd; | |
44108 | + u16 swd; | |
44109 | + u16 twd; | |
44110 | + u16 fop; | |
44111 | + u64 rip; | |
44112 | + u64 rdp; | |
44113 | + u32 mxcsr; | |
44114 | + u32 mxcsr_mask; | |
44115 | + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | |
44116 | + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ | |
44117 | + u32 padding[24]; | |
44118 | +} __attribute__ ((aligned (16))); | |
44119 | + | |
44120 | +union i387_union { | |
44121 | + struct i387_fxsave_struct fxsave; | |
44122 | +}; | |
44123 | + | |
44124 | +#ifndef CONFIG_X86_NO_TSS | |
44125 | +struct tss_struct { | |
44126 | + u32 reserved1; | |
44127 | + u64 rsp0; | |
44128 | + u64 rsp1; | |
44129 | + u64 rsp2; | |
44130 | + u64 reserved2; | |
44131 | + u64 ist[7]; | |
44132 | + u32 reserved3; | |
44133 | + u32 reserved4; | |
44134 | + u16 reserved5; | |
44135 | + u16 io_bitmap_base; | |
44136 | + /* | |
44137 | + * The extra 1 is there because the CPU will access an | |
44138 | + * additional byte beyond the end of the IO permission | |
44139 | + * bitmap. The extra byte must be all 1 bits, and must | |
44140 | + * be within the limit. Thus we have: | |
44141 | + * | |
44142 | + * 128 bytes, the bitmap itself, for ports 0..0x3ff | |
44143 | + * 8 bytes, for an extra "long" of ~0UL | |
44144 | + */ | |
44145 | + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | |
44146 | +} __attribute__((packed)) ____cacheline_aligned; | |
44147 | + | |
44148 | +DECLARE_PER_CPU(struct tss_struct,init_tss); | |
44149 | +#endif | |
44150 | + | |
44151 | + | |
44152 | +extern struct cpuinfo_x86 boot_cpu_data; | |
44153 | +#ifndef CONFIG_X86_NO_TSS | |
44154 | +/* Save the original ist values for checking stack pointers during debugging */ | |
44155 | +struct orig_ist { | |
44156 | + unsigned long ist[7]; | |
44157 | +}; | |
44158 | +DECLARE_PER_CPU(struct orig_ist, orig_ist); | |
44159 | +#endif | |
44160 | + | |
44161 | +#ifdef CONFIG_X86_VSMP | |
44162 | +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) | |
44163 | +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) | |
44164 | +#else | |
44165 | +#define ARCH_MIN_TASKALIGN 16 | |
44166 | +#define ARCH_MIN_MMSTRUCT_ALIGN 0 | |
44167 | +#endif | |
44168 | + | |
44169 | +struct thread_struct { | |
44170 | + unsigned long rsp0; | |
44171 | + unsigned long rsp; | |
44172 | + unsigned long userrsp; /* Copy from PDA */ | |
44173 | + unsigned long fs; | |
44174 | + unsigned long gs; | |
44175 | + unsigned short es, ds, fsindex, gsindex; | |
44176 | +/* Hardware debugging registers */ | |
44177 | + unsigned long debugreg0; | |
44178 | + unsigned long debugreg1; | |
44179 | + unsigned long debugreg2; | |
44180 | + unsigned long debugreg3; | |
44181 | + unsigned long debugreg6; | |
44182 | + unsigned long debugreg7; | |
44183 | +/* fault info */ | |
44184 | + unsigned long cr2, trap_no, error_code; | |
44185 | +/* floating point info */ | |
44186 | + union i387_union i387 __attribute__((aligned(16))); | |
44187 | +/* IO permissions. the bitmap could be moved into the GDT, that would make | |
44188 | + switch faster for a limited number of ioperm using tasks. -AK */ | |
44189 | + int ioperm; | |
44190 | + unsigned long *io_bitmap_ptr; | |
44191 | + unsigned io_bitmap_max; | |
44192 | +/* cached TLS descriptors. */ | |
44193 | + u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; | |
44194 | + unsigned int iopl; | |
44195 | +} __attribute__((aligned(16))); | |
44196 | + | |
44197 | +#define INIT_THREAD { \ | |
44198 | + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
44199 | +} | |
44200 | + | |
44201 | +#ifndef CONFIG_X86_NO_TSS | |
44202 | +#define INIT_TSS { \ | |
44203 | + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | |
44204 | +} | |
44205 | +#endif | |
44206 | + | |
44207 | +#define INIT_MMAP \ | |
44208 | +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } | |
44209 | + | |
44210 | +#define start_thread(regs,new_rip,new_rsp) do { \ | |
44211 | + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ | |
44212 | + load_gs_index(0); \ | |
44213 | + (regs)->rip = (new_rip); \ | |
44214 | + (regs)->rsp = (new_rsp); \ | |
44215 | + write_pda(oldrsp, (new_rsp)); \ | |
44216 | + (regs)->cs = __USER_CS; \ | |
44217 | + (regs)->ss = __USER_DS; \ | |
44218 | + (regs)->eflags = 0x200; \ | |
44219 | + set_fs(USER_DS); \ | |
44220 | +} while(0) | |
44221 | + | |
44222 | +#define get_debugreg(var, register) \ | |
44223 | + var = HYPERVISOR_get_debugreg(register) | |
44224 | +#define set_debugreg(value, register) do { \ | |
44225 | + if (HYPERVISOR_set_debugreg(register, value)) \ | |
44226 | + BUG(); \ | |
44227 | +} while (0) | |
44228 | + | |
44229 | +struct task_struct; | |
44230 | +struct mm_struct; | |
44231 | + | |
44232 | +/* Free all resources held by a thread. */ | |
44233 | +extern void release_thread(struct task_struct *); | |
44234 | + | |
44235 | +/* Prepare to copy thread state - unlazy all lazy status */ | |
44236 | +extern void prepare_to_copy(struct task_struct *tsk); | |
44237 | + | |
44238 | +/* | |
44239 | + * create a kernel thread without removing it from tasklists | |
44240 | + */ | |
44241 | +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); | |
44242 | + | |
44243 | +/* | |
44244 | + * Return saved PC of a blocked thread. | |
44245 | + * What is this good for? it will be always the scheduler or ret_from_fork. | |
44246 | + */ | |
44247 | +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8)) | |
44248 | + | |
44249 | +extern unsigned long get_wchan(struct task_struct *p); | |
44250 | +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1) | |
44251 | +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip) | |
44252 | +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ | |
44253 | + | |
44254 | + | |
44255 | +struct microcode_header { | |
44256 | + unsigned int hdrver; | |
44257 | + unsigned int rev; | |
44258 | + unsigned int date; | |
44259 | + unsigned int sig; | |
44260 | + unsigned int cksum; | |
44261 | + unsigned int ldrver; | |
44262 | + unsigned int pf; | |
44263 | + unsigned int datasize; | |
44264 | + unsigned int totalsize; | |
44265 | + unsigned int reserved[3]; | |
44266 | +}; | |
44267 | + | |
44268 | +struct microcode { | |
44269 | + struct microcode_header hdr; | |
44270 | + unsigned int bits[0]; | |
44271 | +}; | |
44272 | + | |
44273 | +typedef struct microcode microcode_t; | |
44274 | +typedef struct microcode_header microcode_header_t; | |
44275 | + | |
44276 | +/* microcode format is extended from prescott processors */ | |
44277 | +struct extended_signature { | |
44278 | + unsigned int sig; | |
44279 | + unsigned int pf; | |
44280 | + unsigned int cksum; | |
44281 | +}; | |
44282 | + | |
44283 | +struct extended_sigtable { | |
44284 | + unsigned int count; | |
44285 | + unsigned int cksum; | |
44286 | + unsigned int reserved[3]; | |
44287 | + struct extended_signature sigs[0]; | |
44288 | +}; | |
44289 | + | |
44290 | + | |
44291 | +#define ASM_NOP1 K8_NOP1 | |
44292 | +#define ASM_NOP2 K8_NOP2 | |
44293 | +#define ASM_NOP3 K8_NOP3 | |
44294 | +#define ASM_NOP4 K8_NOP4 | |
44295 | +#define ASM_NOP5 K8_NOP5 | |
44296 | +#define ASM_NOP6 K8_NOP6 | |
44297 | +#define ASM_NOP7 K8_NOP7 | |
44298 | +#define ASM_NOP8 K8_NOP8 | |
44299 | + | |
44300 | +/* Opteron nops */ | |
44301 | +#define K8_NOP1 ".byte 0x90\n" | |
44302 | +#define K8_NOP2 ".byte 0x66,0x90\n" | |
44303 | +#define K8_NOP3 ".byte 0x66,0x66,0x90\n" | |
44304 | +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" | |
44305 | +#define K8_NOP5 K8_NOP3 K8_NOP2 | |
44306 | +#define K8_NOP6 K8_NOP3 K8_NOP3 | |
44307 | +#define K8_NOP7 K8_NOP4 K8_NOP3 | |
44308 | +#define K8_NOP8 K8_NOP4 K8_NOP4 | |
44309 | + | |
44310 | +#define ASM_NOP_MAX 8 | |
44311 | + | |
44312 | +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ | |
44313 | +static inline void rep_nop(void) | |
44314 | +{ | |
44315 | + __asm__ __volatile__("rep;nop": : :"memory"); | |
44316 | +} | |
44317 | + | |
44318 | +/* Stop speculative execution */ | |
44319 | +static inline void sync_core(void) | |
44320 | +{ | |
44321 | + int tmp; | |
44322 | + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); | |
44323 | +} | |
44324 | + | |
44325 | +#define cpu_has_fpu 1 | |
44326 | + | |
44327 | +#define ARCH_HAS_PREFETCH | |
44328 | +static inline void prefetch(void *x) | |
44329 | +{ | |
44330 | + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); | |
44331 | +} | |
44332 | + | |
44333 | +#define ARCH_HAS_PREFETCHW 1 | |
44334 | +static inline void prefetchw(void *x) | |
44335 | +{ | |
44336 | + alternative_input("prefetcht0 (%1)", | |
44337 | + "prefetchw (%1)", | |
44338 | + X86_FEATURE_3DNOW, | |
44339 | + "r" (x)); | |
44340 | +} | |
44341 | + | |
44342 | +#define ARCH_HAS_SPINLOCK_PREFETCH 1 | |
44343 | + | |
44344 | +#define spin_lock_prefetch(x) prefetchw(x) | |
44345 | + | |
44346 | +#define cpu_relax() rep_nop() | |
44347 | + | |
44348 | +/* | |
44349 | + * NSC/Cyrix CPU configuration register indexes | |
44350 | + */ | |
44351 | +#define CX86_CCR0 0xc0 | |
44352 | +#define CX86_CCR1 0xc1 | |
44353 | +#define CX86_CCR2 0xc2 | |
44354 | +#define CX86_CCR3 0xc3 | |
44355 | +#define CX86_CCR4 0xe8 | |
44356 | +#define CX86_CCR5 0xe9 | |
44357 | +#define CX86_CCR6 0xea | |
44358 | +#define CX86_CCR7 0xeb | |
44359 | +#define CX86_DIR0 0xfe | |
44360 | +#define CX86_DIR1 0xff | |
44361 | +#define CX86_ARR_BASE 0xc4 | |
44362 | +#define CX86_RCR_BASE 0xdc | |
44363 | + | |
44364 | +/* | |
44365 | + * NSC/Cyrix CPU indexed register access macros | |
44366 | + */ | |
44367 | + | |
44368 | +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) | |
44369 | + | |
44370 | +#define setCx86(reg, data) do { \ | |
44371 | + outb((reg), 0x22); \ | |
44372 | + outb((data), 0x23); \ | |
44373 | +} while (0) | |
44374 | + | |
44375 | +static inline void serialize_cpu(void) | |
44376 | +{ | |
44377 | + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); | |
44378 | +} | |
44379 | + | |
44380 | +static inline void __monitor(const void *eax, unsigned long ecx, | |
44381 | + unsigned long edx) | |
44382 | +{ | |
44383 | + /* "monitor %eax,%ecx,%edx;" */ | |
44384 | + asm volatile( | |
44385 | + ".byte 0x0f,0x01,0xc8;" | |
44386 | + : :"a" (eax), "c" (ecx), "d"(edx)); | |
44387 | +} | |
44388 | + | |
44389 | +static inline void __mwait(unsigned long eax, unsigned long ecx) | |
44390 | +{ | |
44391 | + /* "mwait %eax,%ecx;" */ | |
44392 | + asm volatile( | |
44393 | + ".byte 0x0f,0x01,0xc9;" | |
44394 | + : :"a" (eax), "c" (ecx)); | |
44395 | +} | |
44396 | + | |
44397 | +#define stack_current() \ | |
44398 | +({ \ | |
44399 | + struct thread_info *ti; \ | |
44400 | + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ | |
44401 | + ti->task; \ | |
44402 | +}) | |
44403 | + | |
44404 | +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) | |
44405 | + | |
44406 | +extern unsigned long boot_option_idle_override; | |
44407 | +/* Boot loader type from the setup header */ | |
44408 | +extern int bootloader_type; | |
44409 | + | |
44410 | +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 | |
44411 | + | |
44412 | +#endif /* __ASM_X86_64_PROCESSOR_H */ | |
44413 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h | |
44414 | =================================================================== | |
44415 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
44416 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200 | |
44417 | @@ -0,0 +1,150 @@ | |
44418 | +#ifndef __ASM_SMP_H | |
44419 | +#define __ASM_SMP_H | |
44420 | + | |
44421 | +/* | |
44422 | + * We need the APIC definitions automatically as part of 'smp.h' | |
44423 | + */ | |
44424 | +#ifndef __ASSEMBLY__ | |
44425 | +#include <linux/threads.h> | |
44426 | +#include <linux/cpumask.h> | |
44427 | +#include <linux/bitops.h> | |
44428 | +extern int disable_apic; | |
44429 | +#endif | |
44430 | + | |
44431 | +#ifdef CONFIG_X86_LOCAL_APIC | |
44432 | +#ifndef __ASSEMBLY__ | |
44433 | +#include <asm/fixmap.h> | |
44434 | +#include <asm/mpspec.h> | |
44435 | +#ifdef CONFIG_X86_IO_APIC | |
44436 | +#include <asm/io_apic.h> | |
44437 | +#endif | |
44438 | +#include <asm/apic.h> | |
44439 | +#include <asm/thread_info.h> | |
44440 | +#endif | |
44441 | +#endif | |
44442 | + | |
44443 | +#ifdef CONFIG_SMP | |
44444 | +#ifndef ASSEMBLY | |
44445 | + | |
44446 | +#include <asm/pda.h> | |
44447 | + | |
44448 | +struct pt_regs; | |
44449 | + | |
44450 | +extern cpumask_t cpu_present_mask; | |
44451 | +extern cpumask_t cpu_possible_map; | |
44452 | +extern cpumask_t cpu_online_map; | |
44453 | +extern cpumask_t cpu_initialized; | |
44454 | + | |
44455 | +/* | |
44456 | + * Private routines/data | |
44457 | + */ | |
44458 | + | |
44459 | +extern void smp_alloc_memory(void); | |
44460 | +extern volatile unsigned long smp_invalidate_needed; | |
44461 | +extern int pic_mode; | |
44462 | +extern void lock_ipi_call_lock(void); | |
44463 | +extern void unlock_ipi_call_lock(void); | |
44464 | +extern int smp_num_siblings; | |
44465 | +extern void smp_send_reschedule(int cpu); | |
44466 | +void smp_stop_cpu(void); | |
44467 | +extern int smp_call_function_single(int cpuid, void (*func) (void *info), | |
44468 | + void *info, int retry, int wait); | |
44469 | + | |
44470 | +extern cpumask_t cpu_sibling_map[NR_CPUS]; | |
44471 | +extern cpumask_t cpu_core_map[NR_CPUS]; | |
44472 | +extern u8 cpu_llc_id[NR_CPUS]; | |
44473 | + | |
44474 | +#define SMP_TRAMPOLINE_BASE 0x6000 | |
44475 | + | |
44476 | +/* | |
44477 | + * On x86 all CPUs are mapped 1:1 to the APIC space. | |
44478 | + * This simplifies scheduling and IPI sending and | |
44479 | + * compresses data structures. | |
44480 | + */ | |
44481 | + | |
44482 | +static inline int num_booting_cpus(void) | |
44483 | +{ | |
44484 | + return cpus_weight(cpu_possible_map); | |
44485 | +} | |
44486 | + | |
44487 | +#define raw_smp_processor_id() read_pda(cpunumber) | |
44488 | + | |
44489 | +#ifdef CONFIG_X86_LOCAL_APIC | |
44490 | +static inline int hard_smp_processor_id(void) | |
44491 | +{ | |
44492 | + /* we don't want to mark this access volatile - bad code generation */ | |
44493 | + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); | |
44494 | +} | |
44495 | +#endif | |
44496 | + | |
44497 | +extern int safe_smp_processor_id(void); | |
44498 | +extern int __cpu_disable(void); | |
44499 | +extern void __cpu_die(unsigned int cpu); | |
44500 | +extern void prefill_possible_map(void); | |
44501 | +extern unsigned num_processors; | |
44502 | +extern unsigned disabled_cpus; | |
44503 | + | |
44504 | +#endif /* !ASSEMBLY */ | |
44505 | + | |
44506 | +#define NO_PROC_ID 0xFF /* No processor magic marker */ | |
44507 | + | |
44508 | +#endif | |
44509 | + | |
44510 | +#ifndef ASSEMBLY | |
44511 | +/* | |
44512 | + * Some lowlevel functions might want to know about | |
44513 | + * the real APIC ID <-> CPU # mapping. | |
44514 | + */ | |
44515 | +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ | |
44516 | +extern u8 x86_cpu_to_log_apicid[NR_CPUS]; | |
44517 | +extern u8 bios_cpu_apicid[]; | |
44518 | + | |
44519 | +#ifdef CONFIG_X86_LOCAL_APIC | |
44520 | +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) | |
44521 | +{ | |
44522 | + return cpus_addr(cpumask)[0]; | |
44523 | +} | |
44524 | + | |
44525 | +static inline int cpu_present_to_apicid(int mps_cpu) | |
44526 | +{ | |
44527 | + if (mps_cpu < NR_CPUS) | |
44528 | + return (int)bios_cpu_apicid[mps_cpu]; | |
44529 | + else | |
44530 | + return BAD_APICID; | |
44531 | +} | |
44532 | +#endif | |
44533 | + | |
44534 | +#endif /* !ASSEMBLY */ | |
44535 | + | |
44536 | +#ifndef CONFIG_SMP | |
44537 | +#define stack_smp_processor_id() 0 | |
44538 | +#define safe_smp_processor_id() 0 | |
44539 | +#define cpu_logical_map(x) (x) | |
44540 | +#else | |
44541 | +#include <asm/thread_info.h> | |
44542 | +#define stack_smp_processor_id() \ | |
44543 | +({ \ | |
44544 | + struct thread_info *ti; \ | |
44545 | + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ | |
44546 | + ti->cpu; \ | |
44547 | +}) | |
44548 | +#endif | |
44549 | + | |
44550 | +#ifndef __ASSEMBLY__ | |
44551 | +#ifdef CONFIG_X86_LOCAL_APIC | |
44552 | +static __inline int logical_smp_processor_id(void) | |
44553 | +{ | |
44554 | + /* we don't want to mark this access volatile - bad code generation */ | |
44555 | + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); | |
44556 | +} | |
44557 | +#endif | |
44558 | +#endif | |
44559 | + | |
44560 | +#ifdef CONFIG_SMP | |
44561 | +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] | |
44562 | +#else | |
44563 | +#define cpu_physical_id(cpu) boot_cpu_id | |
44564 | +#endif | |
44565 | + | |
44566 | +#endif | |
44567 | + | |
44568 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h | |
44569 | =================================================================== | |
44570 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
44571 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100 | |
44572 | @@ -0,0 +1,256 @@ | |
44573 | +#ifndef __ASM_SYSTEM_H | |
44574 | +#define __ASM_SYSTEM_H | |
44575 | + | |
44576 | +#include <linux/kernel.h> | |
44577 | +#include <asm/segment.h> | |
44578 | +#include <asm/alternative.h> | |
44579 | + | |
44580 | +#include <asm/synch_bitops.h> | |
44581 | +#include <asm/hypervisor.h> | |
44582 | +#include <xen/interface/arch-x86_64.h> | |
44583 | + | |
44584 | +#ifdef __KERNEL__ | |
44585 | + | |
44586 | +#define __STR(x) #x | |
44587 | +#define STR(x) __STR(x) | |
44588 | + | |
44589 | +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" | |
44590 | +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" | |
44591 | + | |
44592 | +/* frame pointer must be last for get_wchan */ | |
44593 | +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" | |
44594 | +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t" | |
44595 | + | |
44596 | +#define __EXTRA_CLOBBER \ | |
44597 | + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" | |
44598 | + | |
44599 | +#define switch_to(prev,next,last) \ | |
44600 | + asm volatile(SAVE_CONTEXT \ | |
44601 | + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ | |
44602 | + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ | |
44603 | + "call __switch_to\n\t" \ | |
44604 | + ".globl thread_return\n" \ | |
44605 | + "thread_return:\n\t" \ | |
44606 | + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ | |
44607 | + "movq %P[thread_info](%%rsi),%%r8\n\t" \ | |
44608 | + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ | |
44609 | + "movq %%rax,%%rdi\n\t" \ | |
44610 | + "jc ret_from_fork\n\t" \ | |
44611 | + RESTORE_CONTEXT \ | |
44612 | + : "=a" (last) \ | |
44613 | + : [next] "S" (next), [prev] "D" (prev), \ | |
44614 | + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ | |
44615 | + [ti_flags] "i" (offsetof(struct thread_info, flags)),\ | |
44616 | + [tif_fork] "i" (TIF_FORK), \ | |
44617 | + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ | |
44618 | + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ | |
44619 | + : "memory", "cc" __EXTRA_CLOBBER) | |
44620 | + | |
44621 | +extern void load_gs_index(unsigned); | |
44622 | + | |
44623 | +/* | |
44624 | + * Load a segment. Fall back on loading the zero | |
44625 | + * segment if something goes wrong.. | |
44626 | + */ | |
44627 | +#define loadsegment(seg,value) \ | |
44628 | + asm volatile("\n" \ | |
44629 | + "1:\t" \ | |
44630 | + "movl %k0,%%" #seg "\n" \ | |
44631 | + "2:\n" \ | |
44632 | + ".section .fixup,\"ax\"\n" \ | |
44633 | + "3:\t" \ | |
44634 | + "movl %1,%%" #seg "\n\t" \ | |
44635 | + "jmp 2b\n" \ | |
44636 | + ".previous\n" \ | |
44637 | + ".section __ex_table,\"a\"\n\t" \ | |
44638 | + ".align 8\n\t" \ | |
44639 | + ".quad 1b,3b\n" \ | |
44640 | + ".previous" \ | |
44641 | + : :"r" (value), "r" (0)) | |
44642 | + | |
44643 | +/* | |
44644 | + * Clear and set 'TS' bit respectively | |
44645 | + */ | |
44646 | +#define clts() (HYPERVISOR_fpu_taskswitch(0)) | |
44647 | + | |
44648 | +static inline unsigned long read_cr0(void) | |
44649 | +{ | |
44650 | + unsigned long cr0; | |
44651 | + asm volatile("movq %%cr0,%0" : "=r" (cr0)); | |
44652 | + return cr0; | |
44653 | +} | |
44654 | + | |
44655 | +static inline void write_cr0(unsigned long val) | |
44656 | +{ | |
44657 | + asm volatile("movq %0,%%cr0" :: "r" (val)); | |
44658 | +} | |
44659 | + | |
44660 | +#define read_cr3() ({ \ | |
44661 | + unsigned long __dummy; \ | |
44662 | + asm("movq %%cr3,%0" : "=r" (__dummy)); \ | |
44663 | + machine_to_phys(__dummy); \ | |
44664 | +}) | |
44665 | + | |
44666 | +static inline unsigned long read_cr4(void) | |
44667 | +{ | |
44668 | + unsigned long cr4; | |
44669 | + asm("movq %%cr4,%0" : "=r" (cr4)); | |
44670 | + return cr4; | |
44671 | +} | |
44672 | + | |
44673 | +static inline void write_cr4(unsigned long val) | |
44674 | +{ | |
44675 | + asm volatile("movq %0,%%cr4" :: "r" (val)); | |
44676 | +} | |
44677 | + | |
44678 | +#define stts() (HYPERVISOR_fpu_taskswitch(1)) | |
44679 | + | |
44680 | +#define wbinvd() \ | |
44681 | + __asm__ __volatile__ ("wbinvd": : :"memory"); | |
44682 | + | |
44683 | +/* | |
44684 | + * On SMP systems, when the scheduler does migration-cost autodetection, | |
44685 | + * it needs a way to flush as much of the CPU's caches as possible. | |
44686 | + */ | |
44687 | +static inline void sched_cacheflush(void) | |
44688 | +{ | |
44689 | + wbinvd(); | |
44690 | +} | |
44691 | + | |
44692 | +#endif /* __KERNEL__ */ | |
44693 | + | |
44694 | +#define nop() __asm__ __volatile__ ("nop") | |
44695 | + | |
44696 | +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) | |
44697 | + | |
44698 | +#define tas(ptr) (xchg((ptr),1)) | |
44699 | + | |
44700 | +#define __xg(x) ((volatile long *)(x)) | |
44701 | + | |
44702 | +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) | |
44703 | +{ | |
44704 | + *ptr = val; | |
44705 | +} | |
44706 | + | |
44707 | +#define _set_64bit set_64bit | |
44708 | + | |
44709 | +/* | |
44710 | + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway | |
44711 | + * Note 2: xchg has side effect, so that attribute volatile is necessary, | |
44712 | + * but generally the primitive is invalid, *ptr is output argument. --ANK | |
44713 | + */ | |
44714 | +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) | |
44715 | +{ | |
44716 | + switch (size) { | |
44717 | + case 1: | |
44718 | + __asm__ __volatile__("xchgb %b0,%1" | |
44719 | + :"=q" (x) | |
44720 | + :"m" (*__xg(ptr)), "0" (x) | |
44721 | + :"memory"); | |
44722 | + break; | |
44723 | + case 2: | |
44724 | + __asm__ __volatile__("xchgw %w0,%1" | |
44725 | + :"=r" (x) | |
44726 | + :"m" (*__xg(ptr)), "0" (x) | |
44727 | + :"memory"); | |
44728 | + break; | |
44729 | + case 4: | |
44730 | + __asm__ __volatile__("xchgl %k0,%1" | |
44731 | + :"=r" (x) | |
44732 | + :"m" (*__xg(ptr)), "0" (x) | |
44733 | + :"memory"); | |
44734 | + break; | |
44735 | + case 8: | |
44736 | + __asm__ __volatile__("xchgq %0,%1" | |
44737 | + :"=r" (x) | |
44738 | + :"m" (*__xg(ptr)), "0" (x) | |
44739 | + :"memory"); | |
44740 | + break; | |
44741 | + } | |
44742 | + return x; | |
44743 | +} | |
44744 | + | |
44745 | +/* | |
44746 | + * Atomic compare and exchange. Compare OLD with MEM, if identical, | |
44747 | + * store NEW in MEM. Return the initial value in MEM. Success is | |
44748 | + * indicated by comparing RETURN with OLD. | |
44749 | + */ | |
44750 | + | |
44751 | +#define __HAVE_ARCH_CMPXCHG 1 | |
44752 | + | |
44753 | +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, | |
44754 | + unsigned long new, int size) | |
44755 | +{ | |
44756 | + unsigned long prev; | |
44757 | + switch (size) { | |
44758 | + case 1: | |
44759 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" | |
44760 | + : "=a"(prev) | |
44761 | + : "q"(new), "m"(*__xg(ptr)), "0"(old) | |
44762 | + : "memory"); | |
44763 | + return prev; | |
44764 | + case 2: | |
44765 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" | |
44766 | + : "=a"(prev) | |
44767 | + : "r"(new), "m"(*__xg(ptr)), "0"(old) | |
44768 | + : "memory"); | |
44769 | + return prev; | |
44770 | + case 4: | |
44771 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" | |
44772 | + : "=a"(prev) | |
44773 | + : "r"(new), "m"(*__xg(ptr)), "0"(old) | |
44774 | + : "memory"); | |
44775 | + return prev; | |
44776 | + case 8: | |
44777 | + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" | |
44778 | + : "=a"(prev) | |
44779 | + : "r"(new), "m"(*__xg(ptr)), "0"(old) | |
44780 | + : "memory"); | |
44781 | + return prev; | |
44782 | + } | |
44783 | + return old; | |
44784 | +} | |
44785 | + | |
44786 | +#define cmpxchg(ptr,o,n)\ | |
44787 | + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ | |
44788 | + (unsigned long)(n),sizeof(*(ptr)))) | |
44789 | + | |
44790 | +#ifdef CONFIG_SMP | |
44791 | +#define smp_mb() mb() | |
44792 | +#define smp_rmb() rmb() | |
44793 | +#define smp_wmb() wmb() | |
44794 | +#define smp_read_barrier_depends() do {} while(0) | |
44795 | +#else | |
44796 | +#define smp_mb() barrier() | |
44797 | +#define smp_rmb() barrier() | |
44798 | +#define smp_wmb() barrier() | |
44799 | +#define smp_read_barrier_depends() do {} while(0) | |
44800 | +#endif | |
44801 | + | |
44802 | + | |
44803 | +/* | |
44804 | + * Force strict CPU ordering. | |
44805 | + * And yes, this is required on UP too when we're talking | |
44806 | + * to devices. | |
44807 | + */ | |
44808 | +#define mb() asm volatile("mfence":::"memory") | |
44809 | +#define rmb() asm volatile("lfence":::"memory") | |
44810 | + | |
44811 | +#ifdef CONFIG_UNORDERED_IO | |
44812 | +#define wmb() asm volatile("sfence" ::: "memory") | |
44813 | +#else | |
44814 | +#define wmb() asm volatile("" ::: "memory") | |
44815 | +#endif | |
44816 | +#define read_barrier_depends() do {} while(0) | |
44817 | +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) | |
44818 | + | |
44819 | +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) | |
44820 | + | |
44821 | +#include <linux/irqflags.h> | |
44822 | + | |
44823 | +void cpu_idle_wait(void); | |
44824 | + | |
44825 | +extern unsigned long arch_align_stack(unsigned long sp); | |
44826 | +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); | |
44827 | + | |
44828 | +#endif | |
44829 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h | |
44830 | =================================================================== | |
44831 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
44832 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100 | |
44833 | @@ -0,0 +1,103 @@ | |
44834 | +#ifndef _X8664_TLBFLUSH_H | |
44835 | +#define _X8664_TLBFLUSH_H | |
44836 | + | |
44837 | +#include <linux/mm.h> | |
44838 | +#include <asm/processor.h> | |
44839 | + | |
44840 | +#define __flush_tlb() xen_tlb_flush() | |
44841 | + | |
44842 | +/* | |
44843 | + * Global pages have to be flushed a bit differently. Not a real | |
44844 | + * performance problem because this does not happen often. | |
44845 | + */ | |
44846 | +#define __flush_tlb_global() xen_tlb_flush() | |
44847 | + | |
44848 | + | |
44849 | +extern unsigned long pgkern_mask; | |
44850 | + | |
44851 | +#define __flush_tlb_all() __flush_tlb_global() | |
44852 | + | |
44853 | +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) | |
44854 | + | |
44855 | + | |
44856 | +/* | |
44857 | + * TLB flushing: | |
44858 | + * | |
44859 | + * - flush_tlb() flushes the current mm struct TLBs | |
44860 | + * - flush_tlb_all() flushes all processes TLBs | |
44861 | + * - flush_tlb_mm(mm) flushes the specified mm context TLB's | |
44862 | + * - flush_tlb_page(vma, vmaddr) flushes one page | |
44863 | + * - flush_tlb_range(vma, start, end) flushes a range of pages | |
44864 | + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | |
44865 | + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables | |
44866 | + * | |
44867 | + * x86-64 can only flush individual pages or full VMs. For a range flush | |
44868 | + * we always do the full VM. Might be worth trying if for a small | |
44869 | + * range a few INVLPGs in a row are a win. | |
44870 | + */ | |
44871 | + | |
44872 | +#ifndef CONFIG_SMP | |
44873 | + | |
44874 | +#define flush_tlb() __flush_tlb() | |
44875 | +#define flush_tlb_all() __flush_tlb_all() | |
44876 | +#define local_flush_tlb() __flush_tlb() | |
44877 | + | |
44878 | +static inline void flush_tlb_mm(struct mm_struct *mm) | |
44879 | +{ | |
44880 | + if (mm == current->active_mm) | |
44881 | + __flush_tlb(); | |
44882 | +} | |
44883 | + | |
44884 | +static inline void flush_tlb_page(struct vm_area_struct *vma, | |
44885 | + unsigned long addr) | |
44886 | +{ | |
44887 | + if (vma->vm_mm == current->active_mm) | |
44888 | + __flush_tlb_one(addr); | |
44889 | +} | |
44890 | + | |
44891 | +static inline void flush_tlb_range(struct vm_area_struct *vma, | |
44892 | + unsigned long start, unsigned long end) | |
44893 | +{ | |
44894 | + if (vma->vm_mm == current->active_mm) | |
44895 | + __flush_tlb(); | |
44896 | +} | |
44897 | + | |
44898 | +#else | |
44899 | + | |
44900 | +#include <asm/smp.h> | |
44901 | + | |
44902 | +#define local_flush_tlb() \ | |
44903 | + __flush_tlb() | |
44904 | + | |
44905 | +#define flush_tlb_all xen_tlb_flush_all | |
44906 | +#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask) | |
44907 | +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) | |
44908 | +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) | |
44909 | + | |
44910 | +#define flush_tlb() flush_tlb_current_task() | |
44911 | + | |
44912 | +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) | |
44913 | +{ | |
44914 | + flush_tlb_mm(vma->vm_mm); | |
44915 | +} | |
44916 | + | |
44917 | +#define TLBSTATE_OK 1 | |
44918 | +#define TLBSTATE_LAZY 2 | |
44919 | + | |
44920 | +/* Roughly an IPI every 20MB with 4k pages for freeing page table | |
44921 | + ranges. Cost is about 42k of memory for each CPU. */ | |
44922 | +#define ARCH_FREE_PTE_NR 5350 | |
44923 | + | |
44924 | +#endif | |
44925 | + | |
44926 | +#define flush_tlb_kernel_range(start, end) flush_tlb_all() | |
44927 | + | |
44928 | +static inline void flush_tlb_pgtables(struct mm_struct *mm, | |
44929 | + unsigned long start, unsigned long end) | |
44930 | +{ | |
44931 | + /* x86_64 does not keep any page table caches in a software TLB. | |
44932 | + The CPUs do in their hardware TLBs, but they are handled | |
44933 | + by the normal TLB flushing algorithms. */ | |
44934 | +} | |
44935 | + | |
44936 | +#endif /* _X8664_TLBFLUSH_H */ | |
44937 | Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h | |
44938 | =================================================================== | |
44939 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
44940 | +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200 | |
44941 | @@ -0,0 +1,328 @@ | |
44942 | +/* | |
44943 | + * x86-64 changes / gcc fixes from Andi Kleen. | |
44944 | + * Copyright 2002 Andi Kleen, SuSE Labs. | |
44945 | + * | |
44946 | + * This hasn't been optimized for the hammer yet, but there are likely | |
44947 | + * no advantages to be gotten from x86-64 here anyways. | |
44948 | + */ | |
44949 | + | |
44950 | +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; | |
44951 | + | |
44952 | +/* Doesn't use gcc to save the XMM registers, because there is no easy way to | |
44953 | + tell it to do a clts before the register saving. */ | |
44954 | +#define XMMS_SAVE do { \ | |
44955 | + preempt_disable(); \ | |
44956 | + if (!(current_thread_info()->status & TS_USEDFPU)) \ | |
44957 | + clts(); \ | |
44958 | + __asm__ __volatile__ ( \ | |
44959 | + "movups %%xmm0,(%1) ;\n\t" \ | |
44960 | + "movups %%xmm1,0x10(%1) ;\n\t" \ | |
44961 | + "movups %%xmm2,0x20(%1) ;\n\t" \ | |
44962 | + "movups %%xmm3,0x30(%1) ;\n\t" \ | |
44963 | + : "=&r" (cr0) \ | |
44964 | + : "r" (xmm_save) \ | |
44965 | + : "memory"); \ | |
44966 | +} while(0) | |
44967 | + | |
44968 | +#define XMMS_RESTORE do { \ | |
44969 | + asm volatile ( \ | |
44970 | + "sfence ;\n\t" \ | |
44971 | + "movups (%1),%%xmm0 ;\n\t" \ | |
44972 | + "movups 0x10(%1),%%xmm1 ;\n\t" \ | |
44973 | + "movups 0x20(%1),%%xmm2 ;\n\t" \ | |
44974 | + "movups 0x30(%1),%%xmm3 ;\n\t" \ | |
44975 | + : \ | |
44976 | + : "r" (cr0), "r" (xmm_save) \ | |
44977 | + : "memory"); \ | |
44978 | + if (!(current_thread_info()->status & TS_USEDFPU)) \ | |
44979 | + stts(); \ | |
44980 | + preempt_enable(); \ | |
44981 | +} while(0) | |
44982 | + | |
44983 | +#define OFFS(x) "16*("#x")" | |
44984 | +#define PF_OFFS(x) "256+16*("#x")" | |
44985 | +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" | |
44986 | +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" | |
44987 | +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" | |
44988 | +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" | |
44989 | +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" | |
44990 | +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" | |
44991 | +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" | |
44992 | +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" | |
44993 | +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" | |
44994 | +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" | |
44995 | +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" | |
44996 | +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" | |
44997 | +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" | |
44998 | + | |
44999 | + | |
45000 | +static void | |
45001 | +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | |
45002 | +{ | |
45003 | + unsigned int lines = bytes >> 8; | |
45004 | + unsigned long cr0; | |
45005 | + xmm_store_t xmm_save[4]; | |
45006 | + | |
45007 | + XMMS_SAVE; | |
45008 | + | |
45009 | + asm volatile ( | |
45010 | +#undef BLOCK | |
45011 | +#define BLOCK(i) \ | |
45012 | + LD(i,0) \ | |
45013 | + LD(i+1,1) \ | |
45014 | + PF1(i) \ | |
45015 | + PF1(i+2) \ | |
45016 | + LD(i+2,2) \ | |
45017 | + LD(i+3,3) \ | |
45018 | + PF0(i+4) \ | |
45019 | + PF0(i+6) \ | |
45020 | + XO1(i,0) \ | |
45021 | + XO1(i+1,1) \ | |
45022 | + XO1(i+2,2) \ | |
45023 | + XO1(i+3,3) \ | |
45024 | + ST(i,0) \ | |
45025 | + ST(i+1,1) \ | |
45026 | + ST(i+2,2) \ | |
45027 | + ST(i+3,3) \ | |
45028 | + | |
45029 | + | |
45030 | + PF0(0) | |
45031 | + PF0(2) | |
45032 | + | |
45033 | + " .align 32 ;\n" | |
45034 | + " 1: ;\n" | |
45035 | + | |
45036 | + BLOCK(0) | |
45037 | + BLOCK(4) | |
45038 | + BLOCK(8) | |
45039 | + BLOCK(12) | |
45040 | + | |
45041 | + " addq %[inc], %[p1] ;\n" | |
45042 | + " addq %[inc], %[p2] ;\n" | |
45043 | + " decl %[cnt] ; jnz 1b" | |
45044 | + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) | |
45045 | + : [inc] "r" (256UL) | |
45046 | + : "memory"); | |
45047 | + | |
45048 | + XMMS_RESTORE; | |
45049 | +} | |
45050 | + | |
45051 | +static void | |
45052 | +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
45053 | + unsigned long *p3) | |
45054 | +{ | |
45055 | + unsigned int lines = bytes >> 8; | |
45056 | + xmm_store_t xmm_save[4]; | |
45057 | + unsigned long cr0; | |
45058 | + | |
45059 | + XMMS_SAVE; | |
45060 | + | |
45061 | + __asm__ __volatile__ ( | |
45062 | +#undef BLOCK | |
45063 | +#define BLOCK(i) \ | |
45064 | + PF1(i) \ | |
45065 | + PF1(i+2) \ | |
45066 | + LD(i,0) \ | |
45067 | + LD(i+1,1) \ | |
45068 | + LD(i+2,2) \ | |
45069 | + LD(i+3,3) \ | |
45070 | + PF2(i) \ | |
45071 | + PF2(i+2) \ | |
45072 | + PF0(i+4) \ | |
45073 | + PF0(i+6) \ | |
45074 | + XO1(i,0) \ | |
45075 | + XO1(i+1,1) \ | |
45076 | + XO1(i+2,2) \ | |
45077 | + XO1(i+3,3) \ | |
45078 | + XO2(i,0) \ | |
45079 | + XO2(i+1,1) \ | |
45080 | + XO2(i+2,2) \ | |
45081 | + XO2(i+3,3) \ | |
45082 | + ST(i,0) \ | |
45083 | + ST(i+1,1) \ | |
45084 | + ST(i+2,2) \ | |
45085 | + ST(i+3,3) \ | |
45086 | + | |
45087 | + | |
45088 | + PF0(0) | |
45089 | + PF0(2) | |
45090 | + | |
45091 | + " .align 32 ;\n" | |
45092 | + " 1: ;\n" | |
45093 | + | |
45094 | + BLOCK(0) | |
45095 | + BLOCK(4) | |
45096 | + BLOCK(8) | |
45097 | + BLOCK(12) | |
45098 | + | |
45099 | + " addq %[inc], %[p1] ;\n" | |
45100 | + " addq %[inc], %[p2] ;\n" | |
45101 | + " addq %[inc], %[p3] ;\n" | |
45102 | + " decl %[cnt] ; jnz 1b" | |
45103 | + : [cnt] "+r" (lines), | |
45104 | + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | |
45105 | + : [inc] "r" (256UL) | |
45106 | + : "memory"); | |
45107 | + XMMS_RESTORE; | |
45108 | +} | |
45109 | + | |
45110 | +static void | |
45111 | +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
45112 | + unsigned long *p3, unsigned long *p4) | |
45113 | +{ | |
45114 | + unsigned int lines = bytes >> 8; | |
45115 | + xmm_store_t xmm_save[4]; | |
45116 | + unsigned long cr0; | |
45117 | + | |
45118 | + XMMS_SAVE; | |
45119 | + | |
45120 | + __asm__ __volatile__ ( | |
45121 | +#undef BLOCK | |
45122 | +#define BLOCK(i) \ | |
45123 | + PF1(i) \ | |
45124 | + PF1(i+2) \ | |
45125 | + LD(i,0) \ | |
45126 | + LD(i+1,1) \ | |
45127 | + LD(i+2,2) \ | |
45128 | + LD(i+3,3) \ | |
45129 | + PF2(i) \ | |
45130 | + PF2(i+2) \ | |
45131 | + XO1(i,0) \ | |
45132 | + XO1(i+1,1) \ | |
45133 | + XO1(i+2,2) \ | |
45134 | + XO1(i+3,3) \ | |
45135 | + PF3(i) \ | |
45136 | + PF3(i+2) \ | |
45137 | + PF0(i+4) \ | |
45138 | + PF0(i+6) \ | |
45139 | + XO2(i,0) \ | |
45140 | + XO2(i+1,1) \ | |
45141 | + XO2(i+2,2) \ | |
45142 | + XO2(i+3,3) \ | |
45143 | + XO3(i,0) \ | |
45144 | + XO3(i+1,1) \ | |
45145 | + XO3(i+2,2) \ | |
45146 | + XO3(i+3,3) \ | |
45147 | + ST(i,0) \ | |
45148 | + ST(i+1,1) \ | |
45149 | + ST(i+2,2) \ | |
45150 | + ST(i+3,3) \ | |
45151 | + | |
45152 | + | |
45153 | + PF0(0) | |
45154 | + PF0(2) | |
45155 | + | |
45156 | + " .align 32 ;\n" | |
45157 | + " 1: ;\n" | |
45158 | + | |
45159 | + BLOCK(0) | |
45160 | + BLOCK(4) | |
45161 | + BLOCK(8) | |
45162 | + BLOCK(12) | |
45163 | + | |
45164 | + " addq %[inc], %[p1] ;\n" | |
45165 | + " addq %[inc], %[p2] ;\n" | |
45166 | + " addq %[inc], %[p3] ;\n" | |
45167 | + " addq %[inc], %[p4] ;\n" | |
45168 | + " decl %[cnt] ; jnz 1b" | |
45169 | + : [cnt] "+c" (lines), | |
45170 | + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | |
45171 | + : [inc] "r" (256UL) | |
45172 | + : "memory" ); | |
45173 | + | |
45174 | + XMMS_RESTORE; | |
45175 | +} | |
45176 | + | |
45177 | +static void | |
45178 | +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | |
45179 | + unsigned long *p3, unsigned long *p4, unsigned long *p5) | |
45180 | +{ | |
45181 | + unsigned int lines = bytes >> 8; | |
45182 | + xmm_store_t xmm_save[4]; | |
45183 | + unsigned long cr0; | |
45184 | + | |
45185 | + XMMS_SAVE; | |
45186 | + | |
45187 | + __asm__ __volatile__ ( | |
45188 | +#undef BLOCK | |
45189 | +#define BLOCK(i) \ | |
45190 | + PF1(i) \ | |
45191 | + PF1(i+2) \ | |
45192 | + LD(i,0) \ | |
45193 | + LD(i+1,1) \ | |
45194 | + LD(i+2,2) \ | |
45195 | + LD(i+3,3) \ | |
45196 | + PF2(i) \ | |
45197 | + PF2(i+2) \ | |
45198 | + XO1(i,0) \ | |
45199 | + XO1(i+1,1) \ | |
45200 | + XO1(i+2,2) \ | |
45201 | + XO1(i+3,3) \ | |
45202 | + PF3(i) \ | |
45203 | + PF3(i+2) \ | |
45204 | + XO2(i,0) \ | |
45205 | + XO2(i+1,1) \ | |
45206 | + XO2(i+2,2) \ | |
45207 | + XO2(i+3,3) \ | |
45208 | + PF4(i) \ | |
45209 | + PF4(i+2) \ | |
45210 | + PF0(i+4) \ | |
45211 | + PF0(i+6) \ | |
45212 | + XO3(i,0) \ | |
45213 | + XO3(i+1,1) \ | |
45214 | + XO3(i+2,2) \ | |
45215 | + XO3(i+3,3) \ | |
45216 | + XO4(i,0) \ | |
45217 | + XO4(i+1,1) \ | |
45218 | + XO4(i+2,2) \ | |
45219 | + XO4(i+3,3) \ | |
45220 | + ST(i,0) \ | |
45221 | + ST(i+1,1) \ | |
45222 | + ST(i+2,2) \ | |
45223 | + ST(i+3,3) \ | |
45224 | + | |
45225 | + | |
45226 | + PF0(0) | |
45227 | + PF0(2) | |
45228 | + | |
45229 | + " .align 32 ;\n" | |
45230 | + " 1: ;\n" | |
45231 | + | |
45232 | + BLOCK(0) | |
45233 | + BLOCK(4) | |
45234 | + BLOCK(8) | |
45235 | + BLOCK(12) | |
45236 | + | |
45237 | + " addq %[inc], %[p1] ;\n" | |
45238 | + " addq %[inc], %[p2] ;\n" | |
45239 | + " addq %[inc], %[p3] ;\n" | |
45240 | + " addq %[inc], %[p4] ;\n" | |
45241 | + " addq %[inc], %[p5] ;\n" | |
45242 | + " decl %[cnt] ; jnz 1b" | |
45243 | + : [cnt] "+c" (lines), | |
45244 | + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), | |
45245 | + [p5] "+r" (p5) | |
45246 | + : [inc] "r" (256UL) | |
45247 | + : "memory"); | |
45248 | + | |
45249 | + XMMS_RESTORE; | |
45250 | +} | |
45251 | + | |
45252 | +static struct xor_block_template xor_block_sse = { | |
45253 | + .name = "generic_sse", | |
45254 | + .do_2 = xor_sse_2, | |
45255 | + .do_3 = xor_sse_3, | |
45256 | + .do_4 = xor_sse_4, | |
45257 | + .do_5 = xor_sse_5, | |
45258 | +}; | |
45259 | + | |
45260 | +#undef XOR_TRY_TEMPLATES | |
45261 | +#define XOR_TRY_TEMPLATES \ | |
45262 | + do { \ | |
45263 | + xor_speed(&xor_block_sse); \ | |
45264 | + } while (0) | |
45265 | + | |
45266 | +/* We force the use of the SSE xor block because it can write around L2. | |
45267 | + We may also be able to load into the L1 only depending on how the cpu | |
45268 | + deals with a load to a line that is being prefetched. */ | |
45269 | +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) | |
45270 | Index: head-2008-11-25/include/asm-x86/mach-xen/mach_time.h | |
45271 | =================================================================== | |
45272 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45273 | +++ head-2008-11-25/include/asm-x86/mach-xen/mach_time.h 2007-06-12 13:14:13.000000000 +0200 | |
45274 | @@ -0,0 +1,111 @@ | |
45275 | +/* | |
45276 | + * include/asm-i386/mach-default/mach_time.h | |
45277 | + * | |
45278 | + * Machine specific set RTC function for generic. | |
45279 | + * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp> | |
45280 | + */ | |
45281 | +#ifndef _MACH_TIME_H | |
45282 | +#define _MACH_TIME_H | |
45283 | + | |
45284 | +#include <asm-i386/mc146818rtc.h> | |
45285 | + | |
45286 | +/* for check timing call set_rtc_mmss() 500ms */ | |
45287 | +/* used in arch/i386/time.c::do_timer_interrupt() */ | |
45288 | +#define USEC_AFTER 500000 | |
45289 | +#define USEC_BEFORE 500000 | |
45290 | + | |
45291 | +/* | |
45292 | + * In order to set the CMOS clock precisely, set_rtc_mmss has to be | |
45293 | + * called 500 ms after the second nowtime has started, because when | |
45294 | + * nowtime is written into the registers of the CMOS clock, it will | |
45295 | + * jump to the next second precisely 500 ms later. Check the Motorola | |
45296 | + * MC146818A or Dallas DS12887 data sheet for details. | |
45297 | + * | |
45298 | + * BUG: This routine does not handle hour overflow properly; it just | |
45299 | + * sets the minutes. Usually you'll only notice that after reboot! | |
45300 | + */ | |
45301 | +static inline int mach_set_rtc_mmss(unsigned long nowtime) | |
45302 | +{ | |
45303 | + int retval = 0; | |
45304 | + int real_seconds, real_minutes, cmos_minutes; | |
45305 | + unsigned char save_control, save_freq_select; | |
45306 | + | |
45307 | + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */ | |
45308 | + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); | |
45309 | + | |
45310 | + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */ | |
45311 | + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); | |
45312 | + | |
45313 | + cmos_minutes = CMOS_READ(RTC_MINUTES); | |
45314 | + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) | |
45315 | + BCD_TO_BIN(cmos_minutes); | |
45316 | + | |
45317 | + /* | |
45318 | + * since we're only adjusting minutes and seconds, | |
45319 | + * don't interfere with hour overflow. This avoids | |
45320 | + * messing with unknown time zones but requires your | |
45321 | + * RTC not to be off by more than 15 minutes | |
45322 | + */ | |
45323 | + real_seconds = nowtime % 60; | |
45324 | + real_minutes = nowtime / 60; | |
45325 | + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) | |
45326 | + real_minutes += 30; /* correct for half hour time zone */ | |
45327 | + real_minutes %= 60; | |
45328 | + | |
45329 | + if (abs(real_minutes - cmos_minutes) < 30) { | |
45330 | + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { | |
45331 | + BIN_TO_BCD(real_seconds); | |
45332 | + BIN_TO_BCD(real_minutes); | |
45333 | + } | |
45334 | + CMOS_WRITE(real_seconds,RTC_SECONDS); | |
45335 | + CMOS_WRITE(real_minutes,RTC_MINUTES); | |
45336 | + } else { | |
45337 | + printk(KERN_WARNING | |
45338 | + "set_rtc_mmss: can't update from %d to %d\n", | |
45339 | + cmos_minutes, real_minutes); | |
45340 | + retval = -1; | |
45341 | + } | |
45342 | + | |
45343 | + /* The following flags have to be released exactly in this order, | |
45344 | + * otherwise the DS12887 (popular MC146818A clone with integrated | |
45345 | + * battery and quartz) will not reset the oscillator and will not | |
45346 | + * update precisely 500 ms later. You won't find this mentioned in | |
45347 | + * the Dallas Semiconductor data sheets, but who believes data | |
45348 | + * sheets anyway ... -- Markus Kuhn | |
45349 | + */ | |
45350 | + CMOS_WRITE(save_control, RTC_CONTROL); | |
45351 | + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | |
45352 | + | |
45353 | + return retval; | |
45354 | +} | |
45355 | + | |
45356 | +static inline unsigned long mach_get_cmos_time(void) | |
45357 | +{ | |
45358 | + unsigned int year, mon, day, hour, min, sec; | |
45359 | + | |
45360 | + do { | |
45361 | + sec = CMOS_READ(RTC_SECONDS); | |
45362 | + min = CMOS_READ(RTC_MINUTES); | |
45363 | + hour = CMOS_READ(RTC_HOURS); | |
45364 | + day = CMOS_READ(RTC_DAY_OF_MONTH); | |
45365 | + mon = CMOS_READ(RTC_MONTH); | |
45366 | + year = CMOS_READ(RTC_YEAR); | |
45367 | + } while (sec != CMOS_READ(RTC_SECONDS)); | |
45368 | + | |
45369 | + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { | |
45370 | + BCD_TO_BIN(sec); | |
45371 | + BCD_TO_BIN(min); | |
45372 | + BCD_TO_BIN(hour); | |
45373 | + BCD_TO_BIN(day); | |
45374 | + BCD_TO_BIN(mon); | |
45375 | + BCD_TO_BIN(year); | |
45376 | + } | |
45377 | + | |
45378 | + year += 1900; | |
45379 | + if (year < 1970) | |
45380 | + year += 100; | |
45381 | + | |
45382 | + return mktime(year, mon, day, hour, min, sec); | |
45383 | +} | |
45384 | + | |
45385 | +#endif /* !_MACH_TIME_H */ | |
45386 | Index: head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h | |
45387 | =================================================================== | |
45388 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45389 | +++ head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h 2007-06-12 13:14:13.000000000 +0200 | |
45390 | @@ -0,0 +1,50 @@ | |
45391 | +/* | |
45392 | + * include/asm-i386/mach-default/mach_timer.h | |
45393 | + * | |
45394 | + * Machine specific calibrate_tsc() for generic. | |
45395 | + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp> | |
45396 | + */ | |
45397 | +/* ------ Calibrate the TSC ------- | |
45398 | + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). | |
45399 | + * Too much 64-bit arithmetic here to do this cleanly in C, and for | |
45400 | + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) | |
45401 | + * output busy loop as low as possible. We avoid reading the CTC registers | |
45402 | + * directly because of the awkward 8-bit access mechanism of the 82C54 | |
45403 | + * device. | |
45404 | + */ | |
45405 | +#ifndef _MACH_TIMER_H | |
45406 | +#define _MACH_TIMER_H | |
45407 | + | |
45408 | +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ | |
45409 | +#define CALIBRATE_LATCH \ | |
45410 | + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) | |
45411 | + | |
45412 | +static inline void mach_prepare_counter(void) | |
45413 | +{ | |
45414 | + /* Set the Gate high, disable speaker */ | |
45415 | + outb((inb(0x61) & ~0x02) | 0x01, 0x61); | |
45416 | + | |
45417 | + /* | |
45418 | + * Now let's take care of CTC channel 2 | |
45419 | + * | |
45420 | + * Set the Gate high, program CTC channel 2 for mode 0, | |
45421 | + * (interrupt on terminal count mode), binary count, | |
45422 | + * load 5 * LATCH count, (LSB and MSB) to begin countdown. | |
45423 | + * | |
45424 | + * Some devices need a delay here. | |
45425 | + */ | |
45426 | + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ | |
45427 | + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ | |
45428 | + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ | |
45429 | +} | |
45430 | + | |
45431 | +static inline void mach_countup(unsigned long *count_p) | |
45432 | +{ | |
45433 | + unsigned long count = 0; | |
45434 | + do { | |
45435 | + count++; | |
45436 | + } while ((inb_p(0x61) & 0x20) == 0); | |
45437 | + *count_p = count; | |
45438 | +} | |
45439 | + | |
45440 | +#endif /* !_MACH_TIMER_H */ | |
45441 | Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h | |
45442 | =================================================================== | |
45443 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45444 | +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200 | |
45445 | @@ -0,0 +1,63 @@ | |
45446 | +/** | |
45447 | + * machine_specific_* - Hooks for machine specific setup. | |
45448 | + * | |
45449 | + * Description: | |
45450 | + * This is included late in kernel/setup.c so that it can make | |
45451 | + * use of all of the static functions. | |
45452 | + **/ | |
45453 | + | |
45454 | +#include <xen/interface/callback.h> | |
45455 | + | |
45456 | +extern void hypervisor_callback(void); | |
45457 | +extern void failsafe_callback(void); | |
45458 | +extern void nmi(void); | |
45459 | + | |
45460 | +static void __init machine_specific_arch_setup(void) | |
45461 | +{ | |
45462 | + int ret; | |
45463 | + static struct callback_register __initdata event = { | |
45464 | + .type = CALLBACKTYPE_event, | |
45465 | + .address = (unsigned long) hypervisor_callback, | |
45466 | + }; | |
45467 | + static struct callback_register __initdata failsafe = { | |
45468 | + .type = CALLBACKTYPE_failsafe, | |
45469 | + .address = (unsigned long)failsafe_callback, | |
45470 | + }; | |
45471 | + static struct callback_register __initdata syscall = { | |
45472 | + .type = CALLBACKTYPE_syscall, | |
45473 | + .address = (unsigned long)system_call, | |
45474 | + }; | |
45475 | +#ifdef CONFIG_X86_LOCAL_APIC | |
45476 | + static struct callback_register __initdata nmi_cb = { | |
45477 | + .type = CALLBACKTYPE_nmi, | |
45478 | + .address = (unsigned long)nmi, | |
45479 | + }; | |
45480 | +#endif | |
45481 | + | |
45482 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); | |
45483 | + if (ret == 0) | |
45484 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); | |
45485 | + if (ret == 0) | |
45486 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); | |
45487 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
45488 | + if (ret == -ENOSYS) | |
45489 | + ret = HYPERVISOR_set_callbacks( | |
45490 | + event.address, | |
45491 | + failsafe.address, | |
45492 | + syscall.address); | |
45493 | +#endif | |
45494 | + BUG_ON(ret); | |
45495 | + | |
45496 | +#ifdef CONFIG_X86_LOCAL_APIC | |
45497 | + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); | |
45498 | +#if CONFIG_XEN_COMPAT <= 0x030002 | |
45499 | + if (ret == -ENOSYS) { | |
45500 | + static struct xennmi_callback __initdata cb = { | |
45501 | + .handler_address = (unsigned long)nmi | |
45502 | + }; | |
45503 | + | |
45504 | + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); | |
45505 | + } | |
45506 | +#endif | |
45507 | +#endif | |
45508 | +} | |
45509 | Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h | |
45510 | =================================================================== | |
45511 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45512 | +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200 | |
45513 | @@ -0,0 +1,5 @@ | |
45514 | +/* Hook to call BIOS initialisation function */ | |
45515 | + | |
45516 | +#define ARCH_SETUP machine_specific_arch_setup(); | |
45517 | + | |
45518 | +static void __init machine_specific_arch_setup(void); | |
45519 | Index: head-2008-11-25/include/xen/blkif.h | |
45520 | =================================================================== | |
45521 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45522 | +++ head-2008-11-25/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200 | |
45523 | @@ -0,0 +1,123 @@ | |
45524 | +/* | |
45525 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
45526 | + * of this software and associated documentation files (the "Software"), to | |
45527 | + * deal in the Software without restriction, including without limitation the | |
45528 | + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | |
45529 | + * sell copies of the Software, and to permit persons to whom the Software is | |
45530 | + * furnished to do so, subject to the following conditions: | |
45531 | + * | |
45532 | + * The above copyright notice and this permission notice shall be included in | |
45533 | + * all copies or substantial portions of the Software. | |
45534 | + * | |
45535 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
45536 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
45537 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
45538 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
45539 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
45540 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
45541 | + * DEALINGS IN THE SOFTWARE. | |
45542 | + */ | |
45543 | + | |
45544 | +#ifndef __XEN_BLKIF_H__ | |
45545 | +#define __XEN_BLKIF_H__ | |
45546 | + | |
45547 | +#include <xen/interface/io/ring.h> | |
45548 | +#include <xen/interface/io/blkif.h> | |
45549 | +#include <xen/interface/io/protocols.h> | |
45550 | + | |
45551 | +/* Not a real protocol. Used to generate ring structs which contain | |
45552 | + * the elements common to all protocols only. This way we get a | |
45553 | + * compiler-checkable way to use common struct elements, so we can | |
45554 | + * avoid using switch(protocol) in a number of places. */ | |
45555 | +struct blkif_common_request { | |
45556 | + char dummy; | |
45557 | +}; | |
45558 | +struct blkif_common_response { | |
45559 | + char dummy; | |
45560 | +}; | |
45561 | + | |
45562 | +/* i386 protocol version */ | |
45563 | +#pragma pack(push, 4) | |
45564 | +struct blkif_x86_32_request { | |
45565 | + uint8_t operation; /* BLKIF_OP_??? */ | |
45566 | + uint8_t nr_segments; /* number of segments */ | |
45567 | + blkif_vdev_t handle; /* only for read/write requests */ | |
45568 | + uint64_t id; /* private guest value, echoed in resp */ | |
45569 | + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ | |
45570 | + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | |
45571 | +}; | |
45572 | +struct blkif_x86_32_response { | |
45573 | + uint64_t id; /* copied from request */ | |
45574 | + uint8_t operation; /* copied from request */ | |
45575 | + int16_t status; /* BLKIF_RSP_??? */ | |
45576 | +}; | |
45577 | +typedef struct blkif_x86_32_request blkif_x86_32_request_t; | |
45578 | +typedef struct blkif_x86_32_response blkif_x86_32_response_t; | |
45579 | +#pragma pack(pop) | |
45580 | + | |
45581 | +/* x86_64 protocol version */ | |
45582 | +struct blkif_x86_64_request { | |
45583 | + uint8_t operation; /* BLKIF_OP_??? */ | |
45584 | + uint8_t nr_segments; /* number of segments */ | |
45585 | + blkif_vdev_t handle; /* only for read/write requests */ | |
45586 | + uint64_t __attribute__((__aligned__(8))) id; | |
45587 | + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ | |
45588 | + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | |
45589 | +}; | |
45590 | +struct blkif_x86_64_response { | |
45591 | + uint64_t __attribute__((__aligned__(8))) id; | |
45592 | + uint8_t operation; /* copied from request */ | |
45593 | + int16_t status; /* BLKIF_RSP_??? */ | |
45594 | +}; | |
45595 | +typedef struct blkif_x86_64_request blkif_x86_64_request_t; | |
45596 | +typedef struct blkif_x86_64_response blkif_x86_64_response_t; | |
45597 | + | |
45598 | +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); | |
45599 | +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); | |
45600 | +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); | |
45601 | + | |
45602 | +union blkif_back_rings { | |
45603 | + blkif_back_ring_t native; | |
45604 | + blkif_common_back_ring_t common; | |
45605 | + blkif_x86_32_back_ring_t x86_32; | |
45606 | + blkif_x86_64_back_ring_t x86_64; | |
45607 | +}; | |
45608 | +typedef union blkif_back_rings blkif_back_rings_t; | |
45609 | + | |
45610 | +enum blkif_protocol { | |
45611 | + BLKIF_PROTOCOL_NATIVE = 1, | |
45612 | + BLKIF_PROTOCOL_X86_32 = 2, | |
45613 | + BLKIF_PROTOCOL_X86_64 = 3, | |
45614 | +}; | |
45615 | + | |
45616 | +static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) | |
45617 | +{ | |
45618 | + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | |
45619 | + dst->operation = src->operation; | |
45620 | + dst->nr_segments = src->nr_segments; | |
45621 | + dst->handle = src->handle; | |
45622 | + dst->id = src->id; | |
45623 | + dst->sector_number = src->sector_number; | |
45624 | + barrier(); | |
45625 | + if (n > dst->nr_segments) | |
45626 | + n = dst->nr_segments; | |
45627 | + for (i = 0; i < n; i++) | |
45628 | + dst->seg[i] = src->seg[i]; | |
45629 | +} | |
45630 | + | |
45631 | +static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) | |
45632 | +{ | |
45633 | + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | |
45634 | + dst->operation = src->operation; | |
45635 | + dst->nr_segments = src->nr_segments; | |
45636 | + dst->handle = src->handle; | |
45637 | + dst->id = src->id; | |
45638 | + dst->sector_number = src->sector_number; | |
45639 | + barrier(); | |
45640 | + if (n > dst->nr_segments) | |
45641 | + n = dst->nr_segments; | |
45642 | + for (i = 0; i < n; i++) | |
45643 | + dst->seg[i] = src->seg[i]; | |
45644 | +} | |
45645 | + | |
45646 | +#endif /* __XEN_BLKIF_H__ */ | |
45647 | Index: head-2008-11-25/include/xen/compat_ioctl.h | |
45648 | =================================================================== | |
45649 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45650 | +++ head-2008-11-25/include/xen/compat_ioctl.h 2007-07-10 09:42:30.000000000 +0200 | |
45651 | @@ -0,0 +1,45 @@ | |
45652 | +/* | |
45653 | + * This program is free software; you can redistribute it and/or | |
45654 | + * modify it under the terms of the GNU General Public License as | |
45655 | + * published by the Free Software Foundation; either version 2 of the | |
45656 | + * License, or (at your option) any later version. | |
45657 | + * | |
45658 | + * This program is distributed in the hope that it will be useful, | |
45659 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
45660 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
45661 | + * GNU General Public License for more details. | |
45662 | + * | |
45663 | + * You should have received a copy of the GNU General Public License | |
45664 | + * along with this program; if not, write to the Free Software | |
45665 | + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
45666 | + * | |
45667 | + * Copyright IBM Corp. 2007 | |
45668 | + * | |
45669 | + * Authors: Jimi Xenidis <jimix@watson.ibm.com> | |
45670 | + * Hollis Blanchard <hollisb@us.ibm.com> | |
45671 | + */ | |
45672 | + | |
45673 | +#ifndef __LINUX_XEN_COMPAT_H__ | |
45674 | +#define __LINUX_XEN_COMPAT_H__ | |
45675 | + | |
45676 | +#include <linux/compat.h> | |
45677 | + | |
45678 | +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg); | |
45679 | +struct privcmd_mmap_32 { | |
45680 | + int num; | |
45681 | + domid_t dom; | |
45682 | + compat_uptr_t entry; | |
45683 | +}; | |
45684 | + | |
45685 | +struct privcmd_mmapbatch_32 { | |
45686 | + int num; /* number of pages to populate */ | |
45687 | + domid_t dom; /* target domain */ | |
45688 | + __u64 addr; /* virtual address */ | |
45689 | + compat_uptr_t arr; /* array of mfns - top nibble set on err */ | |
45690 | +}; | |
45691 | +#define IOCTL_PRIVCMD_MMAP_32 \ | |
45692 | + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32)) | |
45693 | +#define IOCTL_PRIVCMD_MMAPBATCH_32 \ | |
45694 | + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32)) | |
45695 | + | |
45696 | +#endif /* __LINUX_XEN_COMPAT_H__ */ | |
45697 | Index: head-2008-11-25/include/xen/cpu_hotplug.h | |
45698 | =================================================================== | |
45699 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45700 | +++ head-2008-11-25/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200 | |
45701 | @@ -0,0 +1,41 @@ | |
45702 | +#ifndef __XEN_CPU_HOTPLUG_H__ | |
45703 | +#define __XEN_CPU_HOTPLUG_H__ | |
45704 | + | |
45705 | +#include <linux/kernel.h> | |
45706 | +#include <linux/cpumask.h> | |
45707 | + | |
45708 | +#if defined(CONFIG_X86) && defined(CONFIG_SMP) | |
45709 | +extern cpumask_t cpu_initialized_map; | |
45710 | +#endif | |
45711 | + | |
45712 | +#if defined(CONFIG_HOTPLUG_CPU) | |
45713 | + | |
45714 | +int cpu_up_check(unsigned int cpu); | |
45715 | +void init_xenbus_allowed_cpumask(void); | |
45716 | +int smp_suspend(void); | |
45717 | +void smp_resume(void); | |
45718 | + | |
45719 | +void cpu_bringup(void); | |
45720 | + | |
45721 | +#else /* !defined(CONFIG_HOTPLUG_CPU) */ | |
45722 | + | |
45723 | +#define cpu_up_check(cpu) (0) | |
45724 | +#define init_xenbus_allowed_cpumask() ((void)0) | |
45725 | + | |
45726 | +static inline int smp_suspend(void) | |
45727 | +{ | |
45728 | + if (num_online_cpus() > 1) { | |
45729 | + printk(KERN_WARNING "Can't suspend SMP guests " | |
45730 | + "without CONFIG_HOTPLUG_CPU\n"); | |
45731 | + return -EOPNOTSUPP; | |
45732 | + } | |
45733 | + return 0; | |
45734 | +} | |
45735 | + | |
45736 | +static inline void smp_resume(void) | |
45737 | +{ | |
45738 | +} | |
45739 | + | |
45740 | +#endif /* !defined(CONFIG_HOTPLUG_CPU) */ | |
45741 | + | |
45742 | +#endif /* __XEN_CPU_HOTPLUG_H__ */ | |
45743 | Index: head-2008-11-25/include/xen/driver_util.h | |
45744 | =================================================================== | |
45745 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45746 | +++ head-2008-11-25/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200 | |
45747 | @@ -0,0 +1,14 @@ | |
45748 | + | |
45749 | +#ifndef __ASM_XEN_DRIVER_UTIL_H__ | |
45750 | +#define __ASM_XEN_DRIVER_UTIL_H__ | |
45751 | + | |
45752 | +#include <linux/vmalloc.h> | |
45753 | +#include <linux/device.h> | |
45754 | + | |
45755 | +/* Allocate/destroy a 'vmalloc' VM area. */ | |
45756 | +extern struct vm_struct *alloc_vm_area(unsigned long size); | |
45757 | +extern void free_vm_area(struct vm_struct *area); | |
45758 | + | |
45759 | +extern struct class *get_xen_class(void); | |
45760 | + | |
45761 | +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */ | |
45762 | Index: head-2008-11-25/include/xen/evtchn.h | |
45763 | =================================================================== | |
45764 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45765 | +++ head-2008-11-25/include/xen/evtchn.h 2008-09-15 13:40:15.000000000 +0200 | |
45766 | @@ -0,0 +1,160 @@ | |
45767 | +/****************************************************************************** | |
45768 | + * evtchn.h | |
45769 | + * | |
45770 | + * Communication via Xen event channels. | |
45771 | + * Also definitions for the device that demuxes notifications to userspace. | |
45772 | + * | |
45773 | + * Copyright (c) 2004-2005, K A Fraser | |
45774 | + * | |
45775 | + * This program is free software; you can redistribute it and/or | |
45776 | + * modify it under the terms of the GNU General Public License version 2 | |
45777 | + * as published by the Free Software Foundation; or, when distributed | |
45778 | + * separately from the Linux kernel or incorporated into other | |
45779 | + * software packages, subject to the following license: | |
45780 | + * | |
45781 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
45782 | + * of this source file (the "Software"), to deal in the Software without | |
45783 | + * restriction, including without limitation the rights to use, copy, modify, | |
45784 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
45785 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
45786 | + * the following conditions: | |
45787 | + * | |
45788 | + * The above copyright notice and this permission notice shall be included in | |
45789 | + * all copies or substantial portions of the Software. | |
45790 | + * | |
45791 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
45792 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
45793 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
45794 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
45795 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
45796 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
45797 | + * IN THE SOFTWARE. | |
45798 | + */ | |
45799 | + | |
45800 | +#ifndef __ASM_EVTCHN_H__ | |
45801 | +#define __ASM_EVTCHN_H__ | |
45802 | + | |
45803 | +#include <linux/interrupt.h> | |
45804 | +#include <asm/hypervisor.h> | |
45805 | +#include <asm/ptrace.h> | |
45806 | +#include <asm/synch_bitops.h> | |
45807 | +#include <xen/interface/event_channel.h> | |
45808 | +#include <linux/smp.h> | |
45809 | + | |
45810 | +/* | |
45811 | + * LOW-LEVEL DEFINITIONS | |
45812 | + */ | |
45813 | + | |
45814 | +/* | |
45815 | + * Dynamically bind an event source to an IRQ-like callback handler. | |
45816 | + * On some platforms this may not be implemented via the Linux IRQ subsystem. | |
45817 | + * The IRQ argument passed to the callback handler is the same as returned | |
45818 | + * from the bind call. It may not correspond to a Linux IRQ number. | |
45819 | + * Returns IRQ or negative errno. | |
45820 | + */ | |
45821 | +int bind_caller_port_to_irqhandler( | |
45822 | + unsigned int caller_port, | |
45823 | + irqreturn_t (*handler)(int, void *, struct pt_regs *), | |
45824 | + unsigned long irqflags, | |
45825 | + const char *devname, | |
45826 | + void *dev_id); | |
45827 | +int bind_listening_port_to_irqhandler( | |
45828 | + unsigned int remote_domain, | |
45829 | + irqreturn_t (*handler)(int, void *, struct pt_regs *), | |
45830 | + unsigned long irqflags, | |
45831 | + const char *devname, | |
45832 | + void *dev_id); | |
45833 | +int bind_interdomain_evtchn_to_irqhandler( | |
45834 | + unsigned int remote_domain, | |
45835 | + unsigned int remote_port, | |
45836 | + irqreturn_t (*handler)(int, void *, struct pt_regs *), | |
45837 | + unsigned long irqflags, | |
45838 | + const char *devname, | |
45839 | + void *dev_id); | |
45840 | +int bind_virq_to_irqhandler( | |
45841 | + unsigned int virq, | |
45842 | + unsigned int cpu, | |
45843 | + irqreturn_t (*handler)(int, void *, struct pt_regs *), | |
45844 | + unsigned long irqflags, | |
45845 | + const char *devname, | |
45846 | + void *dev_id); | |
45847 | +int bind_ipi_to_irqhandler( | |
45848 | + unsigned int ipi, | |
45849 | + unsigned int cpu, | |
45850 | + irqreturn_t (*handler)(int, void *, struct pt_regs *), | |
45851 | + unsigned long irqflags, | |
45852 | + const char *devname, | |
45853 | + void *dev_id); | |
45854 | + | |
45855 | +/* | |
45856 | + * Common unbind function for all event sources. Takes IRQ to unbind from. | |
45857 | + * Automatically closes the underlying event channel (except for bindings | |
45858 | + * made with bind_caller_port_to_irqhandler()). | |
45859 | + */ | |
45860 | +void unbind_from_irqhandler(unsigned int irq, void *dev_id); | |
45861 | + | |
45862 | +void irq_resume(void); | |
45863 | + | |
45864 | +/* Entry point for notifications into Linux subsystems. */ | |
45865 | +asmlinkage void evtchn_do_upcall(struct pt_regs *regs); | |
45866 | + | |
45867 | +/* Entry point for notifications into the userland character device. */ | |
45868 | +void evtchn_device_upcall(int port); | |
45869 | + | |
45870 | +/* Mark a PIRQ as unavailable for dynamic allocation. */ | |
45871 | +void evtchn_register_pirq(int irq); | |
45872 | +/* Map a Xen-supplied PIRQ to a dynamically allocated one. */ | |
45873 | +int evtchn_map_pirq(int irq, int xen_pirq); | |
45874 | +/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */ | |
45875 | +int evtchn_get_xen_pirq(int irq); | |
45876 | + | |
45877 | +void mask_evtchn(int port); | |
45878 | +void disable_all_local_evtchn(void); | |
45879 | +void unmask_evtchn(int port); | |
45880 | + | |
45881 | +#ifdef CONFIG_SMP | |
45882 | +void rebind_evtchn_to_cpu(int port, unsigned int cpu); | |
45883 | +#else | |
45884 | +#define rebind_evtchn_to_cpu(port, cpu) ((void)0) | |
45885 | +#endif | |
45886 | + | |
45887 | +static inline int test_and_set_evtchn_mask(int port) | |
45888 | +{ | |
45889 | + shared_info_t *s = HYPERVISOR_shared_info; | |
45890 | + return synch_test_and_set_bit(port, s->evtchn_mask); | |
45891 | +} | |
45892 | + | |
45893 | +static inline void clear_evtchn(int port) | |
45894 | +{ | |
45895 | + shared_info_t *s = HYPERVISOR_shared_info; | |
45896 | + synch_clear_bit(port, s->evtchn_pending); | |
45897 | +} | |
45898 | + | |
45899 | +static inline void notify_remote_via_evtchn(int port) | |
45900 | +{ | |
45901 | + struct evtchn_send send = { .port = port }; | |
45902 | + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send)); | |
45903 | +} | |
45904 | + | |
45905 | +/* | |
45906 | + * Use these to access the event channel underlying the IRQ handle returned | |
45907 | + * by bind_*_to_irqhandler(). | |
45908 | + */ | |
45909 | +void notify_remote_via_irq(int irq); | |
45910 | +int irq_to_evtchn_port(int irq); | |
45911 | + | |
45912 | +#define PIRQ_SET_MAPPING 0x0 | |
45913 | +#define PIRQ_CLEAR_MAPPING 0x1 | |
45914 | +#define PIRQ_GET_MAPPING 0x3 | |
45915 | +int pirq_mapstatus(int pirq, int action); | |
45916 | +int set_pirq_hw_action(int pirq, int (*action)(int pirq, int action)); | |
45917 | +int clear_pirq_hw_action(int pirq); | |
45918 | + | |
45919 | +#define PIRQ_STARTUP 1 | |
45920 | +#define PIRQ_SHUTDOWN 2 | |
45921 | +#define PIRQ_ENABLE 3 | |
45922 | +#define PIRQ_DISABLE 4 | |
45923 | +#define PIRQ_END 5 | |
45924 | +#define PIRQ_ACK 6 | |
45925 | + | |
45926 | +#endif /* __ASM_EVTCHN_H__ */ | |
45927 | Index: head-2008-11-25/include/xen/firmware.h | |
45928 | =================================================================== | |
45929 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45930 | +++ head-2008-11-25/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200 | |
45931 | @@ -0,0 +1,10 @@ | |
45932 | +#ifndef __XEN_FIRMWARE_H__ | |
45933 | +#define __XEN_FIRMWARE_H__ | |
45934 | + | |
45935 | +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | |
45936 | +void copy_edd(void); | |
45937 | +#endif | |
45938 | + | |
45939 | +void copy_edid(void); | |
45940 | + | |
45941 | +#endif /* __XEN_FIRMWARE_H__ */ | |
45942 | Index: head-2008-11-25/include/xen/gnttab.h | |
45943 | =================================================================== | |
45944 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
45945 | +++ head-2008-11-25/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100 | |
45946 | @@ -0,0 +1,164 @@ | |
45947 | +/****************************************************************************** | |
45948 | + * gnttab.h | |
45949 | + * | |
45950 | + * Two sets of functionality: | |
45951 | + * 1. Granting foreign access to our memory reservation. | |
45952 | + * 2. Accessing others' memory reservations via grant references. | |
45953 | + * (i.e., mechanisms for both sender and recipient of grant references) | |
45954 | + * | |
45955 | + * Copyright (c) 2004-2005, K A Fraser | |
45956 | + * Copyright (c) 2005, Christopher Clark | |
45957 | + * | |
45958 | + * This program is free software; you can redistribute it and/or | |
45959 | + * modify it under the terms of the GNU General Public License version 2 | |
45960 | + * as published by the Free Software Foundation; or, when distributed | |
45961 | + * separately from the Linux kernel or incorporated into other | |
45962 | + * software packages, subject to the following license: | |
45963 | + * | |
45964 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
45965 | + * of this source file (the "Software"), to deal in the Software without | |
45966 | + * restriction, including without limitation the rights to use, copy, modify, | |
45967 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
45968 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
45969 | + * the following conditions: | |
45970 | + * | |
45971 | + * The above copyright notice and this permission notice shall be included in | |
45972 | + * all copies or substantial portions of the Software. | |
45973 | + * | |
45974 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
45975 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
45976 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
45977 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
45978 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
45979 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
45980 | + * IN THE SOFTWARE. | |
45981 | + */ | |
45982 | + | |
45983 | +#ifndef __ASM_GNTTAB_H__ | |
45984 | +#define __ASM_GNTTAB_H__ | |
45985 | + | |
45986 | +#include <asm/hypervisor.h> | |
45987 | +#include <asm/maddr.h> /* maddr_t */ | |
45988 | +#include <linux/mm.h> | |
45989 | +#include <xen/interface/grant_table.h> | |
45990 | +#include <xen/features.h> | |
45991 | + | |
45992 | +struct gnttab_free_callback { | |
45993 | + struct gnttab_free_callback *next; | |
45994 | + void (*fn)(void *); | |
45995 | + void *arg; | |
45996 | + u16 count; | |
45997 | + u8 queued; | |
45998 | +}; | |
45999 | + | |
46000 | +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, | |
46001 | + int flags); | |
46002 | + | |
46003 | +/* | |
46004 | + * End access through the given grant reference, iff the grant entry is no | |
46005 | + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in | |
46006 | + * use. | |
46007 | + */ | |
46008 | +int gnttab_end_foreign_access_ref(grant_ref_t ref); | |
46009 | + | |
46010 | +/* | |
46011 | + * Eventually end access through the given grant reference, and once that | |
46012 | + * access has been ended, free the given page too. Access will be ended | |
46013 | + * immediately iff the grant entry is not in use, otherwise it will happen | |
46014 | + * some time later. page may be 0, in which case no freeing will occur. | |
46015 | + */ | |
46016 | +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page); | |
46017 | + | |
46018 | +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); | |
46019 | + | |
46020 | +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); | |
46021 | +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); | |
46022 | + | |
46023 | +int gnttab_query_foreign_access(grant_ref_t ref); | |
46024 | + | |
46025 | +/* | |
46026 | + * operations on reserved batches of grant references | |
46027 | + */ | |
46028 | +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); | |
46029 | + | |
46030 | +void gnttab_free_grant_reference(grant_ref_t ref); | |
46031 | + | |
46032 | +void gnttab_free_grant_references(grant_ref_t head); | |
46033 | + | |
46034 | +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); | |
46035 | + | |
46036 | +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); | |
46037 | + | |
46038 | +void gnttab_release_grant_reference(grant_ref_t *private_head, | |
46039 | + grant_ref_t release); | |
46040 | + | |
46041 | +void gnttab_request_free_callback(struct gnttab_free_callback *callback, | |
46042 | + void (*fn)(void *), void *arg, u16 count); | |
46043 | +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); | |
46044 | + | |
46045 | +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, | |
46046 | + unsigned long frame, int flags); | |
46047 | + | |
46048 | +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, | |
46049 | + unsigned long pfn); | |
46050 | + | |
46051 | +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep); | |
46052 | +void __gnttab_dma_map_page(struct page *page); | |
46053 | +static inline void __gnttab_dma_unmap_page(struct page *page) | |
46054 | +{ | |
46055 | +} | |
46056 | + | |
46057 | +void gnttab_reset_grant_page(struct page *page); | |
46058 | + | |
46059 | +int gnttab_suspend(void); | |
46060 | +int gnttab_resume(void); | |
46061 | + | |
46062 | +void *arch_gnttab_alloc_shared(unsigned long *frames); | |
46063 | + | |
46064 | +static inline void | |
46065 | +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr, | |
46066 | + uint32_t flags, grant_ref_t ref, domid_t domid) | |
46067 | +{ | |
46068 | + if (flags & GNTMAP_contains_pte) | |
46069 | + map->host_addr = addr; | |
46070 | + else if (xen_feature(XENFEAT_auto_translated_physmap)) | |
46071 | + map->host_addr = __pa(addr); | |
46072 | + else | |
46073 | + map->host_addr = addr; | |
46074 | + | |
46075 | + map->flags = flags; | |
46076 | + map->ref = ref; | |
46077 | + map->dom = domid; | |
46078 | +} | |
46079 | + | |
46080 | +static inline void | |
46081 | +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr, | |
46082 | + uint32_t flags, grant_handle_t handle) | |
46083 | +{ | |
46084 | + if (flags & GNTMAP_contains_pte) | |
46085 | + unmap->host_addr = addr; | |
46086 | + else if (xen_feature(XENFEAT_auto_translated_physmap)) | |
46087 | + unmap->host_addr = __pa(addr); | |
46088 | + else | |
46089 | + unmap->host_addr = addr; | |
46090 | + | |
46091 | + unmap->handle = handle; | |
46092 | + unmap->dev_bus_addr = 0; | |
46093 | +} | |
46094 | + | |
46095 | +static inline void | |
46096 | +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr, | |
46097 | + maddr_t new_addr, grant_handle_t handle) | |
46098 | +{ | |
46099 | + if (xen_feature(XENFEAT_auto_translated_physmap)) { | |
46100 | + unmap->host_addr = __pa(addr); | |
46101 | + unmap->new_addr = __pa(new_addr); | |
46102 | + } else { | |
46103 | + unmap->host_addr = addr; | |
46104 | + unmap->new_addr = new_addr; | |
46105 | + } | |
46106 | + | |
46107 | + unmap->handle = handle; | |
46108 | +} | |
46109 | + | |
46110 | +#endif /* __ASM_GNTTAB_H__ */ | |
46111 | Index: head-2008-11-25/include/xen/hvm.h | |
46112 | =================================================================== | |
46113 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46114 | +++ head-2008-11-25/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200 | |
46115 | @@ -0,0 +1,23 @@ | |
46116 | +/* Simple wrappers around HVM functions */ | |
46117 | +#ifndef XEN_HVM_H__ | |
46118 | +#define XEN_HVM_H__ | |
46119 | + | |
46120 | +#include <xen/interface/hvm/params.h> | |
46121 | + | |
46122 | +static inline unsigned long hvm_get_parameter(int idx) | |
46123 | +{ | |
46124 | + struct xen_hvm_param xhv; | |
46125 | + int r; | |
46126 | + | |
46127 | + xhv.domid = DOMID_SELF; | |
46128 | + xhv.index = idx; | |
46129 | + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); | |
46130 | + if (r < 0) { | |
46131 | + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n", | |
46132 | + idx, r); | |
46133 | + return 0; | |
46134 | + } | |
46135 | + return xhv.value; | |
46136 | +} | |
46137 | + | |
46138 | +#endif /* XEN_HVM_H__ */ | |
46139 | Index: head-2008-11-25/include/xen/hypercall.h | |
46140 | =================================================================== | |
46141 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46142 | +++ head-2008-11-25/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100 | |
46143 | @@ -0,0 +1,30 @@ | |
46144 | +#ifndef __XEN_HYPERCALL_H__ | |
46145 | +#define __XEN_HYPERCALL_H__ | |
46146 | + | |
46147 | +#include <asm/hypercall.h> | |
46148 | + | |
46149 | +static inline int __must_check | |
46150 | +HYPERVISOR_multicall_check( | |
46151 | + multicall_entry_t *call_list, unsigned int nr_calls, | |
46152 | + const unsigned long *rc_list) | |
46153 | +{ | |
46154 | + int rc = HYPERVISOR_multicall(call_list, nr_calls); | |
46155 | + | |
46156 | + if (unlikely(rc < 0)) | |
46157 | + return rc; | |
46158 | + BUG_ON(rc); | |
46159 | + BUG_ON((int)nr_calls < 0); | |
46160 | + | |
46161 | + for ( ; nr_calls > 0; --nr_calls, ++call_list) | |
46162 | + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0))) | |
46163 | + return nr_calls; | |
46164 | + | |
46165 | + return 0; | |
46166 | +} | |
46167 | + | |
46168 | +/* A construct to ignore the return value of hypercall wrappers in a few | |
46169 | + * exceptional cases (simply casting the function result to void doesn't | |
46170 | + * avoid the compiler warning): */ | |
46171 | +#define VOID(expr) ((void)((expr)?:0)) | |
46172 | + | |
46173 | +#endif /* __XEN_HYPERCALL_H__ */ | |
46174 | Index: head-2008-11-25/include/xen/hypervisor_sysfs.h | |
46175 | =================================================================== | |
46176 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46177 | +++ head-2008-11-25/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200 | |
46178 | @@ -0,0 +1,30 @@ | |
46179 | +/* | |
46180 | + * copyright (c) 2006 IBM Corporation | |
46181 | + * Authored by: Mike D. Day <ncmike@us.ibm.com> | |
46182 | + * | |
46183 | + * This program is free software; you can redistribute it and/or modify | |
46184 | + * it under the terms of the GNU General Public License version 2 as | |
46185 | + * published by the Free Software Foundation. | |
46186 | + */ | |
46187 | + | |
46188 | +#ifndef _HYP_SYSFS_H_ | |
46189 | +#define _HYP_SYSFS_H_ | |
46190 | + | |
46191 | +#include <linux/kobject.h> | |
46192 | +#include <linux/sysfs.h> | |
46193 | + | |
46194 | +#define HYPERVISOR_ATTR_RO(_name) \ | |
46195 | +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) | |
46196 | + | |
46197 | +#define HYPERVISOR_ATTR_RW(_name) \ | |
46198 | +static struct hyp_sysfs_attr _name##_attr = \ | |
46199 | + __ATTR(_name, 0644, _name##_show, _name##_store) | |
46200 | + | |
46201 | +struct hyp_sysfs_attr { | |
46202 | + struct attribute attr; | |
46203 | + ssize_t (*show)(struct hyp_sysfs_attr *, char *); | |
46204 | + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); | |
46205 | + void *hyp_attr_data; | |
46206 | +}; | |
46207 | + | |
46208 | +#endif /* _HYP_SYSFS_H_ */ | |
46209 | Index: head-2008-11-25/include/xen/pcifront.h | |
46210 | =================================================================== | |
46211 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46212 | +++ head-2008-11-25/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200 | |
46213 | @@ -0,0 +1,83 @@ | |
46214 | +/* | |
46215 | + * PCI Frontend - arch-dependendent declarations | |
46216 | + * | |
46217 | + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> | |
46218 | + */ | |
46219 | +#ifndef __XEN_ASM_PCIFRONT_H__ | |
46220 | +#define __XEN_ASM_PCIFRONT_H__ | |
46221 | + | |
46222 | +#include <linux/spinlock.h> | |
46223 | + | |
46224 | +#ifdef __KERNEL__ | |
46225 | + | |
46226 | +#ifndef __ia64__ | |
46227 | + | |
46228 | +struct pcifront_device; | |
46229 | +struct pci_bus; | |
46230 | + | |
46231 | +struct pcifront_sd { | |
46232 | + int domain; | |
46233 | + struct pcifront_device *pdev; | |
46234 | +}; | |
46235 | + | |
46236 | +static inline struct pcifront_device * | |
46237 | +pcifront_get_pdev(struct pcifront_sd *sd) | |
46238 | +{ | |
46239 | + return sd->pdev; | |
46240 | +} | |
46241 | + | |
46242 | +static inline void pcifront_init_sd(struct pcifront_sd *sd, | |
46243 | + unsigned int domain, unsigned int bus, | |
46244 | + struct pcifront_device *pdev) | |
46245 | +{ | |
46246 | + sd->domain = domain; | |
46247 | + sd->pdev = pdev; | |
46248 | +} | |
46249 | + | |
46250 | +#if defined(CONFIG_PCI_DOMAINS) | |
46251 | +static inline int pci_domain_nr(struct pci_bus *bus) | |
46252 | +{ | |
46253 | + struct pcifront_sd *sd = bus->sysdata; | |
46254 | + return sd->domain; | |
46255 | +} | |
46256 | +static inline int pci_proc_domain(struct pci_bus *bus) | |
46257 | +{ | |
46258 | + return pci_domain_nr(bus); | |
46259 | +} | |
46260 | +#endif /* CONFIG_PCI_DOMAINS */ | |
46261 | + | |
46262 | +static inline void pcifront_setup_root_resources(struct pci_bus *bus, | |
46263 | + struct pcifront_sd *sd) | |
46264 | +{ | |
46265 | +} | |
46266 | + | |
46267 | +#else /* __ia64__ */ | |
46268 | + | |
46269 | +#include <linux/acpi.h> | |
46270 | +#include <asm/pci.h> | |
46271 | +#define pcifront_sd pci_controller | |
46272 | + | |
46273 | +extern void xen_add_resource(struct pci_controller *, unsigned int, | |
46274 | + unsigned int, struct acpi_resource *); | |
46275 | +extern void xen_pcibios_setup_root_windows(struct pci_bus *, | |
46276 | + struct pci_controller *); | |
46277 | + | |
46278 | +static inline struct pcifront_device * | |
46279 | +pcifront_get_pdev(struct pcifront_sd *sd) | |
46280 | +{ | |
46281 | + return (struct pcifront_device *)sd->platform_data; | |
46282 | +} | |
46283 | + | |
46284 | +static inline void pcifront_setup_root_resources(struct pci_bus *bus, | |
46285 | + struct pcifront_sd *sd) | |
46286 | +{ | |
46287 | + xen_pcibios_setup_root_windows(bus, sd); | |
46288 | +} | |
46289 | + | |
46290 | +#endif /* __ia64__ */ | |
46291 | + | |
46292 | +extern struct rw_semaphore pci_bus_sem; | |
46293 | + | |
46294 | +#endif /* __KERNEL__ */ | |
46295 | + | |
46296 | +#endif /* __XEN_ASM_PCIFRONT_H__ */ | |
46297 | Index: head-2008-11-25/include/xen/public/evtchn.h | |
46298 | =================================================================== | |
46299 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46300 | +++ head-2008-11-25/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200 | |
46301 | @@ -0,0 +1,88 @@ | |
46302 | +/****************************************************************************** | |
46303 | + * evtchn.h | |
46304 | + * | |
46305 | + * Interface to /dev/xen/evtchn. | |
46306 | + * | |
46307 | + * Copyright (c) 2003-2005, K A Fraser | |
46308 | + * | |
46309 | + * This program is free software; you can redistribute it and/or | |
46310 | + * modify it under the terms of the GNU General Public License version 2 | |
46311 | + * as published by the Free Software Foundation; or, when distributed | |
46312 | + * separately from the Linux kernel or incorporated into other | |
46313 | + * software packages, subject to the following license: | |
46314 | + * | |
46315 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
46316 | + * of this source file (the "Software"), to deal in the Software without | |
46317 | + * restriction, including without limitation the rights to use, copy, modify, | |
46318 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
46319 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
46320 | + * the following conditions: | |
46321 | + * | |
46322 | + * The above copyright notice and this permission notice shall be included in | |
46323 | + * all copies or substantial portions of the Software. | |
46324 | + * | |
46325 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
46326 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
46327 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
46328 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
46329 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
46330 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
46331 | + * IN THE SOFTWARE. | |
46332 | + */ | |
46333 | + | |
46334 | +#ifndef __LINUX_PUBLIC_EVTCHN_H__ | |
46335 | +#define __LINUX_PUBLIC_EVTCHN_H__ | |
46336 | + | |
46337 | +/* | |
46338 | + * Bind a fresh port to VIRQ @virq. | |
46339 | + * Return allocated port. | |
46340 | + */ | |
46341 | +#define IOCTL_EVTCHN_BIND_VIRQ \ | |
46342 | + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) | |
46343 | +struct ioctl_evtchn_bind_virq { | |
46344 | + unsigned int virq; | |
46345 | +}; | |
46346 | + | |
46347 | +/* | |
46348 | + * Bind a fresh port to remote <@remote_domain, @remote_port>. | |
46349 | + * Return allocated port. | |
46350 | + */ | |
46351 | +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ | |
46352 | + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) | |
46353 | +struct ioctl_evtchn_bind_interdomain { | |
46354 | + unsigned int remote_domain, remote_port; | |
46355 | +}; | |
46356 | + | |
46357 | +/* | |
46358 | + * Allocate a fresh port for binding to @remote_domain. | |
46359 | + * Return allocated port. | |
46360 | + */ | |
46361 | +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ | |
46362 | + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) | |
46363 | +struct ioctl_evtchn_bind_unbound_port { | |
46364 | + unsigned int remote_domain; | |
46365 | +}; | |
46366 | + | |
46367 | +/* | |
46368 | + * Unbind previously allocated @port. | |
46369 | + */ | |
46370 | +#define IOCTL_EVTCHN_UNBIND \ | |
46371 | + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) | |
46372 | +struct ioctl_evtchn_unbind { | |
46373 | + unsigned int port; | |
46374 | +}; | |
46375 | + | |
46376 | +/* | |
46377 | + * Unbind previously allocated @port. | |
46378 | + */ | |
46379 | +#define IOCTL_EVTCHN_NOTIFY \ | |
46380 | + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) | |
46381 | +struct ioctl_evtchn_notify { | |
46382 | + unsigned int port; | |
46383 | +}; | |
46384 | + | |
46385 | +/* Clear and reinitialise the event buffer. Clear error condition. */ | |
46386 | +#define IOCTL_EVTCHN_RESET \ | |
46387 | + _IOC(_IOC_NONE, 'E', 5, 0) | |
46388 | + | |
46389 | +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ | |
46390 | Index: head-2008-11-25/include/xen/public/gntdev.h | |
46391 | =================================================================== | |
46392 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46393 | +++ head-2008-11-25/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200 | |
46394 | @@ -0,0 +1,119 @@ | |
46395 | +/****************************************************************************** | |
46396 | + * gntdev.h | |
46397 | + * | |
46398 | + * Interface to /dev/xen/gntdev. | |
46399 | + * | |
46400 | + * Copyright (c) 2007, D G Murray | |
46401 | + * | |
46402 | + * This program is free software; you can redistribute it and/or | |
46403 | + * modify it under the terms of the GNU General Public License version 2 | |
46404 | + * as published by the Free Software Foundation; or, when distributed | |
46405 | + * separately from the Linux kernel or incorporated into other | |
46406 | + * software packages, subject to the following license: | |
46407 | + * | |
46408 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
46409 | + * of this source file (the "Software"), to deal in the Software without | |
46410 | + * restriction, including without limitation the rights to use, copy, modify, | |
46411 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
46412 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
46413 | + * the following conditions: | |
46414 | + * | |
46415 | + * The above copyright notice and this permission notice shall be included in | |
46416 | + * all copies or substantial portions of the Software. | |
46417 | + * | |
46418 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
46419 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
46420 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
46421 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
46422 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
46423 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
46424 | + * IN THE SOFTWARE. | |
46425 | + */ | |
46426 | + | |
46427 | +#ifndef __LINUX_PUBLIC_GNTDEV_H__ | |
46428 | +#define __LINUX_PUBLIC_GNTDEV_H__ | |
46429 | + | |
46430 | +struct ioctl_gntdev_grant_ref { | |
46431 | + /* The domain ID of the grant to be mapped. */ | |
46432 | + uint32_t domid; | |
46433 | + /* The grant reference of the grant to be mapped. */ | |
46434 | + uint32_t ref; | |
46435 | +}; | |
46436 | + | |
46437 | +/* | |
46438 | + * Inserts the grant references into the mapping table of an instance | |
46439 | + * of gntdev. N.B. This does not perform the mapping, which is deferred | |
46440 | + * until mmap() is called with @index as the offset. | |
46441 | + */ | |
46442 | +#define IOCTL_GNTDEV_MAP_GRANT_REF \ | |
46443 | +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) | |
46444 | +struct ioctl_gntdev_map_grant_ref { | |
46445 | + /* IN parameters */ | |
46446 | + /* The number of grants to be mapped. */ | |
46447 | + uint32_t count; | |
46448 | + uint32_t pad; | |
46449 | + /* OUT parameters */ | |
46450 | + /* The offset to be used on a subsequent call to mmap(). */ | |
46451 | + uint64_t index; | |
46452 | + /* Variable IN parameter. */ | |
46453 | + /* Array of grant references, of size @count. */ | |
46454 | + struct ioctl_gntdev_grant_ref refs[1]; | |
46455 | +}; | |
46456 | + | |
46457 | +/* | |
46458 | + * Removes the grant references from the mapping table of an instance of | |
46459 | + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) | |
46460 | + * before this ioctl is called, or an error will result. | |
46461 | + */ | |
46462 | +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ | |
46463 | +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) | |
46464 | +struct ioctl_gntdev_unmap_grant_ref { | |
46465 | + /* IN parameters */ | |
46466 | + /* The offset was returned by the corresponding map operation. */ | |
46467 | + uint64_t index; | |
46468 | + /* The number of pages to be unmapped. */ | |
46469 | + uint32_t count; | |
46470 | + uint32_t pad; | |
46471 | +}; | |
46472 | + | |
46473 | +/* | |
46474 | + * Returns the offset in the driver's address space that corresponds | |
46475 | + * to @vaddr. This can be used to perform a munmap(), followed by an | |
46476 | + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by | |
46477 | + * the caller. The number of pages that were allocated at the same time as | |
46478 | + * @vaddr is returned in @count. | |
46479 | + * | |
46480 | + * N.B. Where more than one page has been mapped into a contiguous range, the | |
46481 | + * supplied @vaddr must correspond to the start of the range; otherwise | |
46482 | + * an error will result. It is only possible to munmap() the entire | |
46483 | + * contiguously-allocated range at once, and not any subrange thereof. | |
46484 | + */ | |
46485 | +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ | |
46486 | +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) | |
46487 | +struct ioctl_gntdev_get_offset_for_vaddr { | |
46488 | + /* IN parameters */ | |
46489 | + /* The virtual address of the first mapped page in a range. */ | |
46490 | + uint64_t vaddr; | |
46491 | + /* OUT parameters */ | |
46492 | + /* The offset that was used in the initial mmap() operation. */ | |
46493 | + uint64_t offset; | |
46494 | + /* The number of pages mapped in the VM area that begins at @vaddr. */ | |
46495 | + uint32_t count; | |
46496 | + uint32_t pad; | |
46497 | +}; | |
46498 | + | |
46499 | +/* | |
46500 | + * Sets the maximum number of grants that may mapped at once by this gntdev | |
46501 | + * instance. | |
46502 | + * | |
46503 | + * N.B. This must be called before any other ioctl is performed on the device. | |
46504 | + */ | |
46505 | +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ | |
46506 | +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) | |
46507 | +struct ioctl_gntdev_set_max_grants { | |
46508 | + /* IN parameter */ | |
46509 | + /* The maximum number of grants that may be mapped at once. */ | |
46510 | + uint32_t count; | |
46511 | +}; | |
46512 | + | |
46513 | +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ | |
46514 | Index: head-2008-11-25/include/xen/public/privcmd.h | |
46515 | =================================================================== | |
46516 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46517 | +++ head-2008-11-25/include/xen/public/privcmd.h 2007-06-12 13:14:19.000000000 +0200 | |
46518 | @@ -0,0 +1,79 @@ | |
46519 | +/****************************************************************************** | |
46520 | + * privcmd.h | |
46521 | + * | |
46522 | + * Interface to /proc/xen/privcmd. | |
46523 | + * | |
46524 | + * Copyright (c) 2003-2005, K A Fraser | |
46525 | + * | |
46526 | + * This program is free software; you can redistribute it and/or | |
46527 | + * modify it under the terms of the GNU General Public License version 2 | |
46528 | + * as published by the Free Software Foundation; or, when distributed | |
46529 | + * separately from the Linux kernel or incorporated into other | |
46530 | + * software packages, subject to the following license: | |
46531 | + * | |
46532 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | |
46533 | + * of this source file (the "Software"), to deal in the Software without | |
46534 | + * restriction, including without limitation the rights to use, copy, modify, | |
46535 | + * merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
46536 | + * and to permit persons to whom the Software is furnished to do so, subject to | |
46537 | + * the following conditions: | |
46538 | + * | |
46539 | + * The above copyright notice and this permission notice shall be included in | |
46540 | + * all copies or substantial portions of the Software. | |
46541 | + * | |
46542 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
46543 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
46544 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
46545 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
46546 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
46547 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
46548 | + * IN THE SOFTWARE. | |
46549 | + */ | |
46550 | + | |
46551 | +#ifndef __LINUX_PUBLIC_PRIVCMD_H__ | |
46552 | +#define __LINUX_PUBLIC_PRIVCMD_H__ | |
46553 | + | |
46554 | +#include <linux/types.h> | |
46555 | + | |
46556 | +#ifndef __user | |
46557 | +#define __user | |
46558 | +#endif | |
46559 | + | |
46560 | +typedef struct privcmd_hypercall | |
46561 | +{ | |
46562 | + __u64 op; | |
46563 | + __u64 arg[5]; | |
46564 | +} privcmd_hypercall_t; | |
46565 | + | |
46566 | +typedef struct privcmd_mmap_entry { | |
46567 | + __u64 va; | |
46568 | + __u64 mfn; | |
46569 | + __u64 npages; | |
46570 | +} privcmd_mmap_entry_t; | |
46571 | + | |
46572 | +typedef struct privcmd_mmap { | |
46573 | + int num; | |
46574 | + domid_t dom; /* target domain */ | |
46575 | + privcmd_mmap_entry_t __user *entry; | |
46576 | +} privcmd_mmap_t; | |
46577 | + | |
46578 | +typedef struct privcmd_mmapbatch { | |
46579 | + int num; /* number of pages to populate */ | |
46580 | + domid_t dom; /* target domain */ | |
46581 | + __u64 addr; /* virtual address */ | |
46582 | + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ | |
46583 | +} privcmd_mmapbatch_t; | |
46584 | + | |
46585 | +/* | |
46586 | + * @cmd: IOCTL_PRIVCMD_HYPERCALL | |
46587 | + * @arg: &privcmd_hypercall_t | |
46588 | + * Return: Value returned from execution of the specified hypercall. | |
46589 | + */ | |
46590 | +#define IOCTL_PRIVCMD_HYPERCALL \ | |
46591 | + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) | |
46592 | +#define IOCTL_PRIVCMD_MMAP \ | |
46593 | + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) | |
46594 | +#define IOCTL_PRIVCMD_MMAPBATCH \ | |
46595 | + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) | |
46596 | + | |
46597 | +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ | |
46598 | Index: head-2008-11-25/include/xen/xen_proc.h | |
46599 | =================================================================== | |
46600 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46601 | +++ head-2008-11-25/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200 | |
46602 | @@ -0,0 +1,12 @@ | |
46603 | + | |
46604 | +#ifndef __ASM_XEN_PROC_H__ | |
46605 | +#define __ASM_XEN_PROC_H__ | |
46606 | + | |
46607 | +#include <linux/proc_fs.h> | |
46608 | + | |
46609 | +extern struct proc_dir_entry *create_xen_proc_entry( | |
46610 | + const char *name, mode_t mode); | |
46611 | +extern void remove_xen_proc_entry( | |
46612 | + const char *name); | |
46613 | + | |
46614 | +#endif /* __ASM_XEN_PROC_H__ */ | |
46615 | Index: head-2008-11-25/include/xen/xencons.h | |
46616 | =================================================================== | |
46617 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46618 | +++ head-2008-11-25/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200 | |
46619 | @@ -0,0 +1,17 @@ | |
46620 | +#ifndef __ASM_XENCONS_H__ | |
46621 | +#define __ASM_XENCONS_H__ | |
46622 | + | |
46623 | +struct dom0_vga_console_info; | |
46624 | +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t); | |
46625 | + | |
46626 | +void xencons_force_flush(void); | |
46627 | +void xencons_resume(void); | |
46628 | + | |
46629 | +/* Interrupt work hooks. Receive data, or kick data out. */ | |
46630 | +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs); | |
46631 | +void xencons_tx(void); | |
46632 | + | |
46633 | +int xencons_ring_init(void); | |
46634 | +int xencons_ring_send(const char *data, unsigned len); | |
46635 | + | |
46636 | +#endif /* __ASM_XENCONS_H__ */ | |
46637 | Index: head-2008-11-25/include/xen/xenoprof.h | |
46638 | =================================================================== | |
46639 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46640 | +++ head-2008-11-25/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200 | |
46641 | @@ -0,0 +1,42 @@ | |
46642 | +/****************************************************************************** | |
46643 | + * xen/xenoprof.h | |
46644 | + * | |
46645 | + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> | |
46646 | + * VA Linux Systems Japan K.K. | |
46647 | + * | |
46648 | + * This program is free software; you can redistribute it and/or modify | |
46649 | + * it under the terms of the GNU General Public License as published by | |
46650 | + * the Free Software Foundation; either version 2 of the License, or | |
46651 | + * (at your option) any later version. | |
46652 | + * | |
46653 | + * This program is distributed in the hope that it will be useful, | |
46654 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
46655 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
46656 | + * GNU General Public License for more details. | |
46657 | + * | |
46658 | + * You should have received a copy of the GNU General Public License | |
46659 | + * along with this program; if not, write to the Free Software | |
46660 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
46661 | + * | |
46662 | + */ | |
46663 | + | |
46664 | +#ifndef __XEN_XENOPROF_H__ | |
46665 | +#define __XEN_XENOPROF_H__ | |
46666 | +#ifdef CONFIG_XEN | |
46667 | + | |
46668 | +#include <asm/xenoprof.h> | |
46669 | + | |
46670 | +struct oprofile_operations; | |
46671 | +int xenoprofile_init(struct oprofile_operations * ops); | |
46672 | +void xenoprofile_exit(void); | |
46673 | + | |
46674 | +struct xenoprof_shared_buffer { | |
46675 | + char *buffer; | |
46676 | + struct xenoprof_arch_shared_buffer arch; | |
46677 | +}; | |
46678 | +#else | |
46679 | +#define xenoprofile_init(ops) (-ENOSYS) | |
46680 | +#define xenoprofile_exit() do { } while (0) | |
46681 | + | |
46682 | +#endif /* CONFIG_XEN */ | |
46683 | +#endif /* __XEN_XENOPROF_H__ */ | |
46684 | Index: head-2008-11-25/lib/swiotlb-xen.c | |
46685 | =================================================================== | |
46686 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
46687 | +++ head-2008-11-25/lib/swiotlb-xen.c 2008-09-15 13:40:15.000000000 +0200 | |
46688 | @@ -0,0 +1,739 @@ | |
46689 | +/* | |
46690 | + * Dynamic DMA mapping support. | |
46691 | + * | |
46692 | + * This implementation is a fallback for platforms that do not support | |
46693 | + * I/O TLBs (aka DMA address translation hardware). | |
46694 | + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> | |
46695 | + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> | |
46696 | + * Copyright (C) 2000, 2003 Hewlett-Packard Co | |
46697 | + * David Mosberger-Tang <davidm@hpl.hp.com> | |
46698 | + * Copyright (C) 2005 Keir Fraser <keir@xensource.com> | |
46699 | + */ | |
46700 | + | |
46701 | +#include <linux/cache.h> | |
46702 | +#include <linux/mm.h> | |
46703 | +#include <linux/module.h> | |
46704 | +#include <linux/pci.h> | |
46705 | +#include <linux/spinlock.h> | |
46706 | +#include <linux/string.h> | |
46707 | +#include <linux/types.h> | |
46708 | +#include <linux/ctype.h> | |
46709 | +#include <linux/init.h> | |
46710 | +#include <linux/bootmem.h> | |
46711 | +#include <linux/highmem.h> | |
46712 | +#include <asm/io.h> | |
46713 | +#include <asm/pci.h> | |
46714 | +#include <asm/dma.h> | |
46715 | +#include <asm/uaccess.h> | |
46716 | +#include <xen/gnttab.h> | |
46717 | +#include <xen/interface/memory.h> | |
46718 | +#include <asm-i386/mach-xen/asm/gnttab_dma.h> | |
46719 | + | |
46720 | +int swiotlb; | |
46721 | +EXPORT_SYMBOL(swiotlb); | |
46722 | + | |
46723 | +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) | |
46724 | + | |
46725 | +/* | |
46726 | + * Maximum allowable number of contiguous slabs to map, | |
46727 | + * must be a power of 2. What is the appropriate value ? | |
46728 | + * The complexity of {map,unmap}_single is linearly dependent on this value. | |
46729 | + */ | |
46730 | +#define IO_TLB_SEGSIZE 128 | |
46731 | + | |
46732 | +/* | |
46733 | + * log of the size of each IO TLB slab. The number of slabs is command line | |
46734 | + * controllable. | |
46735 | + */ | |
46736 | +#define IO_TLB_SHIFT 11 | |
46737 | + | |
46738 | +int swiotlb_force; | |
46739 | + | |
46740 | +static char *iotlb_virt_start; | |
46741 | +static unsigned long iotlb_nslabs; | |
46742 | + | |
46743 | +/* | |
46744 | + * Used to do a quick range check in swiotlb_unmap_single and | |
46745 | + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this | |
46746 | + * API. | |
46747 | + */ | |
46748 | +static unsigned long iotlb_pfn_start, iotlb_pfn_end; | |
46749 | + | |
46750 | +/* Does the given dma address reside within the swiotlb aperture? */ | |
46751 | +static inline int in_swiotlb_aperture(dma_addr_t dev_addr) | |
46752 | +{ | |
46753 | + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT); | |
46754 | + return (pfn_valid(pfn) | |
46755 | + && (pfn >= iotlb_pfn_start) | |
46756 | + && (pfn < iotlb_pfn_end)); | |
46757 | +} | |
46758 | + | |
46759 | +/* | |
46760 | + * When the IOMMU overflows we return a fallback buffer. This sets the size. | |
46761 | + */ | |
46762 | +static unsigned long io_tlb_overflow = 32*1024; | |
46763 | + | |
46764 | +void *io_tlb_overflow_buffer; | |
46765 | + | |
46766 | +/* | |
46767 | + * This is a free list describing the number of free entries available from | |
46768 | + * each index | |
46769 | + */ | |
46770 | +static unsigned int *io_tlb_list; | |
46771 | +static unsigned int io_tlb_index; | |
46772 | + | |
46773 | +/* | |
46774 | + * We need to save away the original address corresponding to a mapped entry | |
46775 | + * for the sync operations. | |
46776 | + */ | |
46777 | +static struct phys_addr { | |
46778 | + struct page *page; | |
46779 | + unsigned int offset; | |
46780 | +} *io_tlb_orig_addr; | |
46781 | + | |
46782 | +/* | |
46783 | + * Protect the above data structures in the map and unmap calls | |
46784 | + */ | |
46785 | +static DEFINE_SPINLOCK(io_tlb_lock); | |
46786 | + | |
46787 | +static unsigned int dma_bits; | |
46788 | +static unsigned int __initdata max_dma_bits = 32; | |
46789 | +static int __init | |
46790 | +setup_dma_bits(char *str) | |
46791 | +{ | |
46792 | + max_dma_bits = simple_strtoul(str, NULL, 0); | |
46793 | + return 0; | |
46794 | +} | |
46795 | +__setup("dma_bits=", setup_dma_bits); | |
46796 | + | |
46797 | +static int __init | |
46798 | +setup_io_tlb_npages(char *str) | |
46799 | +{ | |
46800 | + /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */ | |
46801 | + if (isdigit(*str)) { | |
46802 | + iotlb_nslabs = simple_strtoul(str, &str, 0) << | |
46803 | + (20 - IO_TLB_SHIFT); | |
46804 | + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); | |
46805 | + } | |
46806 | + if (*str == ',') | |
46807 | + ++str; | |
46808 | + /* | |
46809 | + * NB. 'force' enables the swiotlb, but doesn't force its use for | |
46810 | + * every DMA like it does on native Linux. 'off' forcibly disables | |
46811 | + * use of the swiotlb. | |
46812 | + */ | |
46813 | + if (!strcmp(str, "force")) | |
46814 | + swiotlb_force = 1; | |
46815 | + else if (!strcmp(str, "off")) | |
46816 | + swiotlb_force = -1; | |
46817 | + return 1; | |
46818 | +} | |
46819 | +__setup("swiotlb=", setup_io_tlb_npages); | |
46820 | +/* make io_tlb_overflow tunable too? */ | |
46821 | + | |
46822 | +/* | |
46823 | + * Statically reserve bounce buffer space and initialize bounce buffer data | |
46824 | + * structures for the software IO TLB used to implement the PCI DMA API. | |
46825 | + */ | |
46826 | +void | |
46827 | +swiotlb_init_with_default_size (size_t default_size) | |
46828 | +{ | |
46829 | + unsigned long i, bytes; | |
46830 | + int rc; | |
46831 | + | |
46832 | + if (!iotlb_nslabs) { | |
46833 | + iotlb_nslabs = (default_size >> IO_TLB_SHIFT); | |
46834 | + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); | |
46835 | + } | |
46836 | + | |
46837 | + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT); | |
46838 | + | |
46839 | + /* | |
46840 | + * Get IO TLB memory from the low pages | |
46841 | + */ | |
46842 | + iotlb_virt_start = alloc_bootmem_low_pages(bytes); | |
46843 | + if (!iotlb_virt_start) | |
46844 | + panic("Cannot allocate SWIOTLB buffer!\n"); | |
46845 | + | |
46846 | + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; | |
46847 | + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) { | |
46848 | + do { | |
46849 | + rc = xen_create_contiguous_region( | |
46850 | + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT), | |
46851 | + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), | |
46852 | + dma_bits); | |
46853 | + } while (rc && dma_bits++ < max_dma_bits); | |
46854 | + if (rc) { | |
46855 | + if (i == 0) | |
46856 | + panic("No suitable physical memory available for SWIOTLB buffer!\n" | |
46857 | + "Use dom0_mem Xen boot parameter to reserve\n" | |
46858 | + "some DMA memory (e.g., dom0_mem=-128M).\n"); | |
46859 | + iotlb_nslabs = i; | |
46860 | + i <<= IO_TLB_SHIFT; | |
46861 | + free_bootmem(__pa(iotlb_virt_start + i), bytes - i); | |
46862 | + bytes = i; | |
46863 | + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { | |
46864 | + unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1)); | |
46865 | + | |
46866 | + if (bits > dma_bits) | |
46867 | + dma_bits = bits; | |
46868 | + } | |
46869 | + break; | |
46870 | + } | |
46871 | + } | |
46872 | + | |
46873 | + /* | |
46874 | + * Allocate and initialize the free list array. This array is used | |
46875 | + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. | |
46876 | + */ | |
46877 | + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int)); | |
46878 | + for (i = 0; i < iotlb_nslabs; i++) | |
46879 | + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); | |
46880 | + io_tlb_index = 0; | |
46881 | + io_tlb_orig_addr = alloc_bootmem( | |
46882 | + iotlb_nslabs * sizeof(*io_tlb_orig_addr)); | |
46883 | + | |
46884 | + /* | |
46885 | + * Get the overflow emergency buffer | |
46886 | + */ | |
46887 | + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); | |
46888 | + if (!io_tlb_overflow_buffer) | |
46889 | + panic("Cannot allocate SWIOTLB overflow buffer!\n"); | |
46890 | + | |
46891 | + do { | |
46892 | + rc = xen_create_contiguous_region( | |
46893 | + (unsigned long)io_tlb_overflow_buffer, | |
46894 | + get_order(io_tlb_overflow), | |
46895 | + dma_bits); | |
46896 | + } while (rc && dma_bits++ < max_dma_bits); | |
46897 | + if (rc) | |
46898 | + panic("No suitable physical memory available for SWIOTLB overflow buffer!\n"); | |
46899 | + | |
46900 | + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT; | |
46901 | + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT); | |
46902 | + | |
46903 | + printk(KERN_INFO "Software IO TLB enabled: \n" | |
46904 | + " Aperture: %lu megabytes\n" | |
46905 | + " Kernel range: %p - %p\n" | |
46906 | + " Address size: %u bits\n", | |
46907 | + bytes >> 20, | |
46908 | + iotlb_virt_start, iotlb_virt_start + bytes, | |
46909 | + dma_bits); | |
46910 | +} | |
46911 | + | |
46912 | +void | |
46913 | +swiotlb_init(void) | |
46914 | +{ | |
46915 | + long ram_end; | |
46916 | + size_t defsz = 64 * (1 << 20); /* 64MB default size */ | |
46917 | + | |
46918 | + if (swiotlb_force == 1) { | |
46919 | + swiotlb = 1; | |
46920 | + } else if ((swiotlb_force != -1) && | |
46921 | + is_running_on_xen() && | |
46922 | + is_initial_xendomain()) { | |
46923 | + /* Domain 0 always has a swiotlb. */ | |
46924 | + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); | |
46925 | + if (ram_end <= 0x7ffff) | |
46926 | + defsz = 2 * (1 << 20); /* 2MB on <2GB on systems. */ | |
46927 | + swiotlb = 1; | |
46928 | + } | |
46929 | + | |
46930 | + if (swiotlb) | |
46931 | + swiotlb_init_with_default_size(defsz); | |
46932 | + else | |
46933 | + printk(KERN_INFO "Software IO TLB disabled\n"); | |
46934 | +} | |
46935 | + | |
46936 | +/* | |
46937 | + * We use __copy_to_user_inatomic to transfer to the host buffer because the | |
46938 | + * buffer may be mapped read-only (e.g, in blkback driver) but lower-level | |
46939 | + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an | |
46940 | + * unnecessary copy from the aperture to the host buffer, and a page fault. | |
46941 | + */ | |
46942 | +static void | |
46943 | +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir) | |
46944 | +{ | |
46945 | + if (PageHighMem(buffer.page)) { | |
46946 | + size_t len, bytes; | |
46947 | + char *dev, *host, *kmp; | |
46948 | + len = size; | |
46949 | + while (len != 0) { | |
46950 | + unsigned long flags; | |
46951 | + | |
46952 | + if (((bytes = len) + buffer.offset) > PAGE_SIZE) | |
46953 | + bytes = PAGE_SIZE - buffer.offset; | |
46954 | + local_irq_save(flags); /* protects KM_BOUNCE_READ */ | |
46955 | + kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ); | |
46956 | + dev = dma_addr + size - len; | |
46957 | + host = kmp + buffer.offset; | |
46958 | + if (dir == DMA_FROM_DEVICE) { | |
46959 | + if (__copy_to_user_inatomic(host, dev, bytes)) | |
46960 | + /* inaccessible */; | |
46961 | + } else | |
46962 | + memcpy(dev, host, bytes); | |
46963 | + kunmap_atomic(kmp, KM_BOUNCE_READ); | |
46964 | + local_irq_restore(flags); | |
46965 | + len -= bytes; | |
46966 | + buffer.page++; | |
46967 | + buffer.offset = 0; | |
46968 | + } | |
46969 | + } else { | |
46970 | + char *host = (char *)phys_to_virt( | |
46971 | + page_to_pseudophys(buffer.page)) + buffer.offset; | |
46972 | + if (dir == DMA_FROM_DEVICE) { | |
46973 | + if (__copy_to_user_inatomic(host, dma_addr, size)) | |
46974 | + /* inaccessible */; | |
46975 | + } else if (dir == DMA_TO_DEVICE) | |
46976 | + memcpy(dma_addr, host, size); | |
46977 | + } | |
46978 | +} | |
46979 | + | |
46980 | +/* | |
46981 | + * Allocates bounce buffer and returns its kernel virtual address. | |
46982 | + */ | |
46983 | +static void * | |
46984 | +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir) | |
46985 | +{ | |
46986 | + unsigned long flags; | |
46987 | + char *dma_addr; | |
46988 | + unsigned int nslots, stride, index, wrap; | |
46989 | + struct phys_addr slot_buf; | |
46990 | + int i; | |
46991 | + | |
46992 | + /* | |
46993 | + * For mappings greater than a page, we limit the stride (and | |
46994 | + * hence alignment) to a page size. | |
46995 | + */ | |
46996 | + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | |
46997 | + if (size > PAGE_SIZE) | |
46998 | + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); | |
46999 | + else | |
47000 | + stride = 1; | |
47001 | + | |
47002 | + BUG_ON(!nslots); | |
47003 | + | |
47004 | + /* | |
47005 | + * Find suitable number of IO TLB entries size that will fit this | |
47006 | + * request and allocate a buffer from that IO TLB pool. | |
47007 | + */ | |
47008 | + spin_lock_irqsave(&io_tlb_lock, flags); | |
47009 | + { | |
47010 | + wrap = index = ALIGN(io_tlb_index, stride); | |
47011 | + | |
47012 | + if (index >= iotlb_nslabs) | |
47013 | + wrap = index = 0; | |
47014 | + | |
47015 | + do { | |
47016 | + /* | |
47017 | + * If we find a slot that indicates we have 'nslots' | |
47018 | + * number of contiguous buffers, we allocate the | |
47019 | + * buffers from that slot and mark the entries as '0' | |
47020 | + * indicating unavailable. | |
47021 | + */ | |
47022 | + if (io_tlb_list[index] >= nslots) { | |
47023 | + int count = 0; | |
47024 | + | |
47025 | + for (i = index; i < (int)(index + nslots); i++) | |
47026 | + io_tlb_list[i] = 0; | |
47027 | + for (i = index - 1; | |
47028 | + (OFFSET(i, IO_TLB_SEGSIZE) != | |
47029 | + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; | |
47030 | + i--) | |
47031 | + io_tlb_list[i] = ++count; | |
47032 | + dma_addr = iotlb_virt_start + | |
47033 | + (index << IO_TLB_SHIFT); | |
47034 | + | |
47035 | + /* | |
47036 | + * Update the indices to avoid searching in | |
47037 | + * the next round. | |
47038 | + */ | |
47039 | + io_tlb_index = | |
47040 | + ((index + nslots) < iotlb_nslabs | |
47041 | + ? (index + nslots) : 0); | |
47042 | + | |
47043 | + goto found; | |
47044 | + } | |
47045 | + index += stride; | |
47046 | + if (index >= iotlb_nslabs) | |
47047 | + index = 0; | |
47048 | + } while (index != wrap); | |
47049 | + | |
47050 | + spin_unlock_irqrestore(&io_tlb_lock, flags); | |
47051 | + return NULL; | |
47052 | + } | |
47053 | + found: | |
47054 | + spin_unlock_irqrestore(&io_tlb_lock, flags); | |
47055 | + | |
47056 | + /* | |
47057 | + * Save away the mapping from the original address to the DMA address. | |
47058 | + * This is needed when we sync the memory. Then we sync the buffer if | |
47059 | + * needed. | |
47060 | + */ | |
47061 | + slot_buf = buffer; | |
47062 | + for (i = 0; i < nslots; i++) { | |
47063 | + slot_buf.page += slot_buf.offset >> PAGE_SHIFT; | |
47064 | + slot_buf.offset &= PAGE_SIZE - 1; | |
47065 | + io_tlb_orig_addr[index+i] = slot_buf; | |
47066 | + slot_buf.offset += 1 << IO_TLB_SHIFT; | |
47067 | + } | |
47068 | + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL)) | |
47069 | + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE); | |
47070 | + | |
47071 | + return dma_addr; | |
47072 | +} | |
47073 | + | |
47074 | +static struct phys_addr dma_addr_to_phys_addr(char *dma_addr) | |
47075 | +{ | |
47076 | + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; | |
47077 | + struct phys_addr buffer = io_tlb_orig_addr[index]; | |
47078 | + buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1); | |
47079 | + buffer.page += buffer.offset >> PAGE_SHIFT; | |
47080 | + buffer.offset &= PAGE_SIZE - 1; | |
47081 | + return buffer; | |
47082 | +} | |
47083 | + | |
47084 | +/* | |
47085 | + * dma_addr is the kernel virtual address of the bounce buffer to unmap. | |
47086 | + */ | |
47087 | +static void | |
47088 | +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | |
47089 | +{ | |
47090 | + unsigned long flags; | |
47091 | + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | |
47092 | + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; | |
47093 | + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); | |
47094 | + | |
47095 | + /* | |
47096 | + * First, sync the memory before unmapping the entry | |
47097 | + */ | |
47098 | + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)) | |
47099 | + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE); | |
47100 | + | |
47101 | + /* | |
47102 | + * Return the buffer to the free list by setting the corresponding | |
47103 | + * entries to indicate the number of contigous entries available. | |
47104 | + * While returning the entries to the free list, we merge the entries | |
47105 | + * with slots below and above the pool being returned. | |
47106 | + */ | |
47107 | + spin_lock_irqsave(&io_tlb_lock, flags); | |
47108 | + { | |
47109 | + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? | |
47110 | + io_tlb_list[index + nslots] : 0); | |
47111 | + /* | |
47112 | + * Step 1: return the slots to the free list, merging the | |
47113 | + * slots with superceeding slots | |
47114 | + */ | |
47115 | + for (i = index + nslots - 1; i >= index; i--) | |
47116 | + io_tlb_list[i] = ++count; | |
47117 | + /* | |
47118 | + * Step 2: merge the returned slots with the preceding slots, | |
47119 | + * if available (non zero) | |
47120 | + */ | |
47121 | + for (i = index - 1; | |
47122 | + (OFFSET(i, IO_TLB_SEGSIZE) != | |
47123 | + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; | |
47124 | + i--) | |
47125 | + io_tlb_list[i] = ++count; | |
47126 | + } | |
47127 | + spin_unlock_irqrestore(&io_tlb_lock, flags); | |
47128 | +} | |
47129 | + | |
47130 | +static void | |
47131 | +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) | |
47132 | +{ | |
47133 | + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); | |
47134 | + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE)); | |
47135 | + __sync_single(buffer, dma_addr, size, dir); | |
47136 | +} | |
47137 | + | |
47138 | +static void | |
47139 | +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) | |
47140 | +{ | |
47141 | + /* | |
47142 | + * Ran out of IOMMU space for this operation. This is very bad. | |
47143 | + * Unfortunately the drivers cannot handle this operation properly. | |
47144 | + * unless they check for pci_dma_mapping_error (most don't) | |
47145 | + * When the mapping is small enough return a static buffer to limit | |
47146 | + * the damage, or panic when the transfer is too big. | |
47147 | + */ | |
47148 | + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at " | |
47149 | + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?"); | |
47150 | + | |
47151 | + if (size > io_tlb_overflow && do_panic) { | |
47152 | + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | |
47153 | + panic("PCI-DMA: Memory would be corrupted\n"); | |
47154 | + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | |
47155 | + panic("PCI-DMA: Random memory would be DMAed\n"); | |
47156 | + } | |
47157 | +} | |
47158 | + | |
47159 | +/* | |
47160 | + * Map a single buffer of the indicated size for DMA in streaming mode. The | |
47161 | + * PCI address to use is returned. | |
47162 | + * | |
47163 | + * Once the device is given the dma address, the device owns this memory until | |
47164 | + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. | |
47165 | + */ | |
47166 | +dma_addr_t | |
47167 | +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) | |
47168 | +{ | |
47169 | + dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) + | |
47170 | + offset_in_page(ptr); | |
47171 | + void *map; | |
47172 | + struct phys_addr buffer; | |
47173 | + | |
47174 | + BUG_ON(dir == DMA_NONE); | |
47175 | + | |
47176 | + /* | |
47177 | + * If the pointer passed in happens to be in the device's DMA window, | |
47178 | + * we can safely return the device addr and not worry about bounce | |
47179 | + * buffering it. | |
47180 | + */ | |
47181 | + if (!range_straddles_page_boundary(__pa(ptr), size) && | |
47182 | + !address_needs_mapping(hwdev, dev_addr)) | |
47183 | + return dev_addr; | |
47184 | + | |
47185 | + /* | |
47186 | + * Oh well, have to allocate and map a bounce buffer. | |
47187 | + */ | |
47188 | + gnttab_dma_unmap_page(dev_addr); | |
47189 | + buffer.page = virt_to_page(ptr); | |
47190 | + buffer.offset = (unsigned long)ptr & ~PAGE_MASK; | |
47191 | + map = map_single(hwdev, buffer, size, dir); | |
47192 | + if (!map) { | |
47193 | + swiotlb_full(hwdev, size, dir, 1); | |
47194 | + map = io_tlb_overflow_buffer; | |
47195 | + } | |
47196 | + | |
47197 | + dev_addr = virt_to_bus(map); | |
47198 | + return dev_addr; | |
47199 | +} | |
47200 | + | |
47201 | +/* | |
47202 | + * Unmap a single streaming mode DMA translation. The dma_addr and size must | |
47203 | + * match what was provided for in a previous swiotlb_map_single call. All | |
47204 | + * other usages are undefined. | |
47205 | + * | |
47206 | + * After this call, reads by the cpu to the buffer are guaranteed to see | |
47207 | + * whatever the device wrote there. | |
47208 | + */ | |
47209 | +void | |
47210 | +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, | |
47211 | + int dir) | |
47212 | +{ | |
47213 | + BUG_ON(dir == DMA_NONE); | |
47214 | + if (in_swiotlb_aperture(dev_addr)) | |
47215 | + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir); | |
47216 | + else | |
47217 | + gnttab_dma_unmap_page(dev_addr); | |
47218 | +} | |
47219 | + | |
47220 | +/* | |
47221 | + * Make physical memory consistent for a single streaming mode DMA translation | |
47222 | + * after a transfer. | |
47223 | + * | |
47224 | + * If you perform a swiotlb_map_single() but wish to interrogate the buffer | |
47225 | + * using the cpu, yet do not wish to teardown the PCI dma mapping, you must | |
47226 | + * call this function before doing so. At the next point you give the PCI dma | |
47227 | + * address back to the card, you must first perform a | |
47228 | + * swiotlb_sync_single_for_device, and then the device again owns the buffer. | |
47229 | + */ | |
47230 | +void | |
47231 | +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, | |
47232 | + size_t size, int dir) | |
47233 | +{ | |
47234 | + BUG_ON(dir == DMA_NONE); | |
47235 | + if (in_swiotlb_aperture(dev_addr)) | |
47236 | + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); | |
47237 | +} | |
47238 | + | |
47239 | +void | |
47240 | +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, | |
47241 | + size_t size, int dir) | |
47242 | +{ | |
47243 | + BUG_ON(dir == DMA_NONE); | |
47244 | + if (in_swiotlb_aperture(dev_addr)) | |
47245 | + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); | |
47246 | +} | |
47247 | + | |
47248 | +/* | |
47249 | + * Map a set of buffers described by scatterlist in streaming mode for DMA. | |
47250 | + * This is the scatter-gather version of the above swiotlb_map_single | |
47251 | + * interface. Here the scatter gather list elements are each tagged with the | |
47252 | + * appropriate dma address and length. They are obtained via | |
47253 | + * sg_dma_{address,length}(SG). | |
47254 | + * | |
47255 | + * NOTE: An implementation may be able to use a smaller number of | |
47256 | + * DMA address/length pairs than there are SG table elements. | |
47257 | + * (for example via virtual mapping capabilities) | |
47258 | + * The routine returns the number of addr/length pairs actually | |
47259 | + * used, at most nents. | |
47260 | + * | |
47261 | + * Device ownership issues as mentioned above for swiotlb_map_single are the | |
47262 | + * same here. | |
47263 | + */ | |
47264 | +int | |
47265 | +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | |
47266 | + int dir) | |
47267 | +{ | |
47268 | + struct phys_addr buffer; | |
47269 | + dma_addr_t dev_addr; | |
47270 | + char *map; | |
47271 | + int i; | |
47272 | + | |
47273 | + BUG_ON(dir == DMA_NONE); | |
47274 | + | |
47275 | + for (i = 0; i < nelems; i++, sg++) { | |
47276 | + dev_addr = gnttab_dma_map_page(sg->page) + sg->offset; | |
47277 | + | |
47278 | + if (range_straddles_page_boundary(page_to_pseudophys(sg->page) | |
47279 | + + sg->offset, sg->length) | |
47280 | + || address_needs_mapping(hwdev, dev_addr)) { | |
47281 | + gnttab_dma_unmap_page(dev_addr); | |
47282 | + buffer.page = sg->page; | |
47283 | + buffer.offset = sg->offset; | |
47284 | + map = map_single(hwdev, buffer, sg->length, dir); | |
47285 | + if (!map) { | |
47286 | + /* Don't panic here, we expect map_sg users | |
47287 | + to do proper error handling. */ | |
47288 | + swiotlb_full(hwdev, sg->length, dir, 0); | |
47289 | + swiotlb_unmap_sg(hwdev, sg - i, i, dir); | |
47290 | + sg[0].dma_length = 0; | |
47291 | + return 0; | |
47292 | + } | |
47293 | + sg->dma_address = (dma_addr_t)virt_to_bus(map); | |
47294 | + } else | |
47295 | + sg->dma_address = dev_addr; | |
47296 | + sg->dma_length = sg->length; | |
47297 | + } | |
47298 | + return nelems; | |
47299 | +} | |
47300 | + | |
47301 | +/* | |
47302 | + * Unmap a set of streaming mode DMA translations. Again, cpu read rules | |
47303 | + * concerning calls here are the same as for swiotlb_unmap_single() above. | |
47304 | + */ | |
47305 | +void | |
47306 | +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, | |
47307 | + int dir) | |
47308 | +{ | |
47309 | + int i; | |
47310 | + | |
47311 | + BUG_ON(dir == DMA_NONE); | |
47312 | + | |
47313 | + for (i = 0; i < nelems; i++, sg++) | |
47314 | + if (in_swiotlb_aperture(sg->dma_address)) | |
47315 | + unmap_single(hwdev, | |
47316 | + (void *)bus_to_virt(sg->dma_address), | |
47317 | + sg->dma_length, dir); | |
47318 | + else | |
47319 | + gnttab_dma_unmap_page(sg->dma_address); | |
47320 | +} | |
47321 | + | |
47322 | +/* | |
47323 | + * Make physical memory consistent for a set of streaming mode DMA translations | |
47324 | + * after a transfer. | |
47325 | + * | |
47326 | + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules | |
47327 | + * and usage. | |
47328 | + */ | |
47329 | +void | |
47330 | +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, | |
47331 | + int nelems, int dir) | |
47332 | +{ | |
47333 | + int i; | |
47334 | + | |
47335 | + BUG_ON(dir == DMA_NONE); | |
47336 | + | |
47337 | + for (i = 0; i < nelems; i++, sg++) | |
47338 | + if (in_swiotlb_aperture(sg->dma_address)) | |
47339 | + sync_single(hwdev, | |
47340 | + (void *)bus_to_virt(sg->dma_address), | |
47341 | + sg->dma_length, dir); | |
47342 | +} | |
47343 | + | |
47344 | +void | |
47345 | +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, | |
47346 | + int nelems, int dir) | |
47347 | +{ | |
47348 | + int i; | |
47349 | + | |
47350 | + BUG_ON(dir == DMA_NONE); | |
47351 | + | |
47352 | + for (i = 0; i < nelems; i++, sg++) | |
47353 | + if (in_swiotlb_aperture(sg->dma_address)) | |
47354 | + sync_single(hwdev, | |
47355 | + (void *)bus_to_virt(sg->dma_address), | |
47356 | + sg->dma_length, dir); | |
47357 | +} | |
47358 | + | |
47359 | +#ifdef CONFIG_HIGHMEM | |
47360 | + | |
47361 | +dma_addr_t | |
47362 | +swiotlb_map_page(struct device *hwdev, struct page *page, | |
47363 | + unsigned long offset, size_t size, | |
47364 | + enum dma_data_direction direction) | |
47365 | +{ | |
47366 | + struct phys_addr buffer; | |
47367 | + dma_addr_t dev_addr; | |
47368 | + char *map; | |
47369 | + | |
47370 | + dev_addr = gnttab_dma_map_page(page) + offset; | |
47371 | + if (address_needs_mapping(hwdev, dev_addr)) { | |
47372 | + gnttab_dma_unmap_page(dev_addr); | |
47373 | + buffer.page = page; | |
47374 | + buffer.offset = offset; | |
47375 | + map = map_single(hwdev, buffer, size, direction); | |
47376 | + if (!map) { | |
47377 | + swiotlb_full(hwdev, size, direction, 1); | |
47378 | + map = io_tlb_overflow_buffer; | |
47379 | + } | |
47380 | + dev_addr = (dma_addr_t)virt_to_bus(map); | |
47381 | + } | |
47382 | + | |
47383 | + return dev_addr; | |
47384 | +} | |
47385 | + | |
47386 | +void | |
47387 | +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, | |
47388 | + size_t size, enum dma_data_direction direction) | |
47389 | +{ | |
47390 | + BUG_ON(direction == DMA_NONE); | |
47391 | + if (in_swiotlb_aperture(dma_address)) | |
47392 | + unmap_single(hwdev, bus_to_virt(dma_address), size, direction); | |
47393 | + else | |
47394 | + gnttab_dma_unmap_page(dma_address); | |
47395 | +} | |
47396 | + | |
47397 | +#endif | |
47398 | + | |
47399 | +int | |
47400 | +swiotlb_dma_mapping_error(dma_addr_t dma_addr) | |
47401 | +{ | |
47402 | + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); | |
47403 | +} | |
47404 | + | |
47405 | +/* | |
47406 | + * Return whether the given PCI device DMA address mask can be supported | |
47407 | + * properly. For example, if your device can only drive the low 24-bits | |
47408 | + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to | |
47409 | + * this function. | |
47410 | + */ | |
47411 | +int | |
47412 | +swiotlb_dma_supported (struct device *hwdev, u64 mask) | |
47413 | +{ | |
47414 | + return (mask >= ((1UL << dma_bits) - 1)); | |
47415 | +} | |
47416 | + | |
47417 | +EXPORT_SYMBOL(swiotlb_init); | |
47418 | +EXPORT_SYMBOL(swiotlb_map_single); | |
47419 | +EXPORT_SYMBOL(swiotlb_unmap_single); | |
47420 | +EXPORT_SYMBOL(swiotlb_map_sg); | |
47421 | +EXPORT_SYMBOL(swiotlb_unmap_sg); | |
47422 | +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); | |
47423 | +EXPORT_SYMBOL(swiotlb_sync_single_for_device); | |
47424 | +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); | |
47425 | +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); | |
47426 | +EXPORT_SYMBOL(swiotlb_dma_mapping_error); | |
47427 | +EXPORT_SYMBOL(swiotlb_dma_supported); | |
47428 | Index: head-2008-11-25/scripts/Makefile.xen.awk | |
47429 | =================================================================== | |
47430 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | |
47431 | +++ head-2008-11-25/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200 | |
47432 | @@ -0,0 +1,34 @@ | |
47433 | +BEGIN { | |
47434 | + is_rule = 0 | |
47435 | +} | |
47436 | + | |
47437 | +/^[[:space:]]*#/ { | |
47438 | + next | |
47439 | +} | |
47440 | + | |
47441 | +/^[[:space:]]*$/ { | |
47442 | + if (is_rule) | |
47443 | + print("") | |
47444 | + is_rule = 0 | |
47445 | + next | |
47446 | +} | |
47447 | + | |
47448 | +/:[[:space:]]*%\.[cS][[:space:]]/ { | |
47449 | + line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0) | |
47450 | + line = gensub(/(single-used-m)/, "xen-\\1", "g", line) | |
47451 | + print line | |
47452 | + is_rule = 1 | |
47453 | + next | |
47454 | +} | |
47455 | + | |
47456 | +/^[^\t]$/ { | |
47457 | + if (is_rule) | |
47458 | + print("") | |
47459 | + is_rule = 0 | |
47460 | + next | |
47461 | +} | |
47462 | + | |
47463 | +is_rule { | |
47464 | + print $0 | |
47465 | + next | |
47466 | +} |