]>
Commit | Line | Data |
---|---|---|
549e984e SL |
1 | /* |
2 | * Copyright (c) 2003-2004 Fabrice Bellard | |
3 | * Copyright (c) 2019 Red Hat, Inc. | |
4 | * | |
5 | * Permission is hereby granted, free of charge, to any person obtaining a copy | |
6 | * of this software and associated documentation files (the "Software"), to deal | |
7 | * in the Software without restriction, including without limitation the rights | |
8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 | * copies of the Software, and to permit persons to whom the Software is | |
10 | * furnished to do so, subject to the following conditions: | |
11 | * | |
12 | * The above copyright notice and this permission notice shall be included in | |
13 | * all copies or substantial portions of the Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
21 | * THE SOFTWARE. | |
22 | */ | |
23 | #include "qemu/osdep.h" | |
24 | #include "qemu/error-report.h" | |
25 | #include "qemu/option.h" | |
26 | #include "qemu/cutils.h" | |
27 | #include "qemu/units.h" | |
28 | #include "qemu-common.h" | |
29 | #include "qapi/error.h" | |
30 | #include "qapi/qmp/qerror.h" | |
31 | #include "qapi/qapi-visit-common.h" | |
32 | #include "qapi/visitor.h" | |
33 | #include "sysemu/qtest.h" | |
34 | #include "sysemu/numa.h" | |
35 | #include "sysemu/replay.h" | |
36 | #include "sysemu/sysemu.h" | |
89a289c7 | 37 | #include "trace.h" |
549e984e SL |
38 | |
39 | #include "hw/i386/x86.h" | |
549e984e SL |
40 | #include "target/i386/cpu.h" |
41 | #include "hw/i386/topology.h" | |
42 | #include "hw/i386/fw_cfg.h" | |
852c27e2 | 43 | #include "hw/intc/i8259.h" |
549e984e SL |
44 | |
45 | #include "hw/acpi/cpu_hotplug.h" | |
89a289c7 | 46 | #include "hw/irq.h" |
549e984e SL |
47 | #include "hw/nmi.h" |
48 | #include "hw/loader.h" | |
49 | #include "multiboot.h" | |
50 | #include "elf.h" | |
51 | #include "standard-headers/asm-x86/bootparam.h" | |
89a289c7 PB |
52 | #include "config-devices.h" |
53 | #include "kvm_i386.h" | |
549e984e SL |
54 | |
55 | #define BIOS_FILENAME "bios.bin" | |
56 | ||
57 | /* Physical Address of PVH entry point read from kernel ELF NOTE */ | |
58 | static size_t pvh_start_addr; | |
59 | ||
53a5e7bd BM |
60 | inline void init_topo_info(X86CPUTopoInfo *topo_info, |
61 | const X86MachineState *x86ms) | |
62 | { | |
63 | MachineState *ms = MACHINE(x86ms); | |
64 | ||
c24a41bb | 65 | topo_info->nodes_per_pkg = ms->numa_state->num_nodes / ms->smp.sockets; |
53a5e7bd BM |
66 | topo_info->dies_per_pkg = x86ms->smp_dies; |
67 | topo_info->cores_per_die = ms->smp.cores; | |
68 | topo_info->threads_per_core = ms->smp.threads; | |
69 | } | |
70 | ||
549e984e SL |
71 | /* |
72 | * Calculates initial APIC ID for a specific CPU index | |
73 | * | |
74 | * Currently we need to be able to calculate the APIC ID from the CPU index | |
75 | * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have | |
76 | * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of | |
77 | * all CPUs up to max_cpus. | |
78 | */ | |
703a548a | 79 | uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms, |
549e984e SL |
80 | unsigned int cpu_index) |
81 | { | |
f0bb276b | 82 | X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms); |
53a5e7bd | 83 | X86CPUTopoInfo topo_info; |
549e984e SL |
84 | uint32_t correct_id; |
85 | static bool warned; | |
86 | ||
53a5e7bd BM |
87 | init_topo_info(&topo_info, x86ms); |
88 | ||
89 | correct_id = x86_apicid_from_cpu_idx(&topo_info, cpu_index); | |
f0bb276b | 90 | if (x86mc->compat_apic_id_mode) { |
549e984e SL |
91 | if (cpu_index != correct_id && !warned && !qtest_enabled()) { |
92 | error_report("APIC IDs set in compatibility mode, " | |
93 | "CPU topology won't match the configuration"); | |
94 | warned = true; | |
95 | } | |
96 | return cpu_index; | |
97 | } else { | |
98 | return correct_id; | |
99 | } | |
100 | } | |
101 | ||
703a548a SL |
102 | |
103 | void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp) | |
549e984e SL |
104 | { |
105 | Object *cpu = NULL; | |
106 | Error *local_err = NULL; | |
549e984e | 107 | |
703a548a | 108 | cpu = object_new(MACHINE(x86ms)->cpu_type); |
549e984e | 109 | |
549e984e SL |
110 | object_property_set_uint(cpu, apic_id, "apic-id", &local_err); |
111 | object_property_set_bool(cpu, true, "realized", &local_err); | |
112 | ||
113 | object_unref(cpu); | |
114 | error_propagate(errp, local_err); | |
115 | } | |
116 | ||
703a548a | 117 | void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) |
549e984e SL |
118 | { |
119 | int i; | |
120 | const CPUArchIdList *possible_cpus; | |
703a548a SL |
121 | MachineState *ms = MACHINE(x86ms); |
122 | MachineClass *mc = MACHINE_GET_CLASS(x86ms); | |
549e984e | 123 | |
703a548a | 124 | x86_cpu_set_default_version(default_cpu_version); |
549e984e SL |
125 | |
126 | /* | |
127 | * Calculates the limit to CPU APIC ID values | |
128 | * | |
129 | * Limit for the APIC ID value, so that all | |
703a548a | 130 | * CPU APIC IDs are < x86ms->apic_id_limit. |
549e984e SL |
131 | * |
132 | * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create(). | |
133 | */ | |
703a548a | 134 | x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, |
f0bb276b | 135 | ms->smp.max_cpus - 1) + 1; |
549e984e SL |
136 | possible_cpus = mc->possible_cpu_arch_ids(ms); |
137 | for (i = 0; i < ms->smp.cpus; i++) { | |
703a548a | 138 | x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); |
549e984e SL |
139 | } |
140 | } | |
141 | ||
142 | CpuInstanceProperties | |
143 | x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index) | |
144 | { | |
145 | MachineClass *mc = MACHINE_GET_CLASS(ms); | |
146 | const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); | |
147 | ||
148 | assert(cpu_index < possible_cpus->len); | |
149 | return possible_cpus->cpus[cpu_index].props; | |
150 | } | |
151 | ||
152 | int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx) | |
153 | { | |
dcf08bc6 | 154 | X86CPUTopoIDs topo_ids; |
f0bb276b | 155 | X86MachineState *x86ms = X86_MACHINE(ms); |
53a5e7bd BM |
156 | X86CPUTopoInfo topo_info; |
157 | ||
158 | init_topo_info(&topo_info, x86ms); | |
549e984e SL |
159 | |
160 | assert(idx < ms->possible_cpus->len); | |
161 | x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id, | |
53a5e7bd | 162 | &topo_info, &topo_ids); |
dcf08bc6 | 163 | return topo_ids.pkg_id % ms->numa_state->num_nodes; |
549e984e SL |
164 | } |
165 | ||
166 | const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms) | |
167 | { | |
f0bb276b | 168 | X86MachineState *x86ms = X86_MACHINE(ms); |
549e984e | 169 | unsigned int max_cpus = ms->smp.max_cpus; |
53a5e7bd BM |
170 | X86CPUTopoInfo topo_info; |
171 | int i; | |
549e984e SL |
172 | |
173 | if (ms->possible_cpus) { | |
174 | /* | |
175 | * make sure that max_cpus hasn't changed since the first use, i.e. | |
176 | * -smp hasn't been parsed after it | |
177 | */ | |
178 | assert(ms->possible_cpus->len == max_cpus); | |
179 | return ms->possible_cpus; | |
180 | } | |
181 | ||
182 | ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + | |
183 | sizeof(CPUArchId) * max_cpus); | |
184 | ms->possible_cpus->len = max_cpus; | |
53a5e7bd BM |
185 | |
186 | init_topo_info(&topo_info, x86ms); | |
187 | ||
549e984e | 188 | for (i = 0; i < ms->possible_cpus->len; i++) { |
dcf08bc6 | 189 | X86CPUTopoIDs topo_ids; |
549e984e SL |
190 | |
191 | ms->possible_cpus->cpus[i].type = ms->cpu_type; | |
192 | ms->possible_cpus->cpus[i].vcpus_count = 1; | |
193 | ms->possible_cpus->cpus[i].arch_id = | |
703a548a | 194 | x86_cpu_apic_id_from_index(x86ms, i); |
549e984e | 195 | x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, |
53a5e7bd | 196 | &topo_info, &topo_ids); |
549e984e | 197 | ms->possible_cpus->cpus[i].props.has_socket_id = true; |
dcf08bc6 | 198 | ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id; |
f0bb276b | 199 | if (x86ms->smp_dies > 1) { |
549e984e | 200 | ms->possible_cpus->cpus[i].props.has_die_id = true; |
dcf08bc6 | 201 | ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id; |
549e984e SL |
202 | } |
203 | ms->possible_cpus->cpus[i].props.has_core_id = true; | |
dcf08bc6 | 204 | ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id; |
549e984e | 205 | ms->possible_cpus->cpus[i].props.has_thread_id = true; |
dcf08bc6 | 206 | ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id; |
549e984e SL |
207 | } |
208 | return ms->possible_cpus; | |
209 | } | |
210 | ||
f0bb276b PB |
211 | static void x86_nmi(NMIState *n, int cpu_index, Error **errp) |
212 | { | |
213 | /* cpu index isn't used */ | |
214 | CPUState *cs; | |
215 | ||
216 | CPU_FOREACH(cs) { | |
217 | X86CPU *cpu = X86_CPU(cs); | |
218 | ||
219 | if (!cpu->apic_state) { | |
220 | cpu_interrupt(cs, CPU_INTERRUPT_NMI); | |
221 | } else { | |
222 | apic_deliver_nmi(cpu->apic_state); | |
223 | } | |
224 | } | |
225 | } | |
226 | ||
549e984e SL |
/*
 * Return the size of the stream @f as reported by ftell() at end-of-file.
 * The stream position in effect on entry is restored before returning.
 */
static long get_file_size(FILE *f)
{
    /* XXX: on Unix systems, using fstat() probably makes more sense */
    const long saved_pos = ftell(f);
    long total;

    fseek(f, 0, SEEK_END);
    total = ftell(f);
    fseek(f, saved_pos, SEEK_SET);

    return total;
}
240 | ||
89a289c7 PB |
241 | /* TSC handling */ |
242 | uint64_t cpu_get_tsc(CPUX86State *env) | |
243 | { | |
244 | return cpu_get_ticks(); | |
245 | } | |
246 | ||
247 | /* IRQ handling */ | |
248 | static void pic_irq_request(void *opaque, int irq, int level) | |
249 | { | |
250 | CPUState *cs = first_cpu; | |
251 | X86CPU *cpu = X86_CPU(cs); | |
252 | ||
253 | trace_x86_pic_interrupt(irq, level); | |
254 | if (cpu->apic_state && !kvm_irqchip_in_kernel()) { | |
255 | CPU_FOREACH(cs) { | |
256 | cpu = X86_CPU(cs); | |
257 | if (apic_accept_pic_intr(cpu->apic_state)) { | |
258 | apic_deliver_pic_intr(cpu->apic_state, level); | |
259 | } | |
260 | } | |
261 | } else { | |
262 | if (level) { | |
263 | cpu_interrupt(cs, CPU_INTERRUPT_HARD); | |
264 | } else { | |
265 | cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); | |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | qemu_irq x86_allocate_cpu_irq(void) | |
271 | { | |
272 | return qemu_allocate_irq(pic_irq_request, NULL, 0); | |
273 | } | |
274 | ||
275 | int cpu_get_pic_interrupt(CPUX86State *env) | |
276 | { | |
277 | X86CPU *cpu = env_archcpu(env); | |
278 | int intno; | |
279 | ||
280 | if (!kvm_irqchip_in_kernel()) { | |
281 | intno = apic_get_interrupt(cpu->apic_state); | |
282 | if (intno >= 0) { | |
283 | return intno; | |
284 | } | |
285 | /* read the irq from the PIC */ | |
286 | if (!apic_accept_pic_intr(cpu->apic_state)) { | |
287 | return -1; | |
288 | } | |
289 | } | |
290 | ||
291 | intno = pic_read_irq(isa_pic); | |
292 | return intno; | |
293 | } | |
294 | ||
295 | DeviceState *cpu_get_current_apic(void) | |
296 | { | |
297 | if (current_cpu) { | |
298 | X86CPU *cpu = X86_CPU(current_cpu); | |
299 | return cpu->apic_state; | |
300 | } else { | |
301 | return NULL; | |
302 | } | |
303 | } | |
304 | ||
305 | void gsi_handler(void *opaque, int n, int level) | |
306 | { | |
307 | GSIState *s = opaque; | |
308 | ||
309 | trace_x86_gsi_interrupt(n, level); | |
310 | if (n < ISA_NUM_IRQS) { | |
64c033ba | 311 | /* Under KVM, Kernel will forward to both PIC and IOAPIC */ |
89a289c7 PB |
312 | qemu_set_irq(s->i8259_irq[n], level); |
313 | } | |
314 | qemu_set_irq(s->ioapic_irq[n], level); | |
315 | } | |
316 | ||
317 | void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name) | |
318 | { | |
319 | DeviceState *dev; | |
320 | SysBusDevice *d; | |
321 | unsigned int i; | |
322 | ||
14a1bb48 | 323 | assert(parent_name); |
89a289c7 PB |
324 | if (kvm_ioapic_in_kernel()) { |
325 | dev = qdev_create(NULL, TYPE_KVM_IOAPIC); | |
326 | } else { | |
327 | dev = qdev_create(NULL, TYPE_IOAPIC); | |
328 | } | |
14a1bb48 PMD |
329 | object_property_add_child(object_resolve_path(parent_name, NULL), |
330 | "ioapic", OBJECT(dev), NULL); | |
89a289c7 PB |
331 | qdev_init_nofail(dev); |
332 | d = SYS_BUS_DEVICE(dev); | |
333 | sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS); | |
334 | ||
335 | for (i = 0; i < IOAPIC_NUM_PINS; i++) { | |
336 | gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i); | |
337 | } | |
338 | } | |
339 | ||
549e984e SL |
/*
 * Header of a setup_data node appended to the kernel image (this file uses
 * it to hand a device tree blob to the kernel). The structure is packed:
 * 16 bytes of header immediately followed by the payload.
 */
struct setup_data {
    uint64_t next;      /* address of the next node; 0 for the last one */
    uint32_t type;      /* payload type, e.g. SETUP_DTB */
    uint32_t len;       /* payload length in bytes */
    uint8_t data[];     /* payload, @len bytes */
} __attribute__((packed));
346 | ||
347 | ||
348 | /* | |
349 | * The entry point into the kernel for PVH boot is different from | |
350 | * the native entry point. The PVH entry is defined by the x86/HVM | |
351 | * direct boot ABI and is available in an ELFNOTE in the kernel binary. | |
352 | * | |
353 | * This function is passed to load_elf() when it is called from | |
354 | * load_elfboot() which then additionally checks for an ELF Note of | |
355 | * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to | |
356 | * parse the PVH entry address from the ELF Note. | |
357 | * | |
358 | * Due to trickery in elf_opts.h, load_elf() is actually available as | |
359 | * load_elf32() or load_elf64() and this routine needs to be able | |
360 | * to deal with being called as 32 or 64 bit. | |
361 | * | |
362 | * The address of the PVH entry point is saved to the 'pvh_start_addr' | |
363 | * global variable. (although the entry point is 32-bit, the kernel | |
364 | * binary can be either 32-bit or 64-bit). | |
365 | */ | |
366 | static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64) | |
367 | { | |
368 | size_t *elf_note_data_addr; | |
369 | ||
370 | /* Check if ELF Note header passed in is valid */ | |
371 | if (arg1 == NULL) { | |
372 | return 0; | |
373 | } | |
374 | ||
375 | if (is64) { | |
376 | struct elf64_note *nhdr64 = (struct elf64_note *)arg1; | |
377 | uint64_t nhdr_size64 = sizeof(struct elf64_note); | |
378 | uint64_t phdr_align = *(uint64_t *)arg2; | |
379 | uint64_t nhdr_namesz = nhdr64->n_namesz; | |
380 | ||
381 | elf_note_data_addr = | |
382 | ((void *)nhdr64) + nhdr_size64 + | |
383 | QEMU_ALIGN_UP(nhdr_namesz, phdr_align); | |
384 | } else { | |
385 | struct elf32_note *nhdr32 = (struct elf32_note *)arg1; | |
386 | uint32_t nhdr_size32 = sizeof(struct elf32_note); | |
387 | uint32_t phdr_align = *(uint32_t *)arg2; | |
388 | uint32_t nhdr_namesz = nhdr32->n_namesz; | |
389 | ||
390 | elf_note_data_addr = | |
391 | ((void *)nhdr32) + nhdr_size32 + | |
392 | QEMU_ALIGN_UP(nhdr_namesz, phdr_align); | |
393 | } | |
394 | ||
395 | pvh_start_addr = *elf_note_data_addr; | |
396 | ||
397 | return pvh_start_addr; | |
398 | } | |
399 | ||
400 | static bool load_elfboot(const char *kernel_filename, | |
401 | int kernel_file_size, | |
402 | uint8_t *header, | |
403 | size_t pvh_xen_start_addr, | |
404 | FWCfgState *fw_cfg) | |
405 | { | |
406 | uint32_t flags = 0; | |
407 | uint32_t mh_load_addr = 0; | |
408 | uint32_t elf_kernel_size = 0; | |
409 | uint64_t elf_entry; | |
410 | uint64_t elf_low, elf_high; | |
411 | int kernel_size; | |
412 | ||
413 | if (ldl_p(header) != 0x464c457f) { | |
414 | return false; /* no elfboot */ | |
415 | } | |
416 | ||
417 | bool elf_is64 = header[EI_CLASS] == ELFCLASS64; | |
418 | flags = elf_is64 ? | |
419 | ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags; | |
420 | ||
421 | if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */ | |
422 | error_report("elfboot unsupported flags = %x", flags); | |
423 | exit(1); | |
424 | } | |
425 | ||
426 | uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY; | |
427 | kernel_size = load_elf(kernel_filename, read_pvh_start_addr, | |
428 | NULL, &elf_note_type, &elf_entry, | |
6cdda0ff | 429 | &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, |
549e984e SL |
430 | 0, 0); |
431 | ||
432 | if (kernel_size < 0) { | |
433 | error_report("Error while loading elf kernel"); | |
434 | exit(1); | |
435 | } | |
436 | mh_load_addr = elf_low; | |
437 | elf_kernel_size = elf_high - elf_low; | |
438 | ||
439 | if (pvh_start_addr == 0) { | |
440 | error_report("Error loading uncompressed kernel without PVH ELF Note"); | |
441 | exit(1); | |
442 | } | |
443 | fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr); | |
444 | fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr); | |
445 | fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size); | |
446 | ||
447 | return true; | |
448 | } | |
449 | ||
703a548a SL |
450 | void x86_load_linux(X86MachineState *x86ms, |
451 | FWCfgState *fw_cfg, | |
452 | int acpi_data_size, | |
453 | bool pvh_enabled, | |
454 | bool linuxboot_dma_enabled) | |
549e984e SL |
455 | { |
456 | uint16_t protocol; | |
457 | int setup_size, kernel_size, cmdline_size; | |
458 | int dtb_size, setup_data_offset; | |
459 | uint32_t initrd_max; | |
460 | uint8_t header[8192], *setup, *kernel; | |
461 | hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0; | |
462 | FILE *f; | |
463 | char *vmode; | |
703a548a | 464 | MachineState *machine = MACHINE(x86ms); |
549e984e SL |
465 | struct setup_data *setup_data; |
466 | const char *kernel_filename = machine->kernel_filename; | |
467 | const char *initrd_filename = machine->initrd_filename; | |
468 | const char *dtb_filename = machine->dtb; | |
469 | const char *kernel_cmdline = machine->kernel_cmdline; | |
470 | ||
471 | /* Align to 16 bytes as a paranoia measure */ | |
472 | cmdline_size = (strlen(kernel_cmdline) + 16) & ~15; | |
473 | ||
474 | /* load the kernel header */ | |
475 | f = fopen(kernel_filename, "rb"); | |
476 | if (!f) { | |
477 | fprintf(stderr, "qemu: could not open kernel file '%s': %s\n", | |
478 | kernel_filename, strerror(errno)); | |
479 | exit(1); | |
480 | } | |
481 | ||
482 | kernel_size = get_file_size(f); | |
483 | if (!kernel_size || | |
484 | fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) != | |
485 | MIN(ARRAY_SIZE(header), kernel_size)) { | |
486 | fprintf(stderr, "qemu: could not load kernel '%s': %s\n", | |
487 | kernel_filename, strerror(errno)); | |
488 | exit(1); | |
489 | } | |
490 | ||
491 | /* kernel protocol version */ | |
492 | if (ldl_p(header + 0x202) == 0x53726448) { | |
493 | protocol = lduw_p(header + 0x206); | |
494 | } else { | |
495 | /* | |
496 | * This could be a multiboot kernel. If it is, let's stop treating it | |
497 | * like a Linux kernel. | |
498 | * Note: some multiboot images could be in the ELF format (the same of | |
499 | * PVH), so we try multiboot first since we check the multiboot magic | |
500 | * header before to load it. | |
501 | */ | |
502 | if (load_multiboot(fw_cfg, f, kernel_filename, initrd_filename, | |
503 | kernel_cmdline, kernel_size, header)) { | |
504 | return; | |
505 | } | |
506 | /* | |
507 | * Check if the file is an uncompressed kernel file (ELF) and load it, | |
508 | * saving the PVH entry point used by the x86/HVM direct boot ABI. | |
509 | * If load_elfboot() is successful, populate the fw_cfg info. | |
510 | */ | |
703a548a | 511 | if (pvh_enabled && |
549e984e SL |
512 | load_elfboot(kernel_filename, kernel_size, |
513 | header, pvh_start_addr, fw_cfg)) { | |
514 | fclose(f); | |
515 | ||
516 | fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, | |
517 | strlen(kernel_cmdline) + 1); | |
518 | fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); | |
519 | ||
520 | fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header)); | |
521 | fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, | |
522 | header, sizeof(header)); | |
523 | ||
524 | /* load initrd */ | |
525 | if (initrd_filename) { | |
526 | GMappedFile *mapped_file; | |
527 | gsize initrd_size; | |
528 | gchar *initrd_data; | |
529 | GError *gerr = NULL; | |
530 | ||
531 | mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); | |
532 | if (!mapped_file) { | |
533 | fprintf(stderr, "qemu: error reading initrd %s: %s\n", | |
534 | initrd_filename, gerr->message); | |
535 | exit(1); | |
536 | } | |
f0bb276b | 537 | x86ms->initrd_mapped_file = mapped_file; |
549e984e SL |
538 | |
539 | initrd_data = g_mapped_file_get_contents(mapped_file); | |
540 | initrd_size = g_mapped_file_get_length(mapped_file); | |
703a548a | 541 | initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; |
549e984e SL |
542 | if (initrd_size >= initrd_max) { |
543 | fprintf(stderr, "qemu: initrd is too large, cannot support." | |
544 | "(max: %"PRIu32", need %"PRId64")\n", | |
545 | initrd_max, (uint64_t)initrd_size); | |
546 | exit(1); | |
547 | } | |
548 | ||
549 | initrd_addr = (initrd_max - initrd_size) & ~4095; | |
550 | ||
551 | fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); | |
552 | fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); | |
553 | fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, | |
554 | initrd_size); | |
555 | } | |
556 | ||
557 | option_rom[nb_option_roms].bootindex = 0; | |
558 | option_rom[nb_option_roms].name = "pvh.bin"; | |
559 | nb_option_roms++; | |
560 | ||
561 | return; | |
562 | } | |
563 | protocol = 0; | |
564 | } | |
565 | ||
566 | if (protocol < 0x200 || !(header[0x211] & 0x01)) { | |
567 | /* Low kernel */ | |
568 | real_addr = 0x90000; | |
569 | cmdline_addr = 0x9a000 - cmdline_size; | |
570 | prot_addr = 0x10000; | |
571 | } else if (protocol < 0x202) { | |
572 | /* High but ancient kernel */ | |
573 | real_addr = 0x90000; | |
574 | cmdline_addr = 0x9a000 - cmdline_size; | |
575 | prot_addr = 0x100000; | |
576 | } else { | |
577 | /* High and recent kernel */ | |
578 | real_addr = 0x10000; | |
579 | cmdline_addr = 0x20000; | |
580 | prot_addr = 0x100000; | |
581 | } | |
582 | ||
583 | /* highest address for loading the initrd */ | |
584 | if (protocol >= 0x20c && | |
585 | lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { | |
586 | /* | |
587 | * Linux has supported initrd up to 4 GB for a very long time (2007, | |
588 | * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013), | |
589 | * though it only sets initrd_max to 2 GB to "work around bootloader | |
590 | * bugs". Luckily, QEMU firmware(which does something like bootloader) | |
591 | * has supported this. | |
592 | * | |
593 | * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can | |
594 | * be loaded into any address. | |
595 | * | |
596 | * In addition, initrd_max is uint32_t simply because QEMU doesn't | |
597 | * support the 64-bit boot protocol (specifically the ext_ramdisk_image | |
598 | * field). | |
599 | * | |
600 | * Therefore here just limit initrd_max to UINT32_MAX simply as well. | |
601 | */ | |
602 | initrd_max = UINT32_MAX; | |
603 | } else if (protocol >= 0x203) { | |
604 | initrd_max = ldl_p(header + 0x22c); | |
605 | } else { | |
606 | initrd_max = 0x37ffffff; | |
607 | } | |
608 | ||
703a548a SL |
609 | if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) { |
610 | initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; | |
549e984e SL |
611 | } |
612 | ||
613 | fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); | |
614 | fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); | |
615 | fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); | |
616 | ||
617 | if (protocol >= 0x202) { | |
618 | stl_p(header + 0x228, cmdline_addr); | |
619 | } else { | |
620 | stw_p(header + 0x20, 0xA33F); | |
621 | stw_p(header + 0x22, cmdline_addr - real_addr); | |
622 | } | |
623 | ||
624 | /* handle vga= parameter */ | |
625 | vmode = strstr(kernel_cmdline, "vga="); | |
626 | if (vmode) { | |
627 | unsigned int video_mode; | |
a88c40f0 | 628 | const char *end; |
549e984e SL |
629 | int ret; |
630 | /* skip "vga=" */ | |
631 | vmode += 4; | |
632 | if (!strncmp(vmode, "normal", 6)) { | |
633 | video_mode = 0xffff; | |
634 | } else if (!strncmp(vmode, "ext", 3)) { | |
635 | video_mode = 0xfffe; | |
636 | } else if (!strncmp(vmode, "ask", 3)) { | |
637 | video_mode = 0xfffd; | |
638 | } else { | |
a88c40f0 PW |
639 | ret = qemu_strtoui(vmode, &end, 0, &video_mode); |
640 | if (ret != 0 || (*end && *end != ' ')) { | |
641 | fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n"); | |
549e984e SL |
642 | exit(1); |
643 | } | |
644 | } | |
645 | stw_p(header + 0x1fa, video_mode); | |
646 | } | |
647 | ||
648 | /* loader type */ | |
649 | /* | |
650 | * High nybble = B reserved for QEMU; low nybble is revision number. | |
651 | * If this code is substantially changed, you may want to consider | |
652 | * incrementing the revision. | |
653 | */ | |
654 | if (protocol >= 0x200) { | |
655 | header[0x210] = 0xB0; | |
656 | } | |
657 | /* heap */ | |
658 | if (protocol >= 0x201) { | |
659 | header[0x211] |= 0x80; /* CAN_USE_HEAP */ | |
660 | stw_p(header + 0x224, cmdline_addr - real_addr - 0x200); | |
661 | } | |
662 | ||
663 | /* load initrd */ | |
664 | if (initrd_filename) { | |
665 | GMappedFile *mapped_file; | |
666 | gsize initrd_size; | |
667 | gchar *initrd_data; | |
668 | GError *gerr = NULL; | |
669 | ||
670 | if (protocol < 0x200) { | |
671 | fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n"); | |
672 | exit(1); | |
673 | } | |
674 | ||
675 | mapped_file = g_mapped_file_new(initrd_filename, false, &gerr); | |
676 | if (!mapped_file) { | |
677 | fprintf(stderr, "qemu: error reading initrd %s: %s\n", | |
678 | initrd_filename, gerr->message); | |
679 | exit(1); | |
680 | } | |
f0bb276b | 681 | x86ms->initrd_mapped_file = mapped_file; |
549e984e SL |
682 | |
683 | initrd_data = g_mapped_file_get_contents(mapped_file); | |
684 | initrd_size = g_mapped_file_get_length(mapped_file); | |
685 | if (initrd_size >= initrd_max) { | |
686 | fprintf(stderr, "qemu: initrd is too large, cannot support." | |
687 | "(max: %"PRIu32", need %"PRId64")\n", | |
688 | initrd_max, (uint64_t)initrd_size); | |
689 | exit(1); | |
690 | } | |
691 | ||
692 | initrd_addr = (initrd_max - initrd_size) & ~4095; | |
693 | ||
694 | fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr); | |
695 | fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size); | |
696 | fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size); | |
697 | ||
698 | stl_p(header + 0x218, initrd_addr); | |
699 | stl_p(header + 0x21c, initrd_size); | |
700 | } | |
701 | ||
702 | /* load kernel and setup */ | |
703 | setup_size = header[0x1f1]; | |
704 | if (setup_size == 0) { | |
705 | setup_size = 4; | |
706 | } | |
707 | setup_size = (setup_size + 1) * 512; | |
708 | if (setup_size > kernel_size) { | |
709 | fprintf(stderr, "qemu: invalid kernel header\n"); | |
710 | exit(1); | |
711 | } | |
712 | kernel_size -= setup_size; | |
713 | ||
714 | setup = g_malloc(setup_size); | |
715 | kernel = g_malloc(kernel_size); | |
716 | fseek(f, 0, SEEK_SET); | |
717 | if (fread(setup, 1, setup_size, f) != setup_size) { | |
718 | fprintf(stderr, "fread() failed\n"); | |
719 | exit(1); | |
720 | } | |
721 | if (fread(kernel, 1, kernel_size, f) != kernel_size) { | |
722 | fprintf(stderr, "fread() failed\n"); | |
723 | exit(1); | |
724 | } | |
725 | fclose(f); | |
726 | ||
727 | /* append dtb to kernel */ | |
728 | if (dtb_filename) { | |
729 | if (protocol < 0x209) { | |
730 | fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n"); | |
731 | exit(1); | |
732 | } | |
733 | ||
734 | dtb_size = get_image_size(dtb_filename); | |
735 | if (dtb_size <= 0) { | |
736 | fprintf(stderr, "qemu: error reading dtb %s: %s\n", | |
737 | dtb_filename, strerror(errno)); | |
738 | exit(1); | |
739 | } | |
740 | ||
741 | setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16); | |
742 | kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size; | |
743 | kernel = g_realloc(kernel, kernel_size); | |
744 | ||
745 | stq_p(header + 0x250, prot_addr + setup_data_offset); | |
746 | ||
747 | setup_data = (struct setup_data *)(kernel + setup_data_offset); | |
748 | setup_data->next = 0; | |
749 | setup_data->type = cpu_to_le32(SETUP_DTB); | |
750 | setup_data->len = cpu_to_le32(dtb_size); | |
751 | ||
752 | load_image_size(dtb_filename, setup_data->data, dtb_size); | |
753 | } | |
754 | ||
755 | memcpy(setup, header, MIN(sizeof(header), setup_size)); | |
756 | ||
757 | fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr); | |
758 | fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size); | |
759 | fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size); | |
760 | ||
761 | fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr); | |
762 | fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size); | |
763 | fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size); | |
764 | ||
765 | option_rom[nb_option_roms].bootindex = 0; | |
766 | option_rom[nb_option_roms].name = "linuxboot.bin"; | |
703a548a | 767 | if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) { |
549e984e SL |
768 | option_rom[nb_option_roms].name = "linuxboot_dma.bin"; |
769 | } | |
770 | nb_option_roms++; | |
771 | } | |
772 | ||
773 | void x86_bios_rom_init(MemoryRegion *rom_memory, bool isapc_ram_fw) | |
774 | { | |
775 | char *filename; | |
776 | MemoryRegion *bios, *isa_bios; | |
777 | int bios_size, isa_bios_size; | |
778 | int ret; | |
779 | ||
780 | /* BIOS load */ | |
781 | if (bios_name == NULL) { | |
782 | bios_name = BIOS_FILENAME; | |
783 | } | |
784 | filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); | |
785 | if (filename) { | |
786 | bios_size = get_image_size(filename); | |
787 | } else { | |
788 | bios_size = -1; | |
789 | } | |
790 | if (bios_size <= 0 || | |
791 | (bios_size % 65536) != 0) { | |
792 | goto bios_error; | |
793 | } | |
794 | bios = g_malloc(sizeof(*bios)); | |
795 | memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal); | |
796 | if (!isapc_ram_fw) { | |
797 | memory_region_set_readonly(bios, true); | |
798 | } | |
799 | ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1); | |
800 | if (ret != 0) { | |
801 | bios_error: | |
802 | fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name); | |
803 | exit(1); | |
804 | } | |
805 | g_free(filename); | |
806 | ||
807 | /* map the last 128KB of the BIOS in ISA space */ | |
808 | isa_bios_size = MIN(bios_size, 128 * KiB); | |
809 | isa_bios = g_malloc(sizeof(*isa_bios)); | |
810 | memory_region_init_alias(isa_bios, NULL, "isa-bios", bios, | |
811 | bios_size - isa_bios_size, isa_bios_size); | |
812 | memory_region_add_subregion_overlap(rom_memory, | |
813 | 0x100000 - isa_bios_size, | |
814 | isa_bios, | |
815 | 1); | |
816 | if (!isapc_ram_fw) { | |
817 | memory_region_set_readonly(isa_bios, true); | |
818 | } | |
819 | ||
820 | /* map all the bios at the top of memory */ | |
821 | memory_region_add_subregion(rom_memory, | |
822 | (uint32_t)(-bios_size), | |
823 | bios); | |
824 | } | |
f0bb276b PB |
825 | |
826 | static void x86_machine_get_max_ram_below_4g(Object *obj, Visitor *v, | |
827 | const char *name, void *opaque, | |
828 | Error **errp) | |
829 | { | |
830 | X86MachineState *x86ms = X86_MACHINE(obj); | |
831 | uint64_t value = x86ms->max_ram_below_4g; | |
832 | ||
833 | visit_type_size(v, name, &value, errp); | |
834 | } | |
835 | ||
836 | static void x86_machine_set_max_ram_below_4g(Object *obj, Visitor *v, | |
837 | const char *name, void *opaque, | |
838 | Error **errp) | |
839 | { | |
840 | X86MachineState *x86ms = X86_MACHINE(obj); | |
841 | Error *error = NULL; | |
842 | uint64_t value; | |
843 | ||
844 | visit_type_size(v, name, &value, &error); | |
845 | if (error) { | |
846 | error_propagate(errp, error); | |
847 | return; | |
848 | } | |
849 | if (value > 4 * GiB) { | |
850 | error_setg(&error, | |
851 | "Machine option 'max-ram-below-4g=%"PRIu64 | |
852 | "' expects size less than or equal to 4G", value); | |
853 | error_propagate(errp, error); | |
854 | return; | |
855 | } | |
856 | ||
857 | if (value < 1 * MiB) { | |
858 | warn_report("Only %" PRIu64 " bytes of RAM below the 4GiB boundary," | |
859 | "BIOS may not work with less than 1MiB", value); | |
860 | } | |
861 | ||
862 | x86ms->max_ram_below_4g = value; | |
863 | } | |
864 | ||
ed9e923c PB |
865 | bool x86_machine_is_smm_enabled(X86MachineState *x86ms) |
866 | { | |
867 | bool smm_available = false; | |
868 | ||
869 | if (x86ms->smm == ON_OFF_AUTO_OFF) { | |
870 | return false; | |
871 | } | |
872 | ||
873 | if (tcg_enabled() || qtest_enabled()) { | |
874 | smm_available = true; | |
875 | } else if (kvm_enabled()) { | |
876 | smm_available = kvm_has_smm(); | |
877 | } | |
878 | ||
879 | if (smm_available) { | |
880 | return true; | |
881 | } | |
882 | ||
883 | if (x86ms->smm == ON_OFF_AUTO_ON) { | |
884 | error_report("System Management Mode not supported by this hypervisor."); | |
885 | exit(1); | |
886 | } | |
887 | return false; | |
888 | } | |
889 | ||
890 | static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name, | |
891 | void *opaque, Error **errp) | |
892 | { | |
893 | X86MachineState *x86ms = X86_MACHINE(obj); | |
894 | OnOffAuto smm = x86ms->smm; | |
895 | ||
896 | visit_type_OnOffAuto(v, name, &smm, errp); | |
897 | } | |
898 | ||
899 | static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name, | |
900 | void *opaque, Error **errp) | |
901 | { | |
902 | X86MachineState *x86ms = X86_MACHINE(obj); | |
903 | ||
904 | visit_type_OnOffAuto(v, name, &x86ms->smm, errp); | |
905 | } | |
906 | ||
f0bb276b PB |
907 | static void x86_machine_initfn(Object *obj) |
908 | { | |
909 | X86MachineState *x86ms = X86_MACHINE(obj); | |
910 | ||
ed9e923c | 911 | x86ms->smm = ON_OFF_AUTO_AUTO; |
f0bb276b PB |
912 | x86ms->max_ram_below_4g = 0; /* use default */ |
913 | x86ms->smp_dies = 1; | |
914 | } | |
915 | ||
916 | static void x86_machine_class_init(ObjectClass *oc, void *data) | |
917 | { | |
918 | MachineClass *mc = MACHINE_CLASS(oc); | |
919 | X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); | |
920 | NMIClass *nc = NMI_CLASS(oc); | |
921 | ||
922 | mc->cpu_index_to_instance_props = x86_cpu_index_to_props; | |
923 | mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; | |
924 | mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; | |
925 | x86mc->compat_apic_id_mode = false; | |
2f34ebf2 | 926 | x86mc->save_tsc_khz = true; |
f0bb276b PB |
927 | nc->nmi_monitor_handler = x86_nmi; |
928 | ||
929 | object_class_property_add(oc, X86_MACHINE_MAX_RAM_BELOW_4G, "size", | |
930 | x86_machine_get_max_ram_below_4g, x86_machine_set_max_ram_below_4g, | |
931 | NULL, NULL, &error_abort); | |
f0bb276b PB |
932 | object_class_property_set_description(oc, X86_MACHINE_MAX_RAM_BELOW_4G, |
933 | "Maximum ram below the 4G boundary (32bit boundary)", &error_abort); | |
ed9e923c PB |
934 | |
935 | object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto", | |
936 | x86_machine_get_smm, x86_machine_set_smm, | |
937 | NULL, NULL, &error_abort); | |
938 | object_class_property_set_description(oc, X86_MACHINE_SMM, | |
939 | "Enable SMM", &error_abort); | |
f0bb276b PB |
940 | } |
941 | ||
942 | static const TypeInfo x86_machine_info = { | |
943 | .name = TYPE_X86_MACHINE, | |
944 | .parent = TYPE_MACHINE, | |
945 | .abstract = true, | |
946 | .instance_size = sizeof(X86MachineState), | |
947 | .instance_init = x86_machine_initfn, | |
948 | .class_size = sizeof(X86MachineClass), | |
949 | .class_init = x86_machine_class_init, | |
950 | .interfaces = (InterfaceInfo[]) { | |
951 | { TYPE_NMI }, | |
952 | { } | |
953 | }, | |
954 | }; | |
955 | ||
956 | static void x86_machine_register_types(void) | |
957 | { | |
958 | type_register_static(&x86_machine_info); | |
959 | } | |
960 | ||
961 | type_init(x86_machine_register_types) |