]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
83b96794 VK |
2 | /* |
3 | * Xen SMP support | |
4 | * | |
5 | * This file implements the Xen versions of smp_ops. SMP under Xen is | |
6 | * very straightforward. Bringing a CPU up is simply a matter of | |
7 | * loading its initial context and setting it running. | |
8 | * | |
9 | * IPIs are handled through the Xen event mechanism. | |
10 | * | |
11 | * Because virtual CPUs can be scheduled onto any real CPU, there's no | |
12 | * useful topology information for the kernel to make use of. As a | |
13 | * result, all CPUs are treated as if they're single-core and | |
14 | * single-threaded. | |
15 | */ | |
16 | #include <linux/sched.h> | |
f16b3da1 | 17 | #include <linux/sched/task_stack.h> |
83b96794 VK |
18 | #include <linux/err.h> |
19 | #include <linux/slab.h> | |
20 | #include <linux/smp.h> | |
21 | #include <linux/irq_work.h> | |
22 | #include <linux/tick.h> | |
23 | #include <linux/nmi.h> | |
c185ddec | 24 | #include <linux/cpuhotplug.h> |
977e4be5 | 25 | #include <linux/stackprotector.h> |
83b96794 VK |
26 | |
27 | #include <asm/paravirt.h> | |
28 | #include <asm/desc.h> | |
29 | #include <asm/pgtable.h> | |
30 | #include <asm/cpu.h> | |
31 | ||
32 | #include <xen/interface/xen.h> | |
33 | #include <xen/interface/vcpu.h> | |
34 | #include <xen/interface/xenpmu.h> | |
35 | ||
74899d92 | 36 | #include <asm/spec-ctrl.h> |
83b96794 VK |
37 | #include <asm/xen/interface.h> |
38 | #include <asm/xen/hypercall.h> | |
39 | ||
40 | #include <xen/xen.h> | |
41 | #include <xen/page.h> | |
42 | #include <xen/events.h> | |
43 | ||
44 | #include <xen/hvc-console.h> | |
45 | #include "xen-ops.h" | |
46 | #include "mmu.h" | |
47 | #include "smp.h" | |
48 | #include "pmu.h" | |
49 | ||
/*
 * Mask of CPUs whose initial vCPU context has already been set up; tested
 * and set in cpu_initialize_context() so that the context is only built once.
 */
cpumask_var_t xen_cpu_initialized_map;

/*
 * Per-CPU bookkeeping (irq number and allocated name) for the irq_work IPI
 * and the PMU virtual IRQ.  An irq value of -1 means "not bound".
 */
static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };

/* Handler for the irq_work IPI; defined further down in this file. */
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
/*
 * Entry point whose address is used as the initial instruction pointer of
 * secondary vCPUs (see cpu_initialize_context()); not defined in this file.
 */
void asm_cpu_bringup_and_idle(void);
83b96794 VK |
57 | |
/*
 * Bring-up sequence executed on the CPU that is being started.
 *
 * Initialises CR4 and the CPU, enables the Xen sysenter/syscall paths when
 * the kernel is not running in supervisor mode, records the CPU info, sets
 * up the clock events and finally marks the CPU online and enables
 * interrupts.  The order of the calls matters; do not reorder.
 */
static void cpu_bringup(void)
{
	int cpu;

	cr4_init();
	cpu_init();
	touch_softlockup_watchdog();
	preempt_disable();

	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
		xen_enable_sysenter();
		xen_enable_syscall();
	}
	cpu = smp_processor_id();
	smp_store_cpu_info(cpu);
	/* All CPUs are treated as single-core (see the file header). */
	cpu_data(cpu).x86_max_cores = 1;
	set_cpu_sibling_map(cpu);

	speculative_store_bypass_ht_init();

	xen_setup_cpu_clockevents();

	notify_cpu_starting(cpu);

	set_cpu_online(cpu, true);

	cpu_set_state_online(cpu);  /* Implies full memory barrier. */

	/* We can take interrupts now: we're officially "up". */
	local_irq_enable();
}
90 | ||
/*
 * C-level start routine for a secondary vCPU: run the bring-up sequence,
 * initialise the stack canary and hand over to cpu_startup_entry().
 *
 * prevent_tail_call_optimization() must stay the last statement so the
 * compiler does not turn the preceding call into a tail call.
 */
asmlinkage __visible void cpu_bringup_and_idle(void)
{
	cpu_bringup();
	boot_init_stack_canary();
	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
	prevent_tail_call_optimization();
}
98 | ||
99 | void xen_smp_intr_free_pv(unsigned int cpu) | |
100 | { | |
101 | if (per_cpu(xen_irq_work, cpu).irq >= 0) { | |
102 | unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); | |
103 | per_cpu(xen_irq_work, cpu).irq = -1; | |
104 | kfree(per_cpu(xen_irq_work, cpu).name); | |
105 | per_cpu(xen_irq_work, cpu).name = NULL; | |
106 | } | |
107 | ||
108 | if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { | |
109 | unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); | |
110 | per_cpu(xen_pmu_irq, cpu).irq = -1; | |
111 | kfree(per_cpu(xen_pmu_irq, cpu).name); | |
112 | per_cpu(xen_pmu_irq, cpu).name = NULL; | |
113 | } | |
114 | } | |
115 | ||
116 | int xen_smp_intr_init_pv(unsigned int cpu) | |
117 | { | |
118 | int rc; | |
119 | char *callfunc_name, *pmu_name; | |
120 | ||
121 | callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); | |
122 | rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, | |
123 | cpu, | |
124 | xen_irq_work_interrupt, | |
125 | IRQF_PERCPU|IRQF_NOBALANCING, | |
126 | callfunc_name, | |
127 | NULL); | |
128 | if (rc < 0) | |
129 | goto fail; | |
130 | per_cpu(xen_irq_work, cpu).irq = rc; | |
131 | per_cpu(xen_irq_work, cpu).name = callfunc_name; | |
132 | ||
133 | if (is_xen_pmu(cpu)) { | |
134 | pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); | |
135 | rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, | |
136 | xen_pmu_irq_handler, | |
137 | IRQF_PERCPU|IRQF_NOBALANCING, | |
138 | pmu_name, NULL); | |
139 | if (rc < 0) | |
140 | goto fail; | |
141 | per_cpu(xen_pmu_irq, cpu).irq = rc; | |
142 | per_cpu(xen_pmu_irq, cpu).name = pmu_name; | |
143 | } | |
144 | ||
145 | return 0; | |
146 | ||
147 | fail: | |
148 | xen_smp_intr_free_pv(cpu); | |
149 | return rc; | |
150 | } | |
151 | ||
152 | static void __init xen_fill_possible_map(void) | |
153 | { | |
154 | int i, rc; | |
155 | ||
156 | if (xen_initial_domain()) | |
157 | return; | |
158 | ||
159 | for (i = 0; i < nr_cpu_ids; i++) { | |
160 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | |
161 | if (rc >= 0) { | |
162 | num_processors++; | |
163 | set_cpu_possible(i, true); | |
164 | } | |
165 | } | |
166 | } | |
167 | ||
168 | static void __init xen_filter_cpu_maps(void) | |
169 | { | |
170 | int i, rc; | |
171 | unsigned int subtract = 0; | |
172 | ||
173 | if (!xen_initial_domain()) | |
174 | return; | |
175 | ||
176 | num_processors = 0; | |
177 | disabled_cpus = 0; | |
178 | for (i = 0; i < nr_cpu_ids; i++) { | |
179 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | |
180 | if (rc >= 0) { | |
181 | num_processors++; | |
182 | set_cpu_possible(i, true); | |
183 | } else { | |
184 | set_cpu_possible(i, false); | |
185 | set_cpu_present(i, false); | |
186 | subtract++; | |
187 | } | |
188 | } | |
189 | #ifdef CONFIG_HOTPLUG_CPU | |
190 | /* This is akin to using 'nr_cpus' on the Linux command line. | |
191 | * Which is OK as when we use 'dom0_max_vcpus=X' we can only | |
192 | * have up to X, while nr_cpu_ids is greater than X. This | |
193 | * normally is not a problem, except when CPU hotplugging | |
194 | * is involved and then there might be more than X CPUs | |
195 | * in the guest - which will not work as there is no | |
196 | * hypercall to expand the max number of VCPUs an already | |
197 | * running guest has. So cap it up to X. */ | |
198 | if (subtract) | |
199 | nr_cpu_ids = nr_cpu_ids - subtract; | |
200 | #endif | |
201 | ||
202 | } | |
203 | ||
/*
 * smp_ops.smp_prepare_boot_cpu hook for Xen PV.
 *
 * Must run on CPU 0 (enforced by the BUG_ON).  Performs the native boot CPU
 * preparation, makes the initial GDT page writable again when page tables
 * are not writable, fixes up the data segments on 32-bit, filters the CPU
 * maps, sets up the vcpu_info placement and initialises the Xen spinlocks.
 */
static void __init xen_pv_smp_prepare_boot_cpu(void)
{
	BUG_ON(smp_processor_id() != 0);
	native_smp_prepare_boot_cpu();

	if (!xen_feature(XENFEAT_writable_page_tables))
		/* We've switched to the "real" per-cpu gdt, so make
		 * sure the old memory can be recycled. */
		make_lowmem_page_readwrite(xen_initial_gdt);

#ifdef CONFIG_X86_32
	/*
	 * Xen starts us with XEN_FLAT_RING1_DS, but linux code
	 * expects __USER_DS
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	xen_filter_cpu_maps();
	xen_setup_vcpu_info_placement();

	/*
	 * The alternative logic (which patches the unlock/lock) runs before
	 * the smp bootup code is activated. Hence we need to set this up
	 * before the core kernel is being patched. Otherwise we will have
	 * only modules patched but not core code.
	 */
	xen_init_spinlocks();
}
234 | ||
/*
 * smp_ops.smp_prepare_cpus hook for Xen PV.
 *
 * @max_cpus: upper bound on the number of CPUs to use.
 *
 * Panics when skip_ioapic_setup is set (nosmp/noapic are rejected under
 * Xen), prepares the boot CPU's lock, CPU info and topology masks, sets up
 * PMU and interrupts for CPU 0, allocates xen_cpu_initialized_map, trims the
 * possible map down to @max_cpus and marks every remaining possible CPU as
 * present.
 */
static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
	unsigned cpu;
	unsigned int i;

	if (skip_ioapic_setup) {
		/* max_cpus == 0 is reported as "nosmp", anything else as "noapic". */
		char *m = (max_cpus == 0) ?
			"The nosmp parameter is incompatible with Xen; " \
			"use Xen dom0_max_vcpus=1 parameter" :
			"The noapic parameter is incompatible with Xen";

		xen_raw_printk(m);
		panic(m);
	}
	xen_init_lock_cpu(0);

	smp_store_boot_cpu_info();
	/* The boot CPU is treated as single-core like all others. */
	cpu_data(0).x86_max_cores = 1;

	/* NOTE(review): the zalloc_cpumask_var() results are not checked. */
	for_each_possible_cpu(i) {
		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
	}
	set_cpu_sibling_map(0);

	speculative_store_bypass_ht_init();

	xen_pmu_init(0);

	/* Failing to set up the boot CPU's interrupts is fatal. */
	if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0))
		BUG();

	if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
		panic("could not allocate xen_cpu_initialized_map\n");

	/* Only CPU 0 counts as initialised at this point. */
	cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));

	/* Restrict the possible_map according to max_cpus. */
	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
		/* Find the highest-numbered CPU that is still possible... */
		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
			continue;
		/* ...and remove it. */
		set_cpu_possible(cpu, false);
	}

	for_each_possible_cpu(cpu)
		set_cpu_present(cpu, true);
}
284 | ||
285 | static int | |
286 | cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | |
287 | { | |
288 | struct vcpu_guest_context *ctxt; | |
289 | struct desc_struct *gdt; | |
290 | unsigned long gdt_mfn; | |
291 | ||
292 | /* used to tell cpu_init() that it can proceed with initialization */ | |
293 | cpumask_set_cpu(cpu, cpu_callout_mask); | |
294 | if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) | |
295 | return 0; | |
296 | ||
297 | ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); | |
298 | if (ctxt == NULL) | |
299 | return -ENOMEM; | |
300 | ||
301 | gdt = get_cpu_gdt_rw(cpu); | |
302 | ||
303 | #ifdef CONFIG_X86_32 | |
304 | ctxt->user_regs.fs = __KERNEL_PERCPU; | |
305 | ctxt->user_regs.gs = __KERNEL_STACK_CANARY; | |
306 | #endif | |
307 | memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); | |
308 | ||
f16b3da1 AL |
309 | /* |
310 | * Bring up the CPU in cpu_bringup_and_idle() with the stack | |
311 | * pointing just below where pt_regs would be if it were a normal | |
312 | * kernel entry. | |
313 | */ | |
c3881eb5 | 314 | ctxt->user_regs.eip = (unsigned long)asm_cpu_bringup_and_idle; |
83b96794 VK |
315 | ctxt->flags = VGCF_IN_KERNEL; |
316 | ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ | |
317 | ctxt->user_regs.ds = __USER_DS; | |
318 | ctxt->user_regs.es = __USER_DS; | |
319 | ctxt->user_regs.ss = __KERNEL_DS; | |
f16b3da1 AL |
320 | ctxt->user_regs.cs = __KERNEL_CS; |
321 | ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); | |
83b96794 VK |
322 | |
323 | xen_copy_trap_info(ctxt->trap_ctxt); | |
324 | ||
325 | ctxt->ldt_ents = 0; | |
326 | ||
327 | BUG_ON((unsigned long)gdt & ~PAGE_MASK); | |
328 | ||
329 | gdt_mfn = arbitrary_virt_to_mfn(gdt); | |
330 | make_lowmem_page_readonly(gdt); | |
331 | make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); | |
332 | ||
333 | ctxt->gdt_frames[0] = gdt_mfn; | |
334 | ctxt->gdt_ents = GDT_ENTRIES; | |
335 | ||
f16b3da1 AL |
336 | /* |
337 | * Set SS:SP that Xen will use when entering guest kernel mode | |
338 | * from guest user mode. Subsequent calls to load_sp0() can | |
339 | * change this value. | |
340 | */ | |
83b96794 | 341 | ctxt->kernel_ss = __KERNEL_DS; |
f16b3da1 | 342 | ctxt->kernel_sp = task_top_of_stack(idle); |
83b96794 VK |
343 | |
344 | #ifdef CONFIG_X86_32 | |
345 | ctxt->event_callback_cs = __KERNEL_CS; | |
346 | ctxt->failsafe_callback_cs = __KERNEL_CS; | |
347 | #else | |
348 | ctxt->gs_base_kernel = per_cpu_offset(cpu); | |
349 | #endif | |
350 | ctxt->event_callback_eip = | |
351 | (unsigned long)xen_hypervisor_callback; | |
352 | ctxt->failsafe_callback_eip = | |
353 | (unsigned long)xen_failsafe_callback; | |
83b96794 VK |
354 | per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); |
355 | ||
83b96794 VK |
356 | ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); |
357 | if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) | |
358 | BUG(); | |
359 | ||
360 | kfree(ctxt); | |
361 | return 0; | |
362 | } | |
363 | ||
/*
 * smp_ops.cpu_up hook for Xen PV: start the vCPU backing @cpu.
 *
 * @cpu:  logical CPU number to bring up.
 * @idle: idle task for that CPU.
 *
 * Returns 0 once the CPU reports CPU_ONLINE, or the error code of the first
 * preparation step that fails.  A failing VCPUOP_up hypercall is fatal
 * (BUG_ON).
 */
static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int rc;

	rc = common_cpu_up(cpu, idle);
	if (rc)
		return rc;

	xen_setup_runstate_info(cpu);

	/*
	 * PV VCPUs are always successfully taken down (see 'while' loop
	 * in xen_pv_cpu_die()), so -EBUSY is an error.
	 */
	rc = cpu_check_up_prepare(cpu);
	if (rc)
		return rc;

	/* make sure interrupts start blocked */
	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;

	rc = cpu_initialize_context(cpu, idle);
	if (rc)
		return rc;

	xen_pmu_init(cpu);

	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
	BUG_ON(rc);

	/* Yield to the hypervisor until the new CPU has reported itself online. */
	while (cpu_report_state(cpu) != CPU_ONLINE)
		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);

	return 0;
}
399 | ||
83b96794 | 400 | #ifdef CONFIG_HOTPLUG_CPU |
8cb6de39 | 401 | static int xen_pv_cpu_disable(void) |
83b96794 VK |
402 | { |
403 | unsigned int cpu = smp_processor_id(); | |
404 | if (cpu == 0) | |
405 | return -EBUSY; | |
406 | ||
407 | cpu_disable_common(); | |
408 | ||
409 | load_cr3(swapper_pg_dir); | |
410 | return 0; | |
411 | } | |
412 | ||
413 | static void xen_pv_cpu_die(unsigned int cpu) | |
414 | { | |
415 | while (HYPERVISOR_vcpu_op(VCPUOP_is_up, | |
416 | xen_vcpu_nr(cpu), NULL)) { | |
417 | __set_current_state(TASK_UNINTERRUPTIBLE); | |
418 | schedule_timeout(HZ/10); | |
419 | } | |
420 | ||
421 | if (common_cpu_die(cpu) == 0) { | |
422 | xen_smp_intr_free(cpu); | |
423 | xen_uninit_lock_cpu(cpu); | |
424 | xen_teardown_timer(cpu); | |
425 | xen_pmu_finish(cpu); | |
426 | } | |
427 | } | |
428 | ||
/*
 * smp_ops.play_dead hook (CONFIG_HOTPLUG_CPU): run the common play-dead
 * path and ask Xen to take this vCPU down.  The code after the VCPUOP_down
 * hypercall performs a fresh bring-up and restores the idle-loop state;
 * presumably it runs when the vCPU is started again -- confirm against the
 * Xen VCPUOP_down/VCPUOP_up semantics.
 */
static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
	play_dead_common();
	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
	cpu_bringup();
	/*
	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
	 * clears certain data that the cpu_idle loop (which called us
	 * and that we return from) expects. The only way to get that
	 * data back is to call:
	 */
	tick_nohz_idle_enter();
	tick_nohz_idle_stop_tick_protected();

	cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
}
445 | ||
446 | #else /* !CONFIG_HOTPLUG_CPU */ | |
/* Without CONFIG_HOTPLUG_CPU a CPU cannot be disabled: always -ENOSYS. */
static int xen_pv_cpu_disable(void)
{
	return -ENOSYS;
}
451 | ||
/* Must never be called without CONFIG_HOTPLUG_CPU; BUG() if it is. */
static void xen_pv_cpu_die(unsigned int cpu)
{
	BUG();
}
456 | ||
/* Must never be called without CONFIG_HOTPLUG_CPU; BUG() if it is. */
static void xen_pv_play_dead(void)
{
	BUG();
}
461 | ||
462 | #endif | |
/*
 * Cross-call callback used by xen_pv_stop_other_cpus(): switch to
 * swapper_pg_dir, mark the calling CPU offline and ask Xen to take the
 * vCPU down.  Reaching the code after the hypercall is treated as a bug.
 *
 * @v: unused callback argument.
 */
static void stop_self(void *v)
{
	int cpu = smp_processor_id();

	/* make sure we're not pinning something down */
	load_cr3(swapper_pg_dir);
	/* should set up a minimal gdt */

	set_cpu_online(cpu, false);

	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
	BUG();
}
476 | ||
/*
 * smp_ops.stop_other_cpus hook: run stop_self() on the other CPUs through
 * smp_call_function().
 *
 * @wait: passed straight through as the 'wait' argument of
 *        smp_call_function().
 */
static void xen_pv_stop_other_cpus(int wait)
{
	smp_call_function(stop_self, NULL, wait);
}
481 | ||
83b96794 VK |
/*
 * Handler bound to XEN_IRQ_WORK_VECTOR in xen_smp_intr_init_pv(): runs the
 * pending irq_work items inside an irq_enter()/irq_exit() section and
 * accounts the event in the apic_irq_work_irqs statistic.
 *
 * @irq:    irq number (unused).
 * @dev_id: cookie passed at bind time (unused).
 *
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
	irq_enter();
	irq_work_run();
	inc_irq_stat(apic_irq_work_irqs);
	irq_exit();

	return IRQ_HANDLED;
}
491 | ||
/*
 * SMP operations used for Xen PV guests.  The table is init-only data
 * (__initconst); xen_smp_init() copies it by value into smp_ops.
 */
static const struct smp_ops xen_smp_ops __initconst = {
	/* Boot-time preparation. */
	.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
	.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
	.smp_cpus_done = xen_smp_cpus_done,

	/* CPU bring-up and hotplug. */
	.cpu_up = xen_pv_cpu_up,
	.cpu_die = xen_pv_cpu_die,
	.cpu_disable = xen_pv_cpu_disable,
	.play_dead = xen_pv_play_dead,

	/* Stopping CPUs and sending IPIs. */
	.stop_other_cpus = xen_pv_stop_other_cpus,
	.smp_send_reschedule = xen_smp_send_reschedule,

	.send_call_func_ipi = xen_smp_send_call_function_ipi,
	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
508 | ||
/*
 * Install the Xen PV SMP operations and populate the possible-CPU map
 * (xen_fill_possible_map() does nothing in the initial domain).
 */
void __init xen_smp_init(void)
{
	smp_ops = xen_smp_ops;
	xen_fill_possible_map();
}