]> git.ipfire.org Git - thirdparty/qemu.git/blob - cpus.c
hw/timer/imx_gpt.c: Switch to transaction-based ptimer API
[thirdparty/qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
/* Mutex passed to qemu_cond_timedwait() together with cpu->halt_cond below. */
static QemuMutex qemu_global_mutex;

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick(); see cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is not active. */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base throttle timeslice, in nanoseconds. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
92
93 bool cpu_is_stopped(CPUState *cpu)
94 {
95 return cpu->stopped || !runstate_is_running();
96 }
97
98 static bool cpu_thread_is_idle(CPUState *cpu)
99 {
100 if (cpu->stop || cpu->queued_work_first) {
101 return false;
102 }
103 if (cpu_is_stopped(cpu)) {
104 return true;
105 }
106 if (!cpu->halted || cpu_has_work(cpu) ||
107 kvm_halt_in_kernel()) {
108 return false;
109 }
110 return true;
111 }
112
113 static bool all_cpu_threads_idle(void)
114 {
115 CPUState *cpu;
116
117 CPU_FOREACH(cpu) {
118 if (!cpu_thread_is_idle(cpu)) {
119 return false;
120 }
121 }
122 return true;
123 }
124
125 /***********************************************************/
126 /* guest cycle counter */
127
128 /* Protected by TimersState seqlock */
129
/* Value of the icount "sleep" option (see configure_icount()); on by default. */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
/* Upper bound that icount_adjust() enforces on icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
133
/*
 * State behind the guest tick/clock counters and the instruction counter
 * (icount) based virtual clock.
 */
typedef struct TimersState {
    /* Protected by BQL. */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter to form the guest tick count. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero while ticks/clock advance; see cpu_enable/disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks. */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;

    /* Start of the pending clock warp, or -1 when no warp is pending. */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() to form the VM clock. */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
164
/* The single TimersState instance used throughout this file. */
static TimersState timers_state;
/* Whether multi-threaded TCG is enabled; assigned by qemu_tcg_configure(). */
bool mttcg_enabled;
167
168 /*
169 * We default to false if we know other options have been enabled
170 * which are currently incompatible with MTTCG. Otherwise when each
171 * guest (target) has been updated to support:
172 * - atomic instructions
173 * - memory ordering primitives (barriers)
174 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
175 *
176 * Once a guest architecture has been converted to the new primitives
177 * there are two remaining limitations to check.
178 *
179 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
180 * - The host must have a stronger memory order than the guest
181 *
182 * It may be possible in future to support strong guests on weak hosts
183 * but that will require tagging all load/stores in a guest with their
184 * implicit memory order requirements which would likely slow things
185 * down a lot.
186 */
187
/*
 * Check that every memory-ordering bit the guest requires is also provided
 * by the host.  When either default memory order is unknown at build time
 * the answer is conservatively false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
#if !defined(TCG_GUEST_DEFAULT_MO) || !defined(TCG_TARGET_DEFAULT_MO)
    return false;
#else
    return !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
}
196
/*
 * Pick the MTTCG default when the user gave no "thread" option: enabled
 * only if the target supports MTTCG, icount is off, the guest is not
 * oversized for the host, and the memory orders are compatible.
 */
static bool default_mttcg_enabled(void)
{
#ifdef TARGET_SUPPORTS_MTTCG
    if (!use_icount && !TCG_OVERSIZED_GUEST) {
        return check_tcg_memory_orders_compatible();
    }
#endif
    return false;
}
209
210 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
211 {
212 const char *t = qemu_opt_get(opts, "thread");
213 if (t) {
214 if (strcmp(t, "multi") == 0) {
215 if (TCG_OVERSIZED_GUEST) {
216 error_setg(errp, "No MTTCG when guest word size > hosts");
217 } else if (use_icount) {
218 error_setg(errp, "No MTTCG when icount is enabled");
219 } else {
220 #ifndef TARGET_SUPPORTS_MTTCG
221 warn_report("Guest not yet converted to MTTCG - "
222 "you may get unexpected results");
223 #endif
224 if (!check_tcg_memory_orders_compatible()) {
225 warn_report("Guest expects a stronger memory ordering "
226 "than the host provides");
227 error_printf("This may cause strange/hard to debug errors\n");
228 }
229 mttcg_enabled = true;
230 }
231 } else if (strcmp(t, "single") == 0) {
232 mttcg_enabled = false;
233 } else {
234 error_setg(errp, "Invalid 'thread' setting %s", t);
235 }
236 } else {
237 mttcg_enabled = default_mttcg_enabled();
238 }
239 }
240
241 /* The current number of executed instructions is based on what we
242 * originally budgeted minus the current state of the decrementing
243 * icount counters in extra/u16.low.
244 */
245 static int64_t cpu_get_icount_executed(CPUState *cpu)
246 {
247 return (cpu->icount_budget -
248 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
249 }
250
251 /*
252 * Update the global shared timer_state.qemu_icount to take into
253 * account executed instructions. This is done by the TCG vCPU
254 * thread so the main-loop can see time has moved forward.
255 */
256 static void cpu_update_icount_locked(CPUState *cpu)
257 {
258 int64_t executed = cpu_get_icount_executed(cpu);
259 cpu->icount_budget -= executed;
260
261 atomic_set_i64(&timers_state.qemu_icount,
262 timers_state.qemu_icount + executed);
263 }
264
265 /*
266 * Update the global shared timer_state.qemu_icount to take into
267 * account executed instructions. This is done by the TCG vCPU
268 * thread so the main-loop can see time has moved forward.
269 */
270 void cpu_update_icount(CPUState *cpu)
271 {
272 seqlock_write_lock(&timers_state.vm_clock_seqlock,
273 &timers_state.vm_clock_lock);
274 cpu_update_icount_locked(cpu);
275 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
276 &timers_state.vm_clock_lock);
277 }
278
279 static int64_t cpu_get_icount_raw_locked(void)
280 {
281 CPUState *cpu = current_cpu;
282
283 if (cpu && cpu->running) {
284 if (!cpu->can_do_io) {
285 error_report("Bad icount read");
286 exit(1);
287 }
288 /* Take into account what has run */
289 cpu_update_icount_locked(cpu);
290 }
291 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
292 return atomic_read_i64(&timers_state.qemu_icount);
293 }
294
295 static int64_t cpu_get_icount_locked(void)
296 {
297 int64_t icount = cpu_get_icount_raw_locked();
298 return atomic_read_i64(&timers_state.qemu_icount_bias) +
299 cpu_icount_to_ns(icount);
300 }
301
302 int64_t cpu_get_icount_raw(void)
303 {
304 int64_t icount;
305 unsigned start;
306
307 do {
308 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
309 icount = cpu_get_icount_raw_locked();
310 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
311
312 return icount;
313 }
314
315 /* Return the virtual CPU time, based on the instruction counter. */
316 int64_t cpu_get_icount(void)
317 {
318 int64_t icount;
319 unsigned start;
320
321 do {
322 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
323 icount = cpu_get_icount_locked();
324 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
325
326 return icount;
327 }
328
329 int64_t cpu_icount_to_ns(int64_t icount)
330 {
331 return icount << atomic_read(&timers_state.icount_time_shift);
332 }
333
334 static int64_t cpu_get_ticks_locked(void)
335 {
336 int64_t ticks = timers_state.cpu_ticks_offset;
337 if (timers_state.cpu_ticks_enabled) {
338 ticks += cpu_get_host_ticks();
339 }
340
341 if (timers_state.cpu_ticks_prev > ticks) {
342 /* Non increasing ticks may happen if the host uses software suspend. */
343 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
344 ticks = timers_state.cpu_ticks_prev;
345 }
346
347 timers_state.cpu_ticks_prev = ticks;
348 return ticks;
349 }
350
351 /* return the time elapsed in VM between vm_start and vm_stop. Unless
352 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
353 * counter.
354 */
355 int64_t cpu_get_ticks(void)
356 {
357 int64_t ticks;
358
359 if (use_icount) {
360 return cpu_get_icount();
361 }
362
363 qemu_spin_lock(&timers_state.vm_clock_lock);
364 ticks = cpu_get_ticks_locked();
365 qemu_spin_unlock(&timers_state.vm_clock_lock);
366 return ticks;
367 }
368
369 static int64_t cpu_get_clock_locked(void)
370 {
371 int64_t time;
372
373 time = timers_state.cpu_clock_offset;
374 if (timers_state.cpu_ticks_enabled) {
375 time += get_clock();
376 }
377
378 return time;
379 }
380
381 /* Return the monotonic time elapsed in VM, i.e.,
382 * the time between vm_start and vm_stop
383 */
384 int64_t cpu_get_clock(void)
385 {
386 int64_t ti;
387 unsigned start;
388
389 do {
390 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
391 ti = cpu_get_clock_locked();
392 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
393
394 return ti;
395 }
396
397 /* enable cpu_get_ticks()
398 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
399 */
400 void cpu_enable_ticks(void)
401 {
402 seqlock_write_lock(&timers_state.vm_clock_seqlock,
403 &timers_state.vm_clock_lock);
404 if (!timers_state.cpu_ticks_enabled) {
405 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
406 timers_state.cpu_clock_offset -= get_clock();
407 timers_state.cpu_ticks_enabled = 1;
408 }
409 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
410 &timers_state.vm_clock_lock);
411 }
412
413 /* disable cpu_get_ticks() : the clock is stopped. You must not call
414 * cpu_get_ticks() after that.
415 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
416 */
417 void cpu_disable_ticks(void)
418 {
419 seqlock_write_lock(&timers_state.vm_clock_seqlock,
420 &timers_state.vm_clock_lock);
421 if (timers_state.cpu_ticks_enabled) {
422 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
423 timers_state.cpu_clock_offset = cpu_get_clock_locked();
424 timers_state.cpu_ticks_enabled = 0;
425 }
426 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
427 &timers_state.vm_clock_lock);
428 }
429
430 /* Correlation between real and virtual time is always going to be
431 fairly approximate, so ignore small variation.
432 When the guest is idle real and virtual time will be aligned in
433 the IO wait loop. */
434 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
435
436 static void icount_adjust(void)
437 {
438 int64_t cur_time;
439 int64_t cur_icount;
440 int64_t delta;
441
442 /* Protected by TimersState mutex. */
443 static int64_t last_delta;
444
445 /* If the VM is not running, then do nothing. */
446 if (!runstate_is_running()) {
447 return;
448 }
449
450 seqlock_write_lock(&timers_state.vm_clock_seqlock,
451 &timers_state.vm_clock_lock);
452 cur_time = cpu_get_clock_locked();
453 cur_icount = cpu_get_icount_locked();
454
455 delta = cur_icount - cur_time;
456 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
457 if (delta > 0
458 && last_delta + ICOUNT_WOBBLE < delta * 2
459 && timers_state.icount_time_shift > 0) {
460 /* The guest is getting too far ahead. Slow time down. */
461 atomic_set(&timers_state.icount_time_shift,
462 timers_state.icount_time_shift - 1);
463 }
464 if (delta < 0
465 && last_delta - ICOUNT_WOBBLE > delta * 2
466 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
467 /* The guest is getting too far behind. Speed time up. */
468 atomic_set(&timers_state.icount_time_shift,
469 timers_state.icount_time_shift + 1);
470 }
471 last_delta = delta;
472 atomic_set_i64(&timers_state.qemu_icount_bias,
473 cur_icount - (timers_state.qemu_icount
474 << timers_state.icount_time_shift));
475 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
476 &timers_state.vm_clock_lock);
477 }
478
479 static void icount_adjust_rt(void *opaque)
480 {
481 timer_mod(timers_state.icount_rt_timer,
482 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
483 icount_adjust();
484 }
485
486 static void icount_adjust_vm(void *opaque)
487 {
488 timer_mod(timers_state.icount_vm_timer,
489 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
490 NANOSECONDS_PER_SECOND / 10);
491 icount_adjust();
492 }
493
494 static int64_t qemu_icount_round(int64_t count)
495 {
496 int shift = atomic_read(&timers_state.icount_time_shift);
497 return (count + (1 << shift) - 1) >> shift;
498 }
499
500 static void icount_warp_rt(void)
501 {
502 unsigned seq;
503 int64_t warp_start;
504
505 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
506 * changes from -1 to another value, so the race here is okay.
507 */
508 do {
509 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
510 warp_start = timers_state.vm_clock_warp_start;
511 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
512
513 if (warp_start == -1) {
514 return;
515 }
516
517 seqlock_write_lock(&timers_state.vm_clock_seqlock,
518 &timers_state.vm_clock_lock);
519 if (runstate_is_running()) {
520 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
521 cpu_get_clock_locked());
522 int64_t warp_delta;
523
524 warp_delta = clock - timers_state.vm_clock_warp_start;
525 if (use_icount == 2) {
526 /*
527 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
528 * far ahead of real time.
529 */
530 int64_t cur_icount = cpu_get_icount_locked();
531 int64_t delta = clock - cur_icount;
532 warp_delta = MIN(warp_delta, delta);
533 }
534 atomic_set_i64(&timers_state.qemu_icount_bias,
535 timers_state.qemu_icount_bias + warp_delta);
536 }
537 timers_state.vm_clock_warp_start = -1;
538 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
539 &timers_state.vm_clock_lock);
540
541 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
542 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
543 }
544 }
545
/*
 * Callback of icount_warp_timer; simply performs the warp.
 * No checkpoint is needed here because the timer already synchronizes
 * with CHECKPOINT_CLOCK_VIRTUAL_RT.  @opaque is unused.
 */
static void icount_timer_cb(void *opaque)
{
    icount_warp_rt();
}
553
554 void qtest_clock_warp(int64_t dest)
555 {
556 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
557 AioContext *aio_context;
558 assert(qtest_enabled());
559 aio_context = qemu_get_aio_context();
560 while (clock < dest) {
561 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
562 QEMU_TIMER_ATTR_ALL);
563 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
564
565 seqlock_write_lock(&timers_state.vm_clock_seqlock,
566 &timers_state.vm_clock_lock);
567 atomic_set_i64(&timers_state.qemu_icount_bias,
568 timers_state.qemu_icount_bias + warp);
569 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
570 &timers_state.vm_clock_lock);
571
572 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
573 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
574 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
575 }
576 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
577 }
578
579 void qemu_start_warp_timer(void)
580 {
581 int64_t clock;
582 int64_t deadline;
583
584 if (!use_icount) {
585 return;
586 }
587
588 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
589 * do not fire, so computing the deadline does not make sense.
590 */
591 if (!runstate_is_running()) {
592 return;
593 }
594
595 if (replay_mode != REPLAY_MODE_PLAY) {
596 if (!all_cpu_threads_idle()) {
597 return;
598 }
599
600 if (qtest_enabled()) {
601 /* When testing, qtest commands advance icount. */
602 return;
603 }
604
605 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
606 } else {
607 /* warp clock deterministically in record/replay mode */
608 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
609 /* vCPU is sleeping and warp can't be started.
610 It is probably a race condition: notification sent
611 to vCPU was processed in advance and vCPU went to sleep.
612 Therefore we have to wake it up for doing someting. */
613 if (replay_has_checkpoint()) {
614 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
615 }
616 return;
617 }
618 }
619
620 /* We want to use the earliest deadline from ALL vm_clocks */
621 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
622 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
623 ~QEMU_TIMER_ATTR_EXTERNAL);
624 if (deadline < 0) {
625 static bool notified;
626 if (!icount_sleep && !notified) {
627 warn_report("icount sleep disabled and no active timers");
628 notified = true;
629 }
630 return;
631 }
632
633 if (deadline > 0) {
634 /*
635 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
636 * sleep. Otherwise, the CPU might be waiting for a future timer
637 * interrupt to wake it up, but the interrupt never comes because
638 * the vCPU isn't running any insns and thus doesn't advance the
639 * QEMU_CLOCK_VIRTUAL.
640 */
641 if (!icount_sleep) {
642 /*
643 * We never let VCPUs sleep in no sleep icount mode.
644 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
645 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
646 * It is useful when we want a deterministic execution time,
647 * isolated from host latencies.
648 */
649 seqlock_write_lock(&timers_state.vm_clock_seqlock,
650 &timers_state.vm_clock_lock);
651 atomic_set_i64(&timers_state.qemu_icount_bias,
652 timers_state.qemu_icount_bias + deadline);
653 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
654 &timers_state.vm_clock_lock);
655 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
656 } else {
657 /*
658 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
659 * "real" time, (related to the time left until the next event) has
660 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
661 * This avoids that the warps are visible externally; for example,
662 * you will not be sending network packets continuously instead of
663 * every 100ms.
664 */
665 seqlock_write_lock(&timers_state.vm_clock_seqlock,
666 &timers_state.vm_clock_lock);
667 if (timers_state.vm_clock_warp_start == -1
668 || timers_state.vm_clock_warp_start > clock) {
669 timers_state.vm_clock_warp_start = clock;
670 }
671 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
672 &timers_state.vm_clock_lock);
673 timer_mod_anticipate(timers_state.icount_warp_timer,
674 clock + deadline);
675 }
676 } else if (deadline == 0) {
677 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
678 }
679 }
680
681 static void qemu_account_warp_timer(void)
682 {
683 if (!use_icount || !icount_sleep) {
684 return;
685 }
686
687 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
688 * do not fire, so computing the deadline does not make sense.
689 */
690 if (!runstate_is_running()) {
691 return;
692 }
693
694 /* warp clock deterministically in record/replay mode */
695 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
696 return;
697 }
698
699 timer_del(timers_state.icount_warp_timer);
700 icount_warp_rt();
701 }
702
703 static bool icount_state_needed(void *opaque)
704 {
705 return use_icount;
706 }
707
708 static bool warp_timer_state_needed(void *opaque)
709 {
710 TimersState *s = opaque;
711 return s->icount_warp_timer != NULL;
712 }
713
714 static bool adjust_timers_state_needed(void *opaque)
715 {
716 TimersState *s = opaque;
717 return s->icount_rt_timer != NULL;
718 }
719
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created; warp_timer_state_needed() decides
 * whether it is included.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
734
/*
 * Optional subsection carrying the two icount adjustment timers; included
 * only when adjust_timers_state_needed() reports that they exist.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
746
/*
 * This is a subsection for icount migration.  It is included only when
 * icount_state_needed() returns true, and carries the icount bias and
 * counter plus the optional timer subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
766
/*
 * Top-level migration description for TimersState; registered in
 * cpu_ticks_init().
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): presumably a placeholder for a removed 8-byte field. */
        VMSTATE_UNUSED(8),
        /* Present from version 2 of this section onwards. */
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
782
783 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
784 {
785 double pct;
786 double throttle_ratio;
787 int64_t sleeptime_ns, endtime_ns;
788
789 if (!cpu_throttle_get_percentage()) {
790 return;
791 }
792
793 pct = (double)cpu_throttle_get_percentage()/100;
794 throttle_ratio = pct / (1 - pct);
795 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
796 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
797 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
798 while (sleeptime_ns > 0 && !cpu->stop) {
799 if (sleeptime_ns > SCALE_MS) {
800 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
801 sleeptime_ns / SCALE_MS);
802 } else {
803 qemu_mutex_unlock_iothread();
804 g_usleep(sleeptime_ns / SCALE_US);
805 qemu_mutex_lock_iothread();
806 }
807 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
808 }
809 atomic_set(&cpu->throttle_thread_scheduled, 0);
810 }
811
812 static void cpu_throttle_timer_tick(void *opaque)
813 {
814 CPUState *cpu;
815 double pct;
816
817 /* Stop the timer if needed */
818 if (!cpu_throttle_get_percentage()) {
819 return;
820 }
821 CPU_FOREACH(cpu) {
822 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
823 async_run_on_cpu(cpu, cpu_throttle_thread,
824 RUN_ON_CPU_NULL);
825 }
826 }
827
828 pct = (double)cpu_throttle_get_percentage()/100;
829 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
830 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
831 }
832
833 void cpu_throttle_set(int new_throttle_pct)
834 {
835 /* Ensure throttle percentage is within valid range */
836 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
837 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
838
839 atomic_set(&throttle_percentage, new_throttle_pct);
840
841 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
842 CPU_THROTTLE_TIMESLICE_NS);
843 }
844
845 void cpu_throttle_stop(void)
846 {
847 atomic_set(&throttle_percentage, 0);
848 }
849
/* Throttling is active exactly when the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    return cpu_throttle_get_percentage() ? true : false;
}
854
855 int cpu_throttle_get_percentage(void)
856 {
857 return atomic_read(&throttle_percentage);
858 }
859
860 void cpu_ticks_init(void)
861 {
862 seqlock_init(&timers_state.vm_clock_seqlock);
863 qemu_spin_init(&timers_state.vm_clock_lock);
864 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
865 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866 cpu_throttle_timer_tick, NULL);
867 }
868
869 void configure_icount(QemuOpts *opts, Error **errp)
870 {
871 const char *option;
872 char *rem_str = NULL;
873
874 option = qemu_opt_get(opts, "shift");
875 if (!option) {
876 if (qemu_opt_get(opts, "align") != NULL) {
877 error_setg(errp, "Please specify shift option when using align");
878 }
879 return;
880 }
881
882 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
883 if (icount_sleep) {
884 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
885 icount_timer_cb, NULL);
886 }
887
888 icount_align_option = qemu_opt_get_bool(opts, "align", false);
889
890 if (icount_align_option && !icount_sleep) {
891 error_setg(errp, "align=on and sleep=off are incompatible");
892 }
893 if (strcmp(option, "auto") != 0) {
894 errno = 0;
895 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
896 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
897 error_setg(errp, "icount: Invalid shift value");
898 }
899 use_icount = 1;
900 return;
901 } else if (icount_align_option) {
902 error_setg(errp, "shift=auto and align=on are incompatible");
903 } else if (!icount_sleep) {
904 error_setg(errp, "shift=auto and sleep=off are incompatible");
905 }
906
907 use_icount = 2;
908
909 /* 125MIPS seems a reasonable initial guess at the guest speed.
910 It will be corrected fairly quickly anyway. */
911 timers_state.icount_time_shift = 3;
912
913 /* Have both realtime and virtual time triggers for speed adjustment.
914 The realtime trigger catches emulated time passing too slowly,
915 the virtual time trigger catches emulated time passing too fast.
916 Realtime triggers occur even when idle, so use them less frequently
917 than VM triggers. */
918 timers_state.vm_clock_warp_start = -1;
919 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
920 icount_adjust_rt, NULL);
921 timer_mod(timers_state.icount_rt_timer,
922 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
923 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
924 icount_adjust_vm, NULL);
925 timer_mod(timers_state.icount_vm_timer,
926 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
927 NANOSECONDS_PER_SECOND / 10);
928 }
929
/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
941
/* Kick timer; created on demand by start_tcg_kick_timer(). */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_cpu() kicks; read there with atomic_mb_read(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: a tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
946
947 static inline int64_t qemu_tcg_next_kick(void)
948 {
949 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
950 }
951
952 /* Kick the currently round-robin scheduled vCPU */
953 static void qemu_cpu_kick_rr_cpu(void)
954 {
955 CPUState *cpu;
956 do {
957 cpu = atomic_mb_read(&tcg_current_rr_cpu);
958 if (cpu) {
959 cpu_exit(cpu);
960 }
961 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
962 }
963
964 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
965 {
966 }
967
968 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
969 {
970 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
971 qemu_notify_event();
972 return;
973 }
974
975 if (qemu_in_vcpu_thread()) {
976 /* A CPU is currently running; kick it back out to the
977 * tcg_cpu_exec() loop so it will recalculate its
978 * icount deadline immediately.
979 */
980 qemu_cpu_kick(current_cpu);
981 } else if (first_cpu) {
982 /* qemu_cpu_kick is not enough to kick a halted CPU out of
983 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
984 * causes cpu_thread_is_idle to return false. This way,
985 * handle_icount_deadline can run.
986 * If we have no CPUs at all for some reason, we don't
987 * need to do anything.
988 */
989 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
990 }
991 }
992
993 static void kick_tcg_thread(void *opaque)
994 {
995 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
996 qemu_cpu_kick_rr_cpu();
997 }
998
999 static void start_tcg_kick_timer(void)
1000 {
1001 assert(!mttcg_enabled);
1002 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1003 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1004 kick_tcg_thread, NULL);
1005 }
1006 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1007 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1008 }
1009 }
1010
1011 static void stop_tcg_kick_timer(void)
1012 {
1013 assert(!mttcg_enabled);
1014 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1015 timer_del(tcg_kick_vcpu_timer);
1016 }
1017 }
1018
1019 /***********************************************************/
1020 void hw_error(const char *fmt, ...)
1021 {
1022 va_list ap;
1023 CPUState *cpu;
1024
1025 va_start(ap, fmt);
1026 fprintf(stderr, "qemu: hardware error: ");
1027 vfprintf(stderr, fmt, ap);
1028 fprintf(stderr, "\n");
1029 CPU_FOREACH(cpu) {
1030 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1031 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1032 }
1033 va_end(ap);
1034 abort();
1035 }
1036
1037 void cpu_synchronize_all_states(void)
1038 {
1039 CPUState *cpu;
1040
1041 CPU_FOREACH(cpu) {
1042 cpu_synchronize_state(cpu);
1043 /* TODO: move to cpu_synchronize_state() */
1044 if (hvf_enabled()) {
1045 hvf_cpu_synchronize_state(cpu);
1046 }
1047 }
1048 }
1049
1050 void cpu_synchronize_all_post_reset(void)
1051 {
1052 CPUState *cpu;
1053
1054 CPU_FOREACH(cpu) {
1055 cpu_synchronize_post_reset(cpu);
1056 /* TODO: move to cpu_synchronize_post_reset() */
1057 if (hvf_enabled()) {
1058 hvf_cpu_synchronize_post_reset(cpu);
1059 }
1060 }
1061 }
1062
1063 void cpu_synchronize_all_post_init(void)
1064 {
1065 CPUState *cpu;
1066
1067 CPU_FOREACH(cpu) {
1068 cpu_synchronize_post_init(cpu);
1069 /* TODO: move to cpu_synchronize_post_init() */
1070 if (hvf_enabled()) {
1071 hvf_cpu_synchronize_post_init(cpu);
1072 }
1073 }
1074 }
1075
/*
 * Call cpu_synchronize_pre_loadvm() on every CPU.  Unlike the other
 * cpu_synchronize_all_*() helpers there is no HVF-specific hook here.
 */
void cpu_synchronize_all_pre_loadvm(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_pre_loadvm(cpu);
    }
}
1084
1085 static int do_vm_stop(RunState state, bool send_stop)
1086 {
1087 int ret = 0;
1088
1089 if (runstate_is_running()) {
1090 cpu_disable_ticks();
1091 pause_all_vcpus();
1092 runstate_set(state);
1093 vm_state_notify(0, state);
1094 if (send_stop) {
1095 qapi_event_send_stop();
1096 }
1097 }
1098
1099 bdrv_drain_all();
1100 ret = bdrv_flush_all();
1101
1102 return ret;
1103 }
1104
/*
 * Special vm_stop() variant for terminating the process.  Historically
 * clients did not expect a QMP STOP event on this path, so to retain
 * compatibility none is sent (send_stop is false).
 *
 * Moves to RUN_STATE_SHUTDOWN and returns the result of do_vm_stop().
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1112
1113 static bool cpu_can_run(CPUState *cpu)
1114 {
1115 if (cpu->stop) {
1116 return false;
1117 }
1118 if (cpu_is_stopped(cpu)) {
1119 return false;
1120 }
1121 return true;
1122 }
1123
/*
 * Called by the vCPU thread functions when execution returns EXCP_DEBUG:
 * records @cpu as the CPU that stopped for gdb, raises a system debug
 * request and marks the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1130
1131 #ifdef CONFIG_LINUX
1132 static void sigbus_reraise(void)
1133 {
1134 sigset_t set;
1135 struct sigaction action;
1136
1137 memset(&action, 0, sizeof(action));
1138 action.sa_handler = SIG_DFL;
1139 if (!sigaction(SIGBUS, &action, NULL)) {
1140 raise(SIGBUS);
1141 sigemptyset(&set);
1142 sigaddset(&set, SIGBUS);
1143 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1144 }
1145 perror("Failed to re-raise SIGBUS!\n");
1146 abort();
1147 }
1148
1149 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1150 {
1151 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1152 sigbus_reraise();
1153 }
1154
1155 if (current_cpu) {
1156 /* Called asynchronously in VCPU thread. */
1157 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1158 sigbus_reraise();
1159 }
1160 } else {
1161 /* Called synchronously (via signalfd) in main thread. */
1162 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1163 sigbus_reraise();
1164 }
1165 }
1166 }
1167
1168 static void qemu_init_sigbus(void)
1169 {
1170 struct sigaction action;
1171
1172 memset(&action, 0, sizeof(action));
1173 action.sa_flags = SA_SIGINFO;
1174 action.sa_sigaction = sigbus_handler;
1175 sigaction(SIGBUS, &action, NULL);
1176
1177 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1178 }
1179 #else /* !CONFIG_LINUX */
/* Non-Linux hosts: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
}
1183 #endif /* !CONFIG_LINUX */
1184
/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/*
 * cpu creation: signalled by the vCPU threads when cpu->created changes,
 * waited on in qemu_init_vcpu().
 */
static QemuCond qemu_cpu_cond;
/*
 * system init: broadcast by qemu_cpu_stop() when a vCPU has stopped,
 * waited on in pause_all_vcpus().
 */
static QemuCond qemu_pause_cond;
1191
/*
 * One-time set-up of the CPU loop infrastructure: SIGBUS handling, the
 * CPU-creation and pause condition variables and the global mutex.  The
 * calling thread is recorded in io_thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1201
/*
 * Thin wrapper around do_run_on_cpu() that passes qemu_global_mutex as
 * the mutex argument; @func is run for @cpu with argument @data.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1206
1207 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1208 {
1209 if (kvm_destroy_vcpu(cpu) < 0) {
1210 error_report("kvm_destroy_vcpu failed");
1211 exit(EXIT_FAILURE);
1212 }
1213 }
1214
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1218
/*
 * Complete a stop request for @cpu.  Must be called from the thread that
 * runs @cpu (asserted).
 *
 * Clears the pending request (cpu->stop), marks the CPU stopped, calls
 * cpu_exit() when @exit is true, and wakes any waiter on qemu_pause_cond.
 *
 * Note: the parameter name shadows the C library exit() inside this
 * function.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1229
/*
 * Per-vCPU housekeeping performed after waking from an idle wait:
 * clear the thread_kicked flag (so that qemu_cpu_kick_thread() will
 * signal the thread again), honour a pending stop request, and run any
 * work queued for this CPU.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1238
/*
 * Idle wait for the single round-robin TCG thread.
 *
 * While every vCPU is idle the kick timer is stopped and the thread
 * sleeps on first_cpu's halt condition (releasing qemu_global_mutex while
 * waiting).  Once there is work, the kick timer is restarted and the
 * common wake-up housekeeping is run for every CPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1254
/*
 * Idle wait for a per-vCPU thread: sleep on the CPU's halt condition
 * (releasing qemu_global_mutex while waiting) for as long as the CPU
 * thread is idle, then run the common wake-up housekeeping.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread. */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1269
/*
 * Thread function for a KVM vCPU (@arg is the CPUState).
 *
 * Runs with the iothread mutex held except where kvm_cpu_exec() or the
 * idle wait release it internally (not visible here).  After initialising
 * the vCPU it announces creation on qemu_cpu_cond, then alternates
 * between executing the guest and waiting for events until the CPU is
 * unplugged and can no longer run.  On exit the KVM vCPU is destroyed and
 * cpu->created is cleared.  Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* kvm_init_vcpu() reports failure as a negative errno value. */
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and tell the waiter. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1313
1314 static void *qemu_dummy_cpu_thread_fn(void *arg)
1315 {
1316 #ifdef _WIN32
1317 error_report("qtest is not supported under Windows");
1318 exit(1);
1319 #else
1320 CPUState *cpu = arg;
1321 sigset_t waitset;
1322 int r;
1323
1324 rcu_register_thread();
1325
1326 qemu_mutex_lock_iothread();
1327 qemu_thread_get_self(cpu->thread);
1328 cpu->thread_id = qemu_get_thread_id();
1329 cpu->can_do_io = 1;
1330 current_cpu = cpu;
1331
1332 sigemptyset(&waitset);
1333 sigaddset(&waitset, SIG_IPI);
1334
1335 /* signal CPU creation */
1336 cpu->created = true;
1337 qemu_cond_signal(&qemu_cpu_cond);
1338 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1339
1340 do {
1341 qemu_mutex_unlock_iothread();
1342 do {
1343 int sig;
1344 r = sigwait(&waitset, &sig);
1345 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1346 if (r == -1) {
1347 perror("sigwait");
1348 exit(1);
1349 }
1350 qemu_mutex_lock_iothread();
1351 qemu_wait_io_event(cpu);
1352 } while (!cpu->unplug);
1353
1354 qemu_mutex_unlock_iothread();
1355 rcu_unregister_thread();
1356 return NULL;
1357 #endif
1358 }
1359
1360 static int64_t tcg_get_icount_limit(void)
1361 {
1362 int64_t deadline;
1363
1364 if (replay_mode != REPLAY_MODE_PLAY) {
1365 /*
1366 * Include all the timers, because they may need an attention.
1367 * Too long CPU execution may create unnecessary delay in UI.
1368 */
1369 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1370 QEMU_TIMER_ATTR_ALL);
1371
1372 /* Maintain prior (possibly buggy) behaviour where if no deadline
1373 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1374 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1375 * nanoseconds.
1376 */
1377 if ((deadline < 0) || (deadline > INT32_MAX)) {
1378 deadline = INT32_MAX;
1379 }
1380
1381 return qemu_icount_round(deadline);
1382 } else {
1383 return replay_get_instructions();
1384 }
1385 }
1386
1387 static void handle_icount_deadline(void)
1388 {
1389 assert(qemu_in_vcpu_thread());
1390 if (use_icount) {
1391 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1392 QEMU_TIMER_ATTR_ALL);
1393
1394 if (deadline == 0) {
1395 /* Wake up other AioContexts. */
1396 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1397 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1398 }
1399 }
1400 }
1401
1402 static void prepare_icount_for_run(CPUState *cpu)
1403 {
1404 if (use_icount) {
1405 int insns_left;
1406
1407 /* These should always be cleared by process_icount_data after
1408 * each vCPU execution. However u16.high can be raised
1409 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1410 */
1411 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1412 g_assert(cpu->icount_extra == 0);
1413
1414 cpu->icount_budget = tcg_get_icount_limit();
1415 insns_left = MIN(0xffff, cpu->icount_budget);
1416 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1417 cpu->icount_extra = cpu->icount_budget - insns_left;
1418
1419 replay_mutex_lock();
1420 }
1421 }
1422
1423 static void process_icount_data(CPUState *cpu)
1424 {
1425 if (use_icount) {
1426 /* Account for executed instructions */
1427 cpu_update_icount(cpu);
1428
1429 /* Reset the counters */
1430 cpu_neg(cpu)->icount_decr.u16.low = 0;
1431 cpu->icount_extra = 0;
1432 cpu->icount_budget = 0;
1433
1434 replay_account_executed_instructions();
1435
1436 replay_mutex_unlock();
1437 }
1438 }
1439
1440
/*
 * Execute guest code on @cpu with TCG and return cpu_exec()'s result
 * (callers compare it against the EXCP_* codes).
 *
 * The call is bracketed by cpu_exec_start()/cpu_exec_end().  With
 * CONFIG_PROFILER the elapsed time is added to the TCG profile's
 * cpu_exec_time.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1461
1462 /* Destroy any remaining vCPUs which have been unplugged and have
1463 * finished running
1464 */
1465 static void deal_with_unplugged_cpus(void)
1466 {
1467 CPUState *cpu;
1468
1469 CPU_FOREACH(cpu) {
1470 if (cpu->unplug && !cpu_can_run(cpu)) {
1471 qemu_tcg_destroy_vcpu(cpu);
1472 cpu->created = false;
1473 qemu_cond_signal(&qemu_cpu_cond);
1474 break;
1475 }
1476 }
1477 }
1478
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread function shared by all vCPUs in round-robin TCG mode.  @arg is
 * the CPUState the thread was created for; the local 'cpu' is later
 * reused as the round-robin cursor.
 *
 * The main loop has no exit condition, so the trailing
 * rcu_unregister_thread()/return are never reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * The iothread mutex is dropped before taking the replay mutex
         * and re-taken afterwards, so the replay mutex is always
         * acquired first.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
        qemu_account_warp_timer();

        /* Run the timers here. This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the end of the CPU list has been reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run each CPU in turn until one has queued work or an exit
         * request, or the end of the list is reached.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code executes without the iothread mutex held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Step past an unplugged CPU before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1601
1602 static void *qemu_hax_cpu_thread_fn(void *arg)
1603 {
1604 CPUState *cpu = arg;
1605 int r;
1606
1607 rcu_register_thread();
1608 qemu_mutex_lock_iothread();
1609 qemu_thread_get_self(cpu->thread);
1610
1611 cpu->thread_id = qemu_get_thread_id();
1612 cpu->created = true;
1613 current_cpu = cpu;
1614
1615 hax_init_vcpu(cpu);
1616 qemu_cond_signal(&qemu_cpu_cond);
1617 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1618
1619 do {
1620 if (cpu_can_run(cpu)) {
1621 r = hax_smp_cpu_exec(cpu);
1622 if (r == EXCP_DEBUG) {
1623 cpu_handle_guest_debug(cpu);
1624 }
1625 }
1626
1627 qemu_wait_io_event(cpu);
1628 } while (!cpu->unplug || cpu_can_run(cpu));
1629 rcu_unregister_thread();
1630 return NULL;
1631 }
1632
/*
 * The HVF-specific vCPU thread function (@arg is the CPUState). This one
 * should only run when the host CPU supports the VMX "unrestricted guest"
 * feature.
 *
 * After initialising the vCPU and announcing creation on qemu_cpu_cond,
 * it alternates between executing the guest and waiting for events until
 * the CPU is unplugged and can no longer run, then destroys the vCPU and
 * clears cpu->created.  Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and tell the waiter. */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1676
/*
 * Thread function for a WHPX vCPU (@arg is the CPUState).
 *
 * Same structure as the other accelerator thread functions, except that
 * the idle wait is open-coded here instead of calling
 * qemu_wait_io_event() (so the Windows SleepEx() APC drain in that helper
 * is not performed).  Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        /* whpx_init_vcpu() reports failure as a negative errno value. */
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and tell the waiter. */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1720
1721 #ifdef _WIN32
/*
 * Empty APC routine queued by qemu_cpu_kick_thread() on Windows; queuing
 * it is what kicks the target thread, the routine itself does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1725 #endif
1726
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

/*
 * Thread function for one vCPU in MTTCG mode (@arg is the CPUState).
 * icount must be disabled (asserted).
 *
 * Guest code runs without the iothread mutex; the mutex is re-taken to
 * handle the result and to wait for events.  The loop ends once the CPU
 * is unplugged and can no longer run.  Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and tell the waiter. */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1798
1799 static void qemu_cpu_kick_thread(CPUState *cpu)
1800 {
1801 #ifndef _WIN32
1802 int err;
1803
1804 if (cpu->thread_kicked) {
1805 return;
1806 }
1807 cpu->thread_kicked = true;
1808 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1809 if (err && err != ESRCH) {
1810 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1811 exit(1);
1812 }
1813 #else /* _WIN32 */
1814 if (!qemu_cpu_is_self(cpu)) {
1815 if (whpx_enabled()) {
1816 whpx_vcpu_kick(cpu);
1817 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1818 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1819 __func__, GetLastError());
1820 exit(1);
1821 }
1822 }
1823 #endif
1824 }
1825
/*
 * Wake @cpu: broadcast its halt condition so a sleeping vCPU thread
 * re-evaluates its state, then force it out of guest execution.
 *
 * With TCG this is done via cpu_exit() plus a kick of the round-robin
 * CPU; with the other accelerators the host thread is kicked directly
 * (HAX additionally sets cpu->exit_request first).
 */
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        cpu_exit(cpu);
        /* NOP unless doing single-thread RR */
        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}
1844
/*
 * Kick the host thread of the calling vCPU.  current_cpu must be set
 * (asserted), i.e. this must be called from a vCPU thread.
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1850
/* Return true if the calling thread is the thread that runs @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1855
/*
 * Return true if the caller is running in a vCPU thread, i.e. current_cpu
 * is set and its thread is the calling thread.
 */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
1860
/*
 * Per-thread flag: true while this thread holds qemu_global_mutex via
 * qemu_mutex_lock_iothread_impl().
 */
static __thread bool iothread_locked = false;
1862
/*
 * Return true if the calling thread took the iothread mutex through
 * qemu_mutex_lock_iothread_impl() and has not yet released it with
 * qemu_mutex_unlock_iothread().
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1867
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * Acquire the iothread mutex (qemu_global_mutex) on behalf of the call
 * site at @file:@line, using the lock function currently installed in
 * qemu_bql_mutex_lock_func.  The lock is not recursive: the caller must
 * not already hold it (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1880
/*
 * Release the iothread mutex.  The calling thread must currently hold it
 * (asserted).  The per-thread flag is cleared before the mutex itself is
 * unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1887
1888 static bool all_vcpus_paused(void)
1889 {
1890 CPUState *cpu;
1891
1892 CPU_FOREACH(cpu) {
1893 if (!cpu->stopped) {
1894 return false;
1895 }
1896 }
1897
1898 return true;
1899 }
1900
/*
 * Stop every vCPU and wait until all of them report 'stopped'.
 *
 * The virtual clock is disabled first.  A vCPU running on the calling
 * thread is stopped directly; the others get a stop request and a kick.
 *
 * The function waits on qemu_pause_cond with qemu_global_mutex and
 * unlocks the replay mutex, so it expects the caller to hold both.  Both
 * are held again on return.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        /* Re-kick everyone after each wake-up until all have stopped. */
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Re-take the replay mutex.  The iothread mutex is dropped around it
     * so that the replay mutex is acquired first.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1931
/*
 * Let @cpu run again: clear both the pending stop request and the stopped
 * state, then kick the CPU so its thread notices.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1938
/*
 * Counterpart of pause_all_vcpus(): re-enable the virtual clock and
 * resume every vCPU.
 */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
1948
/*
 * Unplug @cpu and wait for its thread to terminate.
 *
 * The CPU is flagged for stop and unplug and then kicked.  The iothread
 * mutex (which the caller must hold) is released while joining the thread
 * and re-acquired before returning.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1958
/*
 * Size, in bytes including the terminating NUL, of the temporary buffers
 * used to form vCPU thread names such as "CPU %d/KVM".
 */
#define VCPU_THREAD_NAME_SIZE 16
1961
/*
 * Create (or attach to) the host thread that will run @cpu under TCG.
 *
 * With MTTCG every vCPU gets its own thread.  Otherwise the first call
 * creates a single round-robin thread, remembered in function-static
 * variables, and later calls make their CPU share that thread and its
 * halt condition.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember the shared thread for the CPUs created later. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread will announce creation, so mark it created here. */
        cpu->created = true;
    }
}
2017
2018 static void qemu_hax_start_vcpu(CPUState *cpu)
2019 {
2020 char thread_name[VCPU_THREAD_NAME_SIZE];
2021
2022 cpu->thread = g_malloc0(sizeof(QemuThread));
2023 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2024 qemu_cond_init(cpu->halt_cond);
2025
2026 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2027 cpu->cpu_index);
2028 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2029 cpu, QEMU_THREAD_JOINABLE);
2030 #ifdef _WIN32
2031 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2032 #endif
2033 }
2034
2035 static void qemu_kvm_start_vcpu(CPUState *cpu)
2036 {
2037 char thread_name[VCPU_THREAD_NAME_SIZE];
2038
2039 cpu->thread = g_malloc0(sizeof(QemuThread));
2040 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2041 qemu_cond_init(cpu->halt_cond);
2042 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2043 cpu->cpu_index);
2044 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2045 cpu, QEMU_THREAD_JOINABLE);
2046 }
2047
2048 static void qemu_hvf_start_vcpu(CPUState *cpu)
2049 {
2050 char thread_name[VCPU_THREAD_NAME_SIZE];
2051
2052 /* HVF currently does not support TCG, and only runs in
2053 * unrestricted-guest mode. */
2054 assert(hvf_enabled());
2055
2056 cpu->thread = g_malloc0(sizeof(QemuThread));
2057 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2058 qemu_cond_init(cpu->halt_cond);
2059
2060 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2061 cpu->cpu_index);
2062 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2063 cpu, QEMU_THREAD_JOINABLE);
2064 }
2065
2066 static void qemu_whpx_start_vcpu(CPUState *cpu)
2067 {
2068 char thread_name[VCPU_THREAD_NAME_SIZE];
2069
2070 cpu->thread = g_malloc0(sizeof(QemuThread));
2071 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2072 qemu_cond_init(cpu->halt_cond);
2073 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2074 cpu->cpu_index);
2075 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2076 cpu, QEMU_THREAD_JOINABLE);
2077 #ifdef _WIN32
2078 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2079 #endif
2080 }
2081
2082 static void qemu_dummy_start_vcpu(CPUState *cpu)
2083 {
2084 char thread_name[VCPU_THREAD_NAME_SIZE];
2085
2086 cpu->thread = g_malloc0(sizeof(QemuThread));
2087 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2088 qemu_cond_init(cpu->halt_cond);
2089 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2090 cpu->cpu_index);
2091 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2092 QEMU_THREAD_JOINABLE);
2093 }
2094
/*
 * Finish generic set-up of @cpu and start its host thread.
 *
 * Copies the SMP core/thread counts from the machine, marks the CPU as
 * stopped, seeds its guest RNG, gives it a default address space if it
 * has none, and starts the thread for whichever accelerator is enabled
 * (the dummy thread when none is).
 *
 * Blocks on qemu_cpu_cond, with qemu_global_mutex, until the CPU reports
 * itself as created; the caller is therefore expected to hold that mutex.
 */
void qemu_init_vcpu(CPUState *cpu)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    cpu->nr_cores = ms->smp.cores;
    cpu->nr_threads =  ms->smp.threads;
    cpu->stopped = true;
    cpu->random_seed = qemu_guest_random_seed_thread_part1();

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (hvf_enabled()) {
        qemu_hvf_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else if (whpx_enabled()) {
        qemu_whpx_start_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }

    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
2130
2131 void cpu_stop_current(void)
2132 {
2133 if (current_cpu) {
2134 current_cpu->stop = true;
2135 cpu_exit(current_cpu);
2136 }
2137 }
2138
2139 int vm_stop(RunState state)
2140 {
2141 if (qemu_in_vcpu_thread()) {
2142 qemu_system_vmstop_request_prepare();
2143 qemu_system_vmstop_request(state);
2144 /*
2145 * FIXME: should not return to device code in case
2146 * vm_stop() has been requested.
2147 */
2148 cpu_stop_current();
2149 return 0;
2150 }
2151
2152 return do_vm_stop(state, true);
2153 }
2154
/**
 * Prepare for (re)starting the VM.
 *
 * Consumes any pending vmstop request.  If the VM is stopped, emits the
 * QMP RESUME event, re-enables the tick counters, switches to
 * RUN_STATE_RUNNING and notifies the VM state listeners.
 *
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;

    qemu_vmstop_requested(&requested);
    /* Already running with no stop request pending: nothing to do. */
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop();
        qapi_event_send_resume();
        return -1;
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume();

    cpu_enable_ticks();
    runstate_set(RUN_STATE_RUNNING);
    vm_state_notify(1, RUN_STATE_RUNNING);
    return 0;
}
2188
/*
 * Start (or resume) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0, i.e. a restart is actually needed.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2195
2196 /* does a state transition even if the VM is already stopped,
2197 current state is forgotten forever */
2198 int vm_stop_force_state(RunState state)
2199 {
2200 if (runstate_is_running()) {
2201 return vm_stop(state);
2202 } else {
2203 runstate_set(state);
2204
2205 bdrv_drain_all();
2206 /* Make sure to return an error if the flush in a previous vm_stop()
2207 * failed. */
2208 return bdrv_flush_all();
2209 }
2210 }
2211
/*
 * List the available CPU models by calling the target's cpu_list(), if
 * the target defines that macro; otherwise do nothing.  @optarg is unused.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list();
#endif
}
2219
2220 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2221 bool has_cpu, int64_t cpu_index, Error **errp)
2222 {
2223 FILE *f;
2224 uint32_t l;
2225 CPUState *cpu;
2226 uint8_t buf[1024];
2227 int64_t orig_addr = addr, orig_size = size;
2228
2229 if (!has_cpu) {
2230 cpu_index = 0;
2231 }
2232
2233 cpu = qemu_get_cpu(cpu_index);
2234 if (cpu == NULL) {
2235 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2236 "a CPU number");
2237 return;
2238 }
2239
2240 f = fopen(filename, "wb");
2241 if (!f) {
2242 error_setg_file_open(errp, errno, filename);
2243 return;
2244 }
2245
2246 while (size != 0) {
2247 l = sizeof(buf);
2248 if (l > size)
2249 l = size;
2250 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2251 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2252 " specified", orig_addr, orig_size);
2253 goto exit;
2254 }
2255 if (fwrite(buf, 1, l, f) != l) {
2256 error_setg(errp, QERR_IO_ERROR);
2257 goto exit;
2258 }
2259 addr += l;
2260 size -= l;
2261 }
2262
2263 exit:
2264 fclose(f);
2265 }
2266
2267 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2268 Error **errp)
2269 {
2270 FILE *f;
2271 uint32_t l;
2272 uint8_t buf[1024];
2273
2274 f = fopen(filename, "wb");
2275 if (!f) {
2276 error_setg_file_open(errp, errno, filename);
2277 return;
2278 }
2279
2280 while (size != 0) {
2281 l = sizeof(buf);
2282 if (l > size)
2283 l = size;
2284 cpu_physical_memory_read(addr, buf, l);
2285 if (fwrite(buf, 1, l, f) != l) {
2286 error_setg(errp, QERR_IO_ERROR);
2287 goto exit;
2288 }
2289 addr += l;
2290 size -= l;
2291 }
2292
2293 exit:
2294 fclose(f);
2295 }
2296
/*
 * QMP 'inject-nmi': hand the request to nmi_monitor_handle() for the
 * monitor's current CPU index; failures are reported through @errp.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
2301
/*
 * Print icount drift statistics with qemu_printf(); does nothing unless
 * icount is in use.
 *
 * Always prints the difference between cpu_get_clock() and
 * cpu_get_icount() in milliseconds.  The maximum guest delay/advance are
 * printed only when icount_align_option is set, otherwise "NA".
 */
void dump_drift_info(void)
{
    if (!use_icount) {
        return;
    }

    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        /* max_delay is negated for display. */
        qemu_printf("Max guest delay     %"PRIi64" ms\n",
                    -max_delay / SCALE_MS);
        qemu_printf("Max guest advance   %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        qemu_printf("Max guest delay     NA\n");
        qemu_printf("Max guest advance   NA\n");
    }
}