cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "migration/vmstate.h"
  29 #include "monitor/monitor.h"
  30 #include "qapi/error.h"
  31 #include "qapi/qapi-commands-misc.h"
  32 #include "qapi/qapi-events-run-state.h"
  33 #include "qapi/qmp/qerror.h"
  34 #include "qemu/error-report.h"
  35 #include "qemu/qemu-print.h"
  36 #include "sysemu/tcg.h"
  37 #include "sysemu/block-backend.h"
  38 #include "exec/gdbstub.h"
  39 #include "sysemu/dma.h"
  40 #include "sysemu/hw_accel.h"
  41 #include "sysemu/kvm.h"
  42 #include "sysemu/hax.h"
  43 #include "sysemu/hvf.h"
  44 #include "sysemu/whpx.h"
  45 #include "exec/exec-all.h"
  46
  47 #include "qemu/thread.h"
  48 #include "sysemu/cpus.h"
  49 #include "sysemu/qtest.h"
  50 #include "qemu/main-loop.h"
  51 #include "qemu/option.h"
  52 #include "qemu/bitmap.h"
  53 #include "qemu/seqlock.h"
  54 #include "qemu/guest-random.h"
  55 #include "tcg.h"
  56 #include "hw/nmi.h"
  57 #include "sysemu/replay.h"
  58 #include "sysemu/runstate.h"
  59 #include "hw/boards.h"
  60 #include "hw/hw.h"
  61
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/*
 * Fallback values for the prctl() PR_MCE_KILL* constants, used only when
 * the host's <sys/prctl.h> does not already define them.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  79
/*
 * Mutex handed to qemu_cond_timedwait() by cpu_throttle_thread().
 * NOTE(review): presumably the lock behind qemu_mutex_lock_iothread() --
 * confirm against the rest of the file.
 */
static QemuMutex qemu_global_mutex;

/*
 * Not referenced in the part of the file visible here.
 * NOTE(review): presumably icount alignment statistics -- confirm.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer that drives cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/*
 * Current throttle percentage, accessed with atomic_read()/atomic_set().
 * 0 means throttling is off; cpu_throttle_set() clamps other values to
 * [CPU_THROTTLE_PCT_MIN, CPU_THROTTLE_PCT_MAX].
 */
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Length of one throttling timeslice: 10,000,000 ns (10 ms). */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  92
  93 bool cpu_is_stopped(CPUState *cpu)
  94 {
  95     return cpu->stopped || !runstate_is_running();
  96 }
  97
  98 static bool cpu_thread_is_idle(CPUState *cpu)
  99 {
 100     if (cpu->stop || cpu->queued_work_first) {
 101         return false;
 102     }
 103     if (cpu_is_stopped(cpu)) {
 104         return true;
 105     }
 106     if (!cpu->halted || cpu_has_work(cpu) ||
 107         kvm_halt_in_kernel()) {
 108         return false;
 109     }
 110     return true;
 111 }
 112
 113 static bool all_cpu_threads_idle(void)
 114 {
 115     CPUState *cpu;
 116
 117     CPU_FOREACH(cpu) {
 118         if (!cpu_thread_is_idle(cpu)) {
 119             return false;
 120         }
 121     }
 122     return true;
 123 }
 124
 125 /***********************************************************/
 126 /* guest cycle counter */
 127
 128 /* Protected by TimersState seqlock */
 129
/*
 * Mirrors the "sleep" icount option read in configure_icount() (default
 * true).  When false, qemu_start_warp_timer() jumps the virtual clock
 * straight to the next deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound that icount_adjust() enforces on icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
 133
/*
 * Global timing state: host-tick and monotonic-clock offsets plus the
 * instruction-counter (icount) bookkeeping used by the functions in this
 * file.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value handed out by cpu_get_ticks_locked(); keeps it monotonic. */
    int64_t cpu_ticks_prev;
    /* Added to cpu_get_host_ticks() while ticks are enabled. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Set to 1 by cpu_enable_ticks() and to 0 by cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    /* Used as a left-shift count: ns = icount << icount_time_shift. */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /*
     * Clock value at which a pending warp started, or -1 when no warp is
     * pending (see icount_warp_rt() and qemu_start_warp_timer()).
     */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() while ticks are enabled (cpu_get_clock_locked). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* Each stays NULL unless configure_icount() creates it. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 164
/* Single global instance; registered for migration in cpu_ticks_init(). */
static TimersState timers_state;
/* Chosen by qemu_tcg_configure(): true selects multi-threaded TCG. */
bool mttcg_enabled;
 167
 168 /*
 169  * We default to false if we know other options have been enabled
 170  * which are currently incompatible with MTTCG. Otherwise when each
 171  * guest (target) has been updated to support:
 172  *   - atomic instructions
 173  *   - memory ordering primitives (barriers)
 174  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 175  *
 176  * Once a guest architecture has been converted to the new primitives
 177  * there are two remaining limitations to check.
 178  *
 179  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 180  * - The host must have a stronger memory order than the guest
 181  *
 182  * It may be possible in future to support strong guests on weak hosts
 183  * but that will require tagging all load/stores in a guest with their
 184  * implicit memory order requirements which would likely slow things
 185  * down a lot.
 186  */
 187
 188 static bool check_tcg_memory_orders_compatible(void)
 189 {
 190 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 191     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 192 #else
 193     return false;
 194 #endif
 195 }
 196
 197 static bool default_mttcg_enabled(void)
 198 {
 199     if (use_icount || TCG_OVERSIZED_GUEST) {
 200         return false;
 201     } else {
 202 #ifdef TARGET_SUPPORTS_MTTCG
 203         return check_tcg_memory_orders_compatible();
 204 #else
 205         return false;
 206 #endif
 207     }
 208 }
 209
 210 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 211 {
 212     const char *t = qemu_opt_get(opts, "thread");
 213     if (t) {
 214         if (strcmp(t, "multi") == 0) {
 215             if (TCG_OVERSIZED_GUEST) {
 216                 error_setg(errp, "No MTTCG when guest word size > hosts");
 217             } else if (use_icount) {
 218                 error_setg(errp, "No MTTCG when icount is enabled");
 219             } else {
 220 #ifndef TARGET_SUPPORTS_MTTCG
 221                 warn_report("Guest not yet converted to MTTCG - "
 222                             "you may get unexpected results");
 223 #endif
 224                 if (!check_tcg_memory_orders_compatible()) {
 225                     warn_report("Guest expects a stronger memory ordering "
 226                                 "than the host provides");
 227                     error_printf("This may cause strange/hard to debug errors\n");
 228                 }
 229                 mttcg_enabled = true;
 230             }
 231         } else if (strcmp(t, "single") == 0) {
 232             mttcg_enabled = false;
 233         } else {
 234             error_setg(errp, "Invalid 'thread' setting %s", t);
 235         }
 236     } else {
 237         mttcg_enabled = default_mttcg_enabled();
 238     }
 239 }
 240
 241 /* The current number of executed instructions is based on what we
 242  * originally budgeted minus the current state of the decrementing
 243  * icount counters in extra/u16.low.
 244  */
 245 static int64_t cpu_get_icount_executed(CPUState *cpu)
 246 {
 247     return (cpu->icount_budget -
 248             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 249 }
 250
 251 /*
 252  * Update the global shared timer_state.qemu_icount to take into
 253  * account executed instructions. This is done by the TCG vCPU
 254  * thread so the main-loop can see time has moved forward.
 255  */
 256 static void cpu_update_icount_locked(CPUState *cpu)
 257 {
 258     int64_t executed = cpu_get_icount_executed(cpu);
 259     cpu->icount_budget -= executed;
 260
 261     atomic_set_i64(&timers_state.qemu_icount,
 262                    timers_state.qemu_icount + executed);
 263 }
 264
 265 /*
 266  * Update the global shared timer_state.qemu_icount to take into
 267  * account executed instructions. This is done by the TCG vCPU
 268  * thread so the main-loop can see time has moved forward.
 269  */
 270 void cpu_update_icount(CPUState *cpu)
 271 {
 272     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 273                        &timers_state.vm_clock_lock);
 274     cpu_update_icount_locked(cpu);
 275     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 276                          &timers_state.vm_clock_lock);
 277 }
 278
 279 static int64_t cpu_get_icount_raw_locked(void)
 280 {
 281     CPUState *cpu = current_cpu;
 282
 283     if (cpu && cpu->running) {
 284         if (!cpu->can_do_io) {
 285             error_report("Bad icount read");
 286             exit(1);
 287         }
 288         /* Take into account what has run */
 289         cpu_update_icount_locked(cpu);
 290     }
 291     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 292     return atomic_read_i64(&timers_state.qemu_icount);
 293 }
 294
 295 static int64_t cpu_get_icount_locked(void)
 296 {
 297     int64_t icount = cpu_get_icount_raw_locked();
 298     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 299         cpu_icount_to_ns(icount);
 300 }
 301
 302 int64_t cpu_get_icount_raw(void)
 303 {
 304     int64_t icount;
 305     unsigned start;
 306
 307     do {
 308         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 309         icount = cpu_get_icount_raw_locked();
 310     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 311
 312     return icount;
 313 }
 314
 315 /* Return the virtual CPU time, based on the instruction counter.  */
 316 int64_t cpu_get_icount(void)
 317 {
 318     int64_t icount;
 319     unsigned start;
 320
 321     do {
 322         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 323         icount = cpu_get_icount_locked();
 324     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 325
 326     return icount;
 327 }
 328
 329 int64_t cpu_icount_to_ns(int64_t icount)
 330 {
 331     return icount << atomic_read(&timers_state.icount_time_shift);
 332 }
 333
 334 static int64_t cpu_get_ticks_locked(void)
 335 {
 336     int64_t ticks = timers_state.cpu_ticks_offset;
 337     if (timers_state.cpu_ticks_enabled) {
 338         ticks += cpu_get_host_ticks();
 339     }
 340
 341     if (timers_state.cpu_ticks_prev > ticks) {
 342         /* Non increasing ticks may happen if the host uses software suspend.  */
 343         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 344         ticks = timers_state.cpu_ticks_prev;
 345     }
 346
 347     timers_state.cpu_ticks_prev = ticks;
 348     return ticks;
 349 }
 350
 351 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 352  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 353  * counter.
 354  */
 355 int64_t cpu_get_ticks(void)
 356 {
 357     int64_t ticks;
 358
 359     if (use_icount) {
 360         return cpu_get_icount();
 361     }
 362
 363     qemu_spin_lock(&timers_state.vm_clock_lock);
 364     ticks = cpu_get_ticks_locked();
 365     qemu_spin_unlock(&timers_state.vm_clock_lock);
 366     return ticks;
 367 }
 368
 369 static int64_t cpu_get_clock_locked(void)
 370 {
 371     int64_t time;
 372
 373     time = timers_state.cpu_clock_offset;
 374     if (timers_state.cpu_ticks_enabled) {
 375         time += get_clock();
 376     }
 377
 378     return time;
 379 }
 380
 381 /* Return the monotonic time elapsed in VM, i.e.,
 382  * the time between vm_start and vm_stop
 383  */
 384 int64_t cpu_get_clock(void)
 385 {
 386     int64_t ti;
 387     unsigned start;
 388
 389     do {
 390         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 391         ti = cpu_get_clock_locked();
 392     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 393
 394     return ti;
 395 }
 396
 397 /* enable cpu_get_ticks()
 398  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 399  */
 400 void cpu_enable_ticks(void)
 401 {
 402     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 403                        &timers_state.vm_clock_lock);
 404     if (!timers_state.cpu_ticks_enabled) {
 405         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 406         timers_state.cpu_clock_offset -= get_clock();
 407         timers_state.cpu_ticks_enabled = 1;
 408     }
 409     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 410                        &timers_state.vm_clock_lock);
 411 }
 412
 413 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 414  * cpu_get_ticks() after that.
 415  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 416  */
 417 void cpu_disable_ticks(void)
 418 {
 419     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 420                        &timers_state.vm_clock_lock);
 421     if (timers_state.cpu_ticks_enabled) {
 422         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 423         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 424         timers_state.cpu_ticks_enabled = 0;
 425     }
 426     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 427                          &timers_state.vm_clock_lock);
 428 }
 429
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.
   The tolerance is a tenth of a second, expressed in nanoseconds; it is
   used by icount_adjust() when deciding whether to change the shift.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 435
 436 static void icount_adjust(void)
 437 {
 438     int64_t cur_time;
 439     int64_t cur_icount;
 440     int64_t delta;
 441
 442     /* Protected by TimersState mutex.  */
 443     static int64_t last_delta;
 444
 445     /* If the VM is not running, then do nothing.  */
 446     if (!runstate_is_running()) {
 447         return;
 448     }
 449
 450     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 451                        &timers_state.vm_clock_lock);
 452     cur_time = cpu_get_clock_locked();
 453     cur_icount = cpu_get_icount_locked();
 454
 455     delta = cur_icount - cur_time;
 456     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 457     if (delta > 0
 458         && last_delta + ICOUNT_WOBBLE < delta * 2
 459         && timers_state.icount_time_shift > 0) {
 460         /* The guest is getting too far ahead.  Slow time down.  */
 461         atomic_set(&timers_state.icount_time_shift,
 462                    timers_state.icount_time_shift - 1);
 463     }
 464     if (delta < 0
 465         && last_delta - ICOUNT_WOBBLE > delta * 2
 466         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 467         /* The guest is getting too far behind.  Speed time up.  */
 468         atomic_set(&timers_state.icount_time_shift,
 469                    timers_state.icount_time_shift + 1);
 470     }
 471     last_delta = delta;
 472     atomic_set_i64(&timers_state.qemu_icount_bias,
 473                    cur_icount - (timers_state.qemu_icount
 474                                  << timers_state.icount_time_shift));
 475     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 476                          &timers_state.vm_clock_lock);
 477 }
 478
 479 static void icount_adjust_rt(void *opaque)
 480 {
 481     timer_mod(timers_state.icount_rt_timer,
 482               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 483     icount_adjust();
 484 }
 485
 486 static void icount_adjust_vm(void *opaque)
 487 {
 488     timer_mod(timers_state.icount_vm_timer,
 489                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 490                    NANOSECONDS_PER_SECOND / 10);
 491     icount_adjust();
 492 }
 493
 494 static int64_t qemu_icount_round(int64_t count)
 495 {
 496     int shift = atomic_read(&timers_state.icount_time_shift);
 497     return (count + (1 << shift) - 1) >> shift;
 498 }
 499
 500 static void icount_warp_rt(void)
 501 {
 502     unsigned seq;
 503     int64_t warp_start;
 504
 505     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 506      * changes from -1 to another value, so the race here is okay.
 507      */
 508     do {
 509         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 510         warp_start = timers_state.vm_clock_warp_start;
 511     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 512
 513     if (warp_start == -1) {
 514         return;
 515     }
 516
 517     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 518                        &timers_state.vm_clock_lock);
 519     if (runstate_is_running()) {
 520         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 521                                             cpu_get_clock_locked());
 522         int64_t warp_delta;
 523
 524         warp_delta = clock - timers_state.vm_clock_warp_start;
 525         if (use_icount == 2) {
 526             /*
 527              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 528              * far ahead of real time.
 529              */
 530             int64_t cur_icount = cpu_get_icount_locked();
 531             int64_t delta = clock - cur_icount;
 532             warp_delta = MIN(warp_delta, delta);
 533         }
 534         atomic_set_i64(&timers_state.qemu_icount_bias,
 535                        timers_state.qemu_icount_bias + warp_delta);
 536     }
 537     timers_state.vm_clock_warp_start = -1;
 538     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 539                        &timers_state.vm_clock_lock);
 540
 541     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 542         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 543     }
 544 }
 545
 546 static void icount_timer_cb(void *opaque)
 547 {
 548     /* No need for a checkpoint because the timer already synchronizes
 549      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 550      */
 551     icount_warp_rt();
 552 }
 553
 554 void qtest_clock_warp(int64_t dest)
 555 {
 556     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 557     AioContext *aio_context;
 558     assert(qtest_enabled());
 559     aio_context = qemu_get_aio_context();
 560     while (clock < dest) {
 561         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 562                                                       QEMU_TIMER_ATTR_ALL);
 563         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 564
 565         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 566                            &timers_state.vm_clock_lock);
 567         atomic_set_i64(&timers_state.qemu_icount_bias,
 568                        timers_state.qemu_icount_bias + warp);
 569         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 570                              &timers_state.vm_clock_lock);
 571
 572         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 573         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 574         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 575     }
 576     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 577 }
 578
 579 void qemu_start_warp_timer(void)
 580 {
 581     int64_t clock;
 582     int64_t deadline;
 583
 584     if (!use_icount) {
 585         return;
 586     }
 587
 588     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 589      * do not fire, so computing the deadline does not make sense.
 590      */
 591     if (!runstate_is_running()) {
 592         return;
 593     }
 594
 595     if (replay_mode != REPLAY_MODE_PLAY) {
 596         if (!all_cpu_threads_idle()) {
 597             return;
 598         }
 599
 600         if (qtest_enabled()) {
 601             /* When testing, qtest commands advance icount.  */
 602             return;
 603         }
 604
 605         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 606     } else {
 607         /* warp clock deterministically in record/replay mode */
 608         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 609             /* vCPU is sleeping and warp can't be started.
 610                It is probably a race condition: notification sent
 611                to vCPU was processed in advance and vCPU went to sleep.
 612                Therefore we have to wake it up for doing someting. */
 613             if (replay_has_checkpoint()) {
 614                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 615             }
 616             return;
 617         }
 618     }
 619
 620     /* We want to use the earliest deadline from ALL vm_clocks */
 621     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 622     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 623                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 624     if (deadline < 0) {
 625         static bool notified;
 626         if (!icount_sleep && !notified) {
 627             warn_report("icount sleep disabled and no active timers");
 628             notified = true;
 629         }
 630         return;
 631     }
 632
 633     if (deadline > 0) {
 634         /*
 635          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 636          * sleep.  Otherwise, the CPU might be waiting for a future timer
 637          * interrupt to wake it up, but the interrupt never comes because
 638          * the vCPU isn't running any insns and thus doesn't advance the
 639          * QEMU_CLOCK_VIRTUAL.
 640          */
 641         if (!icount_sleep) {
 642             /*
 643              * We never let VCPUs sleep in no sleep icount mode.
 644              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 645              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 646              * It is useful when we want a deterministic execution time,
 647              * isolated from host latencies.
 648              */
 649             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 650                                &timers_state.vm_clock_lock);
 651             atomic_set_i64(&timers_state.qemu_icount_bias,
 652                            timers_state.qemu_icount_bias + deadline);
 653             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 654                                  &timers_state.vm_clock_lock);
 655             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 656         } else {
 657             /*
 658              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 659              * "real" time, (related to the time left until the next event) has
 660              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 661              * This avoids that the warps are visible externally; for example,
 662              * you will not be sending network packets continuously instead of
 663              * every 100ms.
 664              */
 665             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 666                                &timers_state.vm_clock_lock);
 667             if (timers_state.vm_clock_warp_start == -1
 668                 || timers_state.vm_clock_warp_start > clock) {
 669                 timers_state.vm_clock_warp_start = clock;
 670             }
 671             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 672                                  &timers_state.vm_clock_lock);
 673             timer_mod_anticipate(timers_state.icount_warp_timer,
 674                                  clock + deadline);
 675         }
 676     } else if (deadline == 0) {
 677         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 678     }
 679 }
 680
 681 static void qemu_account_warp_timer(void)
 682 {
 683     if (!use_icount || !icount_sleep) {
 684         return;
 685     }
 686
 687     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 688      * do not fire, so computing the deadline does not make sense.
 689      */
 690     if (!runstate_is_running()) {
 691         return;
 692     }
 693
 694     /* warp clock deterministically in record/replay mode */
 695     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 696         return;
 697     }
 698
 699     timer_del(timers_state.icount_warp_timer);
 700     icount_warp_rt();
 701 }
 702
 703 static bool icount_state_needed(void *opaque)
 704 {
 705     return use_icount;
 706 }
 707
 708 static bool warp_timer_state_needed(void *opaque)
 709 {
 710     TimersState *s = opaque;
 711     return s->icount_warp_timer != NULL;
 712 }
 713
 714 static bool adjust_timers_state_needed(void *opaque)
 715 {
 716     TimersState *s = opaque;
 717     return s->icount_rt_timer != NULL;
 718 }
 719
/*
 * Subsection for warp timer migration is optional, because the warp timer
 * may not have been created (see warp_timer_state_needed()).
 * Carries the pending warp start value and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 734
/*
 * Optional subsection carrying the two icount adjustment timers; it is
 * included only when adjust_timers_state_needed() reports that the
 * realtime adjust timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 746
/*
 * This is a subsection for icount migration.
 * Included only when icount_state_needed() is true; carries the icount
 * bias and counter and nests the warp-timer and adjust-timers subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 766
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  cpu_clock_offset is present from version 2 on.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): presumably a placeholder for a removed 8-byte
         * field, kept for stream compatibility -- confirm. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 782
 783 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 784 {
 785     double pct;
 786     double throttle_ratio;
 787     int64_t sleeptime_ns, endtime_ns;
 788
 789     if (!cpu_throttle_get_percentage()) {
 790         return;
 791     }
 792
 793     pct = (double)cpu_throttle_get_percentage()/100;
 794     throttle_ratio = pct / (1 - pct);
 795     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 796     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 797     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 798     while (sleeptime_ns > 0 && !cpu->stop) {
 799         if (sleeptime_ns > SCALE_MS) {
 800             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 801                                 sleeptime_ns / SCALE_MS);
 802         } else {
 803             qemu_mutex_unlock_iothread();
 804             g_usleep(sleeptime_ns / SCALE_US);
 805             qemu_mutex_lock_iothread();
 806         }
 807         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 808     }
 809     atomic_set(&cpu->throttle_thread_scheduled, 0);
 810 }
 811
 812 static void cpu_throttle_timer_tick(void *opaque)
 813 {
 814     CPUState *cpu;
 815     double pct;
 816
 817     /* Stop the timer if needed */
 818     if (!cpu_throttle_get_percentage()) {
 819         return;
 820     }
 821     CPU_FOREACH(cpu) {
 822         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 823             async_run_on_cpu(cpu, cpu_throttle_thread,
 824                              RUN_ON_CPU_NULL);
 825         }
 826     }
 827
 828     pct = (double)cpu_throttle_get_percentage()/100;
 829     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 830                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 831 }
 832
 833 void cpu_throttle_set(int new_throttle_pct)
 834 {
 835     /* Ensure throttle percentage is within valid range */
 836     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 837     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 838
 839     atomic_set(&throttle_percentage, new_throttle_pct);
 840
 841     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 842                                        CPU_THROTTLE_TIMESLICE_NS);
 843 }
 844
 845 void cpu_throttle_stop(void)
 846 {
 847     atomic_set(&throttle_percentage, 0);
 848 }
 849
 850 bool cpu_throttle_active(void)
 851 {
 852     return (cpu_throttle_get_percentage() != 0);
 853 }
 854
 855 int cpu_throttle_get_percentage(void)
 856 {
 857     return atomic_read(&throttle_percentage);
 858 }
 859
 860 void cpu_ticks_init(void)
 861 {
 862     seqlock_init(&timers_state.vm_clock_seqlock);
 863     qemu_spin_init(&timers_state.vm_clock_lock);
 864     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 865     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 866                                            cpu_throttle_timer_tick, NULL);
 867 }
 868
 869 void configure_icount(QemuOpts *opts, Error **errp)
 870 {
 871     const char *option;
 872     char *rem_str = NULL;
 873
 874     option = qemu_opt_get(opts, "shift");
 875     if (!option) {
 876         if (qemu_opt_get(opts, "align") != NULL) {
 877             error_setg(errp, "Please specify shift option when using align");
 878         }
 879         return;
 880     }
 881
 882     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 883     if (icount_sleep) {
 884         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 885                                          icount_timer_cb, NULL);
 886     }
 887
 888     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 889
 890     if (icount_align_option && !icount_sleep) {
 891         error_setg(errp, "align=on and sleep=off are incompatible");
 892     }
 893     if (strcmp(option, "auto") != 0) {
 894         errno = 0;
 895         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 896         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 897             error_setg(errp, "icount: Invalid shift value");
 898         }
 899         use_icount = 1;
 900         return;
 901     } else if (icount_align_option) {
 902         error_setg(errp, "shift=auto and align=on are incompatible");
 903     } else if (!icount_sleep) {
 904         error_setg(errp, "shift=auto and sleep=off are incompatible");
 905     }
 906
 907     use_icount = 2;
 908
 909     /* 125MIPS seems a reasonable initial guess at the guest speed.
 910        It will be corrected fairly quickly anyway.  */
 911     timers_state.icount_time_shift = 3;
 912
 913     /* Have both realtime and virtual time triggers for speed adjustment.
 914        The realtime trigger catches emulated time passing too slowly,
 915        the virtual time trigger catches emulated time passing too fast.
 916        Realtime triggers occur even when idle, so use them less frequently
 917        than VM triggers.  */
 918     timers_state.vm_clock_warp_start = -1;
 919     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 920                                    icount_adjust_rt, NULL);
 921     timer_mod(timers_state.icount_rt_timer,
 922                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 923     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 924                                         icount_adjust_vm, NULL);
 925     timer_mod(timers_state.icount_vm_timer,
 926                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 927                    NANOSECONDS_PER_SECOND / 10);
 928 }
 929
 930 /***********************************************************/
 931 /* TCG vCPU kick timer
 932  *
 933  * The kick timer is responsible for moving single threaded vCPU
 934  * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 936  * scheduled.
 937  *
 938  * The timer is removed if all vCPUs are idle and restarted again once
 939  * idleness is complete.
 940  */
 941
/* Kick timer; NULL until start_tcg_kick_timer() creates it. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU currently being run by the round-robin thread, NULL in between. */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks in nanoseconds: one tenth of a second. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 946
 947 static inline int64_t qemu_tcg_next_kick(void)
 948 {
 949     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 950 }
 951
 952 /* Kick the currently round-robin scheduled vCPU */
 953 static void qemu_cpu_kick_rr_cpu(void)
 954 {
 955     CPUState *cpu;
 956     do {
 957         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 958         if (cpu) {
 959             cpu_exit(cpu);
 960         }
 961     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 962 }
 963
/*
 * Empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having work pending.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 967
 968 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 969 {
 970     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 971         qemu_notify_event();
 972         return;
 973     }
 974
 975     if (qemu_in_vcpu_thread()) {
 976         /* A CPU is currently running; kick it back out to the
 977          * tcg_cpu_exec() loop so it will recalculate its
 978          * icount deadline immediately.
 979          */
 980         qemu_cpu_kick(current_cpu);
 981     } else if (first_cpu) {
 982         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 983          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 984          * causes cpu_thread_is_idle to return false.  This way,
 985          * handle_icount_deadline can run.
 986          * If we have no CPUs at all for some reason, we don't
 987          * need to do anything.
 988          */
 989         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 990     }
 991 }
 992
 993 static void kick_tcg_thread(void *opaque)
 994 {
 995     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 996     qemu_cpu_kick_rr_cpu();
 997 }
 998
 999 static void start_tcg_kick_timer(void)
1000 {
1001     assert(!mttcg_enabled);
1002     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1003         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1004                                            kick_tcg_thread, NULL);
1005     }
1006     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1007         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1008     }
1009 }
1010
1011 static void stop_tcg_kick_timer(void)
1012 {
1013     assert(!mttcg_enabled);
1014     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1015         timer_del(tcg_kick_vcpu_timer);
1016     }
1017 }
1018
1019 /***********************************************************/
1020 void hw_error(const char *fmt, ...)
1021 {
1022     va_list ap;
1023     CPUState *cpu;
1024
1025     va_start(ap, fmt);
1026     fprintf(stderr, "qemu: hardware error: ");
1027     vfprintf(stderr, fmt, ap);
1028     fprintf(stderr, "\n");
1029     CPU_FOREACH(cpu) {
1030         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1031         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1032     }
1033     va_end(ap);
1034     abort();
1035 }
1036
1037 void cpu_synchronize_all_states(void)
1038 {
1039     CPUState *cpu;
1040
1041     CPU_FOREACH(cpu) {
1042         cpu_synchronize_state(cpu);
1043         /* TODO: move to cpu_synchronize_state() */
1044         if (hvf_enabled()) {
1045             hvf_cpu_synchronize_state(cpu);
1046         }
1047     }
1048 }
1049
1050 void cpu_synchronize_all_post_reset(void)
1051 {
1052     CPUState *cpu;
1053
1054     CPU_FOREACH(cpu) {
1055         cpu_synchronize_post_reset(cpu);
1056         /* TODO: move to cpu_synchronize_post_reset() */
1057         if (hvf_enabled()) {
1058             hvf_cpu_synchronize_post_reset(cpu);
1059         }
1060     }
1061 }
1062
1063 void cpu_synchronize_all_post_init(void)
1064 {
1065     CPUState *cpu;
1066
1067     CPU_FOREACH(cpu) {
1068         cpu_synchronize_post_init(cpu);
1069         /* TODO: move to cpu_synchronize_post_init() */
1070         if (hvf_enabled()) {
1071             hvf_cpu_synchronize_post_init(cpu);
1072         }
1073     }
1074 }
1075
1076 void cpu_synchronize_all_pre_loadvm(void)
1077 {
1078     CPUState *cpu;
1079
1080     CPU_FOREACH(cpu) {
1081         cpu_synchronize_pre_loadvm(cpu);
1082     }
1083 }
1084
1085 static int do_vm_stop(RunState state, bool send_stop)
1086 {
1087     int ret = 0;
1088
1089     if (runstate_is_running()) {
1090         cpu_disable_ticks();
1091         pause_all_vcpus();
1092         runstate_set(state);
1093         vm_state_notify(0, state);
1094         if (send_stop) {
1095             qapi_event_send_stop();
1096         }
1097     }
1098
1099     bdrv_drain_all();
1100     ret = bdrv_flush_all();
1101
1102     return ret;
1103 }
1104
/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 *
 * Enters RUN_STATE_SHUTDOWN without sending the STOP event and returns the
 * result of do_vm_stop().
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1112
1113 static bool cpu_can_run(CPUState *cpu)
1114 {
1115     if (cpu->stop) {
1116         return false;
1117     }
1118     if (cpu_is_stopped(cpu)) {
1119         return false;
1120     }
1121     return true;
1122 }
1123
/*
 * Handle an EXCP_DEBUG exit: tell the gdbstub which CPU stopped, request
 * a debug stop of the system, and mark this CPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1130
1131 #ifdef CONFIG_LINUX
/*
 * Re-deliver SIGBUS with the default disposition so that it takes its
 * default action, and abort() if the process is somehow still alive.
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        /*
         * Raise first, then unblock.
         * NOTE(review): presumably SIGBUS is blocked here because we run
         * inside its handler, so the raised signal stays pending until the
         * unblock below -- confirm.
         */
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    /* Reached if sigaction() failed or the signal did not end the process. */
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
1148
1149 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1150 {
1151     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1152         sigbus_reraise();
1153     }
1154
1155     if (current_cpu) {
1156         /* Called asynchronously in VCPU thread.  */
1157         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1158             sigbus_reraise();
1159         }
1160     } else {
1161         /* Called synchronously (via signalfd) in main thread.  */
1162         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1163             sigbus_reraise();
1164         }
1165     }
1166 }
1167
1168 static void qemu_init_sigbus(void)
1169 {
1170     struct sigaction action;
1171
1172     memset(&action, 0, sizeof(action));
1173     action.sa_flags = SA_SIGINFO;
1174     action.sa_sigaction = sigbus_handler;
1175     sigaction(SIGBUS, &action, NULL);
1176
1177     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1178 }
1179 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
}
1183 #endif /* !CONFIG_LINUX */
1184
/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* system init: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1191
1192 void qemu_init_cpu_loop(void)
1193 {
1194     qemu_init_sigbus();
1195     qemu_cond_init(&qemu_cpu_cond);
1196     qemu_cond_init(&qemu_pause_cond);
1197     qemu_mutex_init(&qemu_global_mutex);
1198
1199     qemu_thread_get_self(&io_thread);
1200 }
1201
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex as the lock argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1206
1207 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1208 {
1209     if (kvm_destroy_vcpu(cpu) < 0) {
1210         error_report("kvm_destroy_vcpu failed");
1211         exit(EXIT_FAILURE);
1212     }
1213 }
1214
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1218
1219 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1220 {
1221     g_assert(qemu_cpu_is_self(cpu));
1222     cpu->stop = false;
1223     cpu->stopped = true;
1224     if (exit) {
1225         cpu_exit(cpu);
1226     }
1227     qemu_cond_broadcast(&qemu_pause_cond);
1228 }
1229
1230 static void qemu_wait_io_event_common(CPUState *cpu)
1231 {
1232     atomic_mb_set(&cpu->thread_kicked, false);
1233     if (cpu->stop) {
1234         qemu_cpu_stop(cpu, false);
1235     }
1236     process_queued_cpu_work(cpu);
1237 }
1238
1239 static void qemu_tcg_rr_wait_io_event(void)
1240 {
1241     CPUState *cpu;
1242
1243     while (all_cpu_threads_idle()) {
1244         stop_tcg_kick_timer();
1245         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1246     }
1247
1248     start_tcg_kick_timer();
1249
1250     CPU_FOREACH(cpu) {
1251         qemu_wait_io_event_common(cpu);
1252     }
1253 }
1254
1255 static void qemu_wait_io_event(CPUState *cpu)
1256 {
1257     while (cpu_thread_is_idle(cpu)) {
1258         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1259     }
1260
1261 #ifdef _WIN32
1262     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1263     if (!tcg_enabled()) {
1264         SleepEx(0, TRUE);
1265     }
1266 #endif
1267     qemu_wait_io_event_common(cpu);
1268 }
1269
/*
 * Thread function for a KVM vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Takes the iothread lock, records the thread identity in @arg,
 * initialises the KVM vCPU (exiting the process on failure), announces
 * creation on qemu_cpu_cond, then alternates between executing the vCPU
 * and waiting for events.  Once the CPU is unplugged and can no longer
 * run, the vCPU is destroyed and the lock released.  Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* Keep going until the CPU is unplugged and no longer runnable. */
    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce that the vCPU no longer exists. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1313
1314 static void *qemu_dummy_cpu_thread_fn(void *arg)
1315 {
1316 #ifdef _WIN32
1317     error_report("qtest is not supported under Windows");
1318     exit(1);
1319 #else
1320     CPUState *cpu = arg;
1321     sigset_t waitset;
1322     int r;
1323
1324     rcu_register_thread();
1325
1326     qemu_mutex_lock_iothread();
1327     qemu_thread_get_self(cpu->thread);
1328     cpu->thread_id = qemu_get_thread_id();
1329     cpu->can_do_io = 1;
1330     current_cpu = cpu;
1331
1332     sigemptyset(&waitset);
1333     sigaddset(&waitset, SIG_IPI);
1334
1335     /* signal CPU creation */
1336     cpu->created = true;
1337     qemu_cond_signal(&qemu_cpu_cond);
1338     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1339
1340     do {
1341         qemu_mutex_unlock_iothread();
1342         do {
1343             int sig;
1344             r = sigwait(&waitset, &sig);
1345         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1346         if (r == -1) {
1347             perror("sigwait");
1348             exit(1);
1349         }
1350         qemu_mutex_lock_iothread();
1351         qemu_wait_io_event(cpu);
1352     } while (!cpu->unplug);
1353
1354     qemu_mutex_unlock_iothread();
1355     rcu_unregister_thread();
1356     return NULL;
1357 #endif
1358 }
1359
1360 static int64_t tcg_get_icount_limit(void)
1361 {
1362     int64_t deadline;
1363
1364     if (replay_mode != REPLAY_MODE_PLAY) {
1365         /*
1366          * Include all the timers, because they may need an attention.
1367          * Too long CPU execution may create unnecessary delay in UI.
1368          */
1369         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1370                                               QEMU_TIMER_ATTR_ALL);
1371
1372         /* Maintain prior (possibly buggy) behaviour where if no deadline
1373          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1374          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1375          * nanoseconds.
1376          */
1377         if ((deadline < 0) || (deadline > INT32_MAX)) {
1378             deadline = INT32_MAX;
1379         }
1380
1381         return qemu_icount_round(deadline);
1382     } else {
1383         return replay_get_instructions();
1384     }
1385 }
1386
1387 static void handle_icount_deadline(void)
1388 {
1389     assert(qemu_in_vcpu_thread());
1390     if (use_icount) {
1391         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1392                                                       QEMU_TIMER_ATTR_ALL);
1393
1394         if (deadline == 0) {
1395             /* Wake up other AioContexts.  */
1396             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1397             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1398         }
1399     }
1400 }
1401
1402 static void prepare_icount_for_run(CPUState *cpu)
1403 {
1404     if (use_icount) {
1405         int insns_left;
1406
1407         /* These should always be cleared by process_icount_data after
1408          * each vCPU execution. However u16.high can be raised
1409          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1410          */
1411         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1412         g_assert(cpu->icount_extra == 0);
1413
1414         cpu->icount_budget = tcg_get_icount_limit();
1415         insns_left = MIN(0xffff, cpu->icount_budget);
1416         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1417         cpu->icount_extra = cpu->icount_budget - insns_left;
1418
1419         replay_mutex_lock();
1420     }
1421 }
1422
1423 static void process_icount_data(CPUState *cpu)
1424 {
1425     if (use_icount) {
1426         /* Account for executed instructions */
1427         cpu_update_icount(cpu);
1428
1429         /* Reset the counters */
1430         cpu_neg(cpu)->icount_decr.u16.low = 0;
1431         cpu->icount_extra = 0;
1432         cpu->icount_budget = 0;
1433
1434         replay_account_executed_instructions();
1435
1436         replay_mutex_unlock();
1437     }
1438 }
1439
1440
1441 static int tcg_cpu_exec(CPUState *cpu)
1442 {
1443     int ret;
1444 #ifdef CONFIG_PROFILER
1445     int64_t ti;
1446 #endif
1447
1448     assert(tcg_enabled());
1449 #ifdef CONFIG_PROFILER
1450     ti = profile_getclock();
1451 #endif
1452     cpu_exec_start(cpu);
1453     ret = cpu_exec(cpu);
1454     cpu_exec_end(cpu);
1455 #ifdef CONFIG_PROFILER
1456     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1457                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1458 #endif
1459     return ret;
1460 }
1461
1462 /* Destroy any remaining vCPUs which have been unplugged and have
1463  * finished running
1464  */
1465 static void deal_with_unplugged_cpus(void)
1466 {
1467     CPUState *cpu;
1468
1469     CPU_FOREACH(cpu) {
1470         if (cpu->unplug && !cpu_can_run(cpu)) {
1471             qemu_tcg_destroy_vcpu(cpu);
1472             cpu->created = false;
1473             qemu_cond_signal(&qemu_cpu_cond);
1474             break;
1475         }
1476     }
1477 }
1478
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 *
 * @arg is the CPUState the thread was created for; after start-up the
 * function walks the CPU list itself, beginning at first_cpu.  The main
 * loop never terminates, so the statements after it are not reached.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * The iothread lock is dropped and retaken around
         * replay_mutex_lock(), so the replay mutex is always acquired
         * first.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the head of the CPU list. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Run vCPUs in turn until one has queued work or an exit request. */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code executes without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* For an unplugged CPU, move on before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1601
1602 static void *qemu_hax_cpu_thread_fn(void *arg)
1603 {
1604     CPUState *cpu = arg;
1605     int r;
1606
1607     rcu_register_thread();
1608     qemu_mutex_lock_iothread();
1609     qemu_thread_get_self(cpu->thread);
1610
1611     cpu->thread_id = qemu_get_thread_id();
1612     cpu->created = true;
1613     current_cpu = cpu;
1614
1615     hax_init_vcpu(cpu);
1616     qemu_cond_signal(&qemu_cpu_cond);
1617     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1618
1619     do {
1620         if (cpu_can_run(cpu)) {
1621             r = hax_smp_cpu_exec(cpu);
1622             if (r == EXCP_DEBUG) {
1623                 cpu_handle_guest_debug(cpu);
1624             }
1625         }
1626
1627         qemu_wait_io_event(cpu);
1628     } while (!cpu->unplug || cpu_can_run(cpu));
1629     rcu_unregister_thread();
1630     return NULL;
1631 }
1632
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState this thread drives.  The thread holds the iothread
 * lock while it sets up the vCPU, announces creation on qemu_cpu_cond,
 * and then loops executing the vCPU and waiting for events until the CPU
 * is unplugged and can no longer run.  Finally the vCPU is destroyed and
 * the lock released.  Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* Keep going until the CPU is unplugged and no longer runnable. */
    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce that the vCPU no longer exists. */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1676
/*
 * Thread function for a WHPX vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Takes the iothread lock, records the thread identity, initialises the
 * WHPX vCPU (exiting the process on failure), announces creation on
 * qemu_cpu_cond, then alternates between executing the vCPU and waiting
 * for events until the CPU is unplugged and can no longer run.  Finally
 * the vCPU is destroyed and the lock released.  Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /*
         * The idle wait is open-coded here and followed by the common
         * wakeup processing, rather than going through
         * qemu_wait_io_event().
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce that the vCPU no longer exists. */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1720
1721 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it to kick a vCPU thread. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1725 #endif
1726
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 *
 * @arg is the CPUState this thread drives.  icount must not be in use.
 * The thread loops executing the vCPU (with the iothread lock dropped)
 * and waiting for events until the CPU is unplugged and can no longer
 * run, then tears the vCPU down.  Always returns NULL.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through: nothing more to do for this case */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce that the vCPU no longer exists. */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1798
/*
 * Kick the host thread behind @cpu.
 *
 * Outside Windows this sends SIG_IPI with pthread_kill() unless a kick is
 * already outstanding (thread_kicked is cleared again in
 * qemu_wait_io_event_common()).  On Windows it kicks through WHPX when
 * that is enabled, or else queues a dummy APC; nothing is done when called
 * from the vCPU's own thread.  Any failure terminates QEMU.
 */
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    /* ESRCH is tolerated rather than treated as fatal. */
    if (err && err != ESRCH) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (whpx_enabled()) {
            whpx_vcpu_kick(cpu);
        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}
1825
1826 void qemu_cpu_kick(CPUState *cpu)
1827 {
1828     qemu_cond_broadcast(cpu->halt_cond);
1829     if (tcg_enabled()) {
1830         cpu_exit(cpu);
1831         /* NOP unless doing single-thread RR */
1832         qemu_cpu_kick_rr_cpu();
1833     } else {
1834         if (hax_enabled()) {
1835             /*
1836              * FIXME: race condition with the exit_request check in
1837              * hax_vcpu_hax_exec
1838              */
1839             cpu->exit_request = 1;
1840         }
1841         qemu_cpu_kick_thread(cpu);
1842     }
1843 }
1844
/* Kick the thread of the calling vCPU; current_cpu must be set. */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1850
1851 bool qemu_cpu_is_self(CPUState *cpu)
1852 {
1853     return qemu_thread_is_self(cpu->thread);
1854 }
1855
1856 bool qemu_in_vcpu_thread(void)
1857 {
1858     return current_cpu && qemu_cpu_is_self(current_cpu);
1859 }
1860
/* Per-thread flag tracking whether this thread holds the iothread lock. */
static __thread bool iothread_locked;

/* Return true when the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1867
1868 /*
1869  * The BQL is taken from so many places that it is worth profiling the
1870  * callers directly, instead of funneling them all through a single function.
1871  */
1872 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1873 {
1874     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1875
1876     g_assert(!qemu_mutex_iothread_locked());
1877     bql_lock(&qemu_global_mutex, file, line);
1878     iothread_locked = true;
1879 }
1880
/*
 * Release the iothread lock; the calling thread must hold it.  The
 * per-thread flag is cleared before the mutex itself is unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1887
1888 static bool all_vcpus_paused(void)
1889 {
1890     CPUState *cpu;
1891
1892     CPU_FOREACH(cpu) {
1893         if (!cpu->stopped) {
1894             return false;
1895         }
1896     }
1897
1898     return true;
1899 }
1900
/*
 * Stop every vCPU and wait until all of them report being stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  A vCPU running on the calling
 * thread is stopped directly; every other vCPU gets a stop request and a
 * kick.  The function then waits on qemu_pause_cond, kicking all vCPUs
 * again after each wakeup, until all_vcpus_paused() holds.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Retake the replay mutex with the iothread lock released, so that
     * the replay mutex is acquired before the iothread lock.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1931
/* Clear @cpu's stop request and stopped state, then kick it awake. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1938
1939 void resume_all_vcpus(void)
1940 {
1941     CPUState *cpu;
1942
1943     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1944     CPU_FOREACH(cpu) {
1945         cpu_resume(cpu);
1946     }
1947 }
1948
/*
 * Request removal of @cpu and wait for its thread to finish.
 *
 * The CPU is flagged for stop and unplug and kicked; the iothread lock is
 * released while joining the thread and retaken afterwards.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1958
/* Size in bytes of the temporary buffers used for forming a thread name */
#define VCPU_THREAD_NAME_SIZE 16
1961
1962 static void qemu_tcg_init_vcpu(CPUState *cpu)
1963 {
1964     char thread_name[VCPU_THREAD_NAME_SIZE];
1965     static QemuCond *single_tcg_halt_cond;
1966     static QemuThread *single_tcg_cpu_thread;
1967     static int tcg_region_inited;
1968
1969     assert(tcg_enabled());
1970     /*
1971      * Initialize TCG regions--once. Now is a good time, because:
1972      * (1) TCG's init context, prologue and target globals have been set up.
1973      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1974      *     -accel flag is processed, so the check doesn't work then).
1975      */
1976     if (!tcg_region_inited) {
1977         tcg_region_inited = 1;
1978         tcg_region_init();
1979     }
1980
1981     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1982         cpu->thread = g_malloc0(sizeof(QemuThread));
1983         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1984         qemu_cond_init(cpu->halt_cond);
1985
1986         if (qemu_tcg_mttcg_enabled()) {
1987             /* create a thread per vCPU with TCG (MTTCG) */
1988             parallel_cpus = true;
1989             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1990                  cpu->cpu_index);
1991
1992             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1993                                cpu, QEMU_THREAD_JOINABLE);
1994
1995         } else {
1996             /* share a single thread for all cpus with TCG */
1997             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1998             qemu_thread_create(cpu->thread, thread_name,
1999                                qemu_tcg_rr_cpu_thread_fn,
2000                                cpu, QEMU_THREAD_JOINABLE);
2001
2002             single_tcg_halt_cond = cpu->halt_cond;
2003             single_tcg_cpu_thread = cpu->thread;
2004         }
2005 #ifdef _WIN32
2006         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2007 #endif
2008     } else {
2009         /* For non-MTTCG cases we share the thread */
2010         cpu->thread = single_tcg_cpu_thread;
2011         cpu->halt_cond = single_tcg_halt_cond;
2012         cpu->thread_id = first_cpu->thread_id;
2013         cpu->can_do_io = 1;
2014         cpu->created = true;
2015     }
2016 }
2017
2018 static void qemu_hax_start_vcpu(CPUState *cpu)
2019 {
2020     char thread_name[VCPU_THREAD_NAME_SIZE];
2021
2022     cpu->thread = g_malloc0(sizeof(QemuThread));
2023     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2024     qemu_cond_init(cpu->halt_cond);
2025
2026     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2027              cpu->cpu_index);
2028     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2029                        cpu, QEMU_THREAD_JOINABLE);
2030 #ifdef _WIN32
2031     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2032 #endif
2033 }
2034
2035 static void qemu_kvm_start_vcpu(CPUState *cpu)
2036 {
2037     char thread_name[VCPU_THREAD_NAME_SIZE];
2038
2039     cpu->thread = g_malloc0(sizeof(QemuThread));
2040     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2041     qemu_cond_init(cpu->halt_cond);
2042     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2043              cpu->cpu_index);
2044     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2045                        cpu, QEMU_THREAD_JOINABLE);
2046 }
2047
2048 static void qemu_hvf_start_vcpu(CPUState *cpu)
2049 {
2050     char thread_name[VCPU_THREAD_NAME_SIZE];
2051
2052     /* HVF currently does not support TCG, and only runs in
2053      * unrestricted-guest mode. */
2054     assert(hvf_enabled());
2055
2056     cpu->thread = g_malloc0(sizeof(QemuThread));
2057     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2058     qemu_cond_init(cpu->halt_cond);
2059
2060     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2061              cpu->cpu_index);
2062     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2063                        cpu, QEMU_THREAD_JOINABLE);
2064 }
2065
2066 static void qemu_whpx_start_vcpu(CPUState *cpu)
2067 {
2068     char thread_name[VCPU_THREAD_NAME_SIZE];
2069
2070     cpu->thread = g_malloc0(sizeof(QemuThread));
2071     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2072     qemu_cond_init(cpu->halt_cond);
2073     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2074              cpu->cpu_index);
2075     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2076                        cpu, QEMU_THREAD_JOINABLE);
2077 #ifdef _WIN32
2078     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2079 #endif
2080 }
2081
2082 static void qemu_dummy_start_vcpu(CPUState *cpu)
2083 {
2084     char thread_name[VCPU_THREAD_NAME_SIZE];
2085
2086     cpu->thread = g_malloc0(sizeof(QemuThread));
2087     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2088     qemu_cond_init(cpu->halt_cond);
2089     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2090              cpu->cpu_index);
2091     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2092                        QEMU_THREAD_JOINABLE);
2093 }
2094
2095 void qemu_init_vcpu(CPUState *cpu)
2096 {
2097     MachineState *ms = MACHINE(qdev_get_machine());
2098
2099     cpu->nr_cores = ms->smp.cores;
2100     cpu->nr_threads =  ms->smp.threads;
2101     cpu->stopped = true;
2102     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2103
2104     if (!cpu->as) {
2105         /* If the target cpu hasn't set up any address spaces itself,
2106          * give it the default one.
2107          */
2108         cpu->num_ases = 1;
2109         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2110     }
2111
2112     if (kvm_enabled()) {
2113         qemu_kvm_start_vcpu(cpu);
2114     } else if (hax_enabled()) {
2115         qemu_hax_start_vcpu(cpu);
2116     } else if (hvf_enabled()) {
2117         qemu_hvf_start_vcpu(cpu);
2118     } else if (tcg_enabled()) {
2119         qemu_tcg_init_vcpu(cpu);
2120     } else if (whpx_enabled()) {
2121         qemu_whpx_start_vcpu(cpu);
2122     } else {
2123         qemu_dummy_start_vcpu(cpu);
2124     }
2125
2126     while (!cpu->created) {
2127         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2128     }
2129 }
2130
2131 void cpu_stop_current(void)
2132 {
2133     if (current_cpu) {
2134         current_cpu->stop = true;
2135         cpu_exit(current_cpu);
2136     }
2137 }
2138
2139 int vm_stop(RunState state)
2140 {
2141     if (qemu_in_vcpu_thread()) {
2142         qemu_system_vmstop_request_prepare();
2143         qemu_system_vmstop_request(state);
2144         /*
2145          * FIXME: should not return to device code in case
2146          * vm_stop() has been requested.
2147          */
2148         cpu_stop_current();
2149         return 0;
2150     }
2151
2152     return do_vm_stop(state, true);
2153 }
2154
2155 /**
2156  * Prepare for (re)starting the VM.
2157  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2158  * running or in case of an error condition), 0 otherwise.
2159  */
2160 int vm_prepare_start(void)
2161 {
2162     RunState requested;
2163
2164     qemu_vmstop_requested(&requested);
2165     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2166         return -1;
2167     }
2168
2169     /* Ensure that a STOP/RESUME pair of events is emitted if a
2170      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2171      * example, according to documentation is always followed by
2172      * the STOP event.
2173      */
2174     if (runstate_is_running()) {
2175         qapi_event_send_stop();
2176         qapi_event_send_resume();
2177         return -1;
2178     }
2179
2180     /* We are sending this now, but the CPUs will be resumed shortly later */
2181     qapi_event_send_resume();
2182
2183     cpu_enable_ticks();
2184     runstate_set(RUN_STATE_RUNNING);
2185     vm_state_notify(1, RUN_STATE_RUNNING);
2186     return 0;
2187 }
2188
/*
 * Start the VM: resume all vCPUs, but only when vm_prepare_start()
 * returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2195
2196 /* does a state transition even if the VM is already stopped,
2197    current state is forgotten forever */
2198 int vm_stop_force_state(RunState state)
2199 {
2200     if (runstate_is_running()) {
2201         return vm_stop(state);
2202     } else {
2203         runstate_set(state);
2204
2205         bdrv_drain_all();
2206         /* Make sure to return an error if the flush in a previous vm_stop()
2207          * failed. */
2208         return bdrv_flush_all();
2209     }
2210 }
2211
/*
 * Print the list of CPU models through the target's cpu_list macro, when
 * the target defines one.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2219
2220 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2221                  bool has_cpu, int64_t cpu_index, Error **errp)
2222 {
2223     FILE *f;
2224     uint32_t l;
2225     CPUState *cpu;
2226     uint8_t buf[1024];
2227     int64_t orig_addr = addr, orig_size = size;
2228
2229     if (!has_cpu) {
2230         cpu_index = 0;
2231     }
2232
2233     cpu = qemu_get_cpu(cpu_index);
2234     if (cpu == NULL) {
2235         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2236                    "a CPU number");
2237         return;
2238     }
2239
2240     f = fopen(filename, "wb");
2241     if (!f) {
2242         error_setg_file_open(errp, errno, filename);
2243         return;
2244     }
2245
2246     while (size != 0) {
2247         l = sizeof(buf);
2248         if (l > size)
2249             l = size;
2250         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2251             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2252                              " specified", orig_addr, orig_size);
2253             goto exit;
2254         }
2255         if (fwrite(buf, 1, l, f) != l) {
2256             error_setg(errp, QERR_IO_ERROR);
2257             goto exit;
2258         }
2259         addr += l;
2260         size -= l;
2261     }
2262
2263 exit:
2264     fclose(f);
2265 }
2266
2267 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2268                   Error **errp)
2269 {
2270     FILE *f;
2271     uint32_t l;
2272     uint8_t buf[1024];
2273
2274     f = fopen(filename, "wb");
2275     if (!f) {
2276         error_setg_file_open(errp, errno, filename);
2277         return;
2278     }
2279
2280     while (size != 0) {
2281         l = sizeof(buf);
2282         if (l > size)
2283             l = size;
2284         cpu_physical_memory_read(addr, buf, l);
2285         if (fwrite(buf, 1, l, f) != l) {
2286             error_setg(errp, QERR_IO_ERROR);
2287             goto exit;
2288         }
2289         addr += l;
2290         size -= l;
2291     }
2292
2293 exit:
2294     fclose(f);
2295 }
2296
2297 void qmp_inject_nmi(Error **errp)
2298 {
2299     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2300 }
2301
2302 void dump_drift_info(void)
2303 {
2304     if (!use_icount) {
2305         return;
2306     }
2307
2308     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2309                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2310     if (icount_align_option) {
2311         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2312                     -max_delay / SCALE_MS);
2313         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2314                     max_advance / SCALE_MS);
2315     } else {
2316         qemu_printf("Max guest delay     NA\n");
2317         qemu_printf("Max guest advance   NA\n");
2318     }
2319 }