migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58
59 /***********************************************************/
60 /* ram save/restore */
61
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66 */
67
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
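/*
 * A minimal sketch of how these flags travel on the wire: the offsets
 * written by save_page_header() below are TARGET_PAGE_SIZE aligned, so
 * the low bits are free to carry the RAM_SAVE_FLAG_* values.  A reader
 * can split them back apart roughly like this (illustration only):
 *
 *     uint64_t header = qemu_get_be64(f);
 *     uint64_t flags  = header & ~TARGET_PAGE_MASK;
 *     ram_addr_t addr = header & TARGET_PAGE_MASK;
 *
 *     if (flags & RAM_SAVE_FLAG_ZERO) {
 *         ... the payload is a single fill byte ...
 *     }
 */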
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80 return buffer_is_zero(p, size);
81 }
82
83 XBZRLECacheStats xbzrle_counters;
84
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
100
101 static void XBZRLE_cache_lock(void)
102 {
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105 }
106
107 static void XBZRLE_cache_unlock(void)
108 {
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111 }
112
113 /**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
121 * Returns 0 for success or -1 for error
122 *
123 * @new_size: new cache size
124 * @errp: set *errp with the reason if the check failed
125 */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128 PageCache *new_cache;
129 int64_t ret = 0;
130
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
141 }
142
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
158 }
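/*
 * A minimal usage sketch for xbzrle_cache_resize() (the size below is an
 * arbitrary example value):
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(512 * 1024 * 1024, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */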
159
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162 return !qemu_ram_is_migratable(block) ||
163 (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
168 INTERNAL_RAMBLOCK_FOREACH(block) \
169 if (ramblock_is_ignored(block)) {} else
170
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
172 INTERNAL_RAMBLOCK_FOREACH(block) \
173 if (!qemu_ram_is_migratable(block)) {} else
174
175 #undef RAMBLOCK_FOREACH
176
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179 RAMBlock *block;
180 int ret = 0;
181
182 RCU_READ_LOCK_GUARD();
183
184 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185 ret = func(block, opaque);
186 if (ret) {
187 break;
188 }
189 }
190 return ret;
191 }
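/*
 * A minimal sketch of a callback for foreach_not_ignored_block(); the
 * helper name and the use of the opaque pointer are made up for the
 * example (returning non-zero from the callback stops the iteration):
 *
 *     static int sum_block_size(RAMBlock *rb, void *opaque)
 *     {
 *         uint64_t *total = opaque;
 *
 *         *total += rb->used_length;
 *         return 0;
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(sum_block_size, &total);
 */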
192
193 static void ramblock_recv_map_init(void)
194 {
195 RAMBlock *rb;
196
197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198 assert(!rb->receivedmap);
199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 }
201 }
202
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206 rb->receivedmap);
207 }
208
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220 size_t nr)
221 {
222 bitmap_set_atomic(rb->receivedmap,
223 ramblock_recv_bitmap_offset(host_addr, rb),
224 nr);
225 }
226
227 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
228
229 /*
230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 *
232 * Returns >0 if success with sent bytes, or <0 if error.
233 */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235 const char *block_name)
236 {
237 RAMBlock *block = qemu_ram_block_by_name(block_name);
238 unsigned long *le_bitmap, nbits;
239 uint64_t size;
240
241 if (!block) {
242 error_report("%s: invalid block name: %s", __func__, block_name);
243 return -1;
244 }
245
246 nbits = block->used_length >> TARGET_PAGE_BITS;
247
248 /*
249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250 * machines we may need 4 more bytes for padding (see below
251 * comment). So extend it a bit beforehand.
252 */
253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254
255 /*
256 * Always use little endian when sending the bitmap, so that it is
257 * interpreted correctly even when the source and destination VMs do
258 * not use the same endianness (big endian on the wire would not work).
259 */
260 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261
262 /* Size of the bitmap, in bytes */
263 size = DIV_ROUND_UP(nbits, 8);
264
265 /*
266 * size is always aligned to 8 bytes for 64bit machines, but it
267 * may not be true for 32bit machines. We need this padding to
268 * make sure the migration can survive even between 32bit and
269 * 64bit machines.
270 */
271 size = ROUND_UP(size, 8);
272
273 qemu_put_be64(file, size);
274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 /*
276 * Mark as an end, in case the middle part is screwed up due to
277 * some "misterious" reason.
278 */
279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280 qemu_fflush(file);
281
282 g_free(le_bitmap);
283
284 if (qemu_file_get_error(file)) {
285 return qemu_file_get_error(file);
286 }
287
288 return size + sizeof(size);
289 }
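/*
 * For reference, what the code above puts on the wire for one block
 * (all sizes in bytes; this is a description of the code, not a formal
 * format definition):
 *
 *     be64 size | le_bitmap (size bytes, rounded up to 8) | be64 ENDING
 *
 * The returned byte count covers the leading size field plus the bitmap,
 * but not the trailing RAMBLOCK_RECV_BITMAP_ENDING marker.
 */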
290
291 /*
292 * An outstanding page request, on the source, having been received
293 * and queued
294 */
295 struct RAMSrcPageRequest {
296 RAMBlock *rb;
297 hwaddr offset;
298 hwaddr len;
299
300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302
303 /* State of RAM for migration */
304 struct RAMState {
305 /* QEMUFile used for this migration */
306 QEMUFile *f;
307 /* Last block that we have visited searching for dirty pages */
308 RAMBlock *last_seen_block;
309 /* Last block from where we have sent data */
310 RAMBlock *last_sent_block;
311 /* Last dirty target page we have sent */
312 ram_addr_t last_page;
313 /* last ram version we have seen */
314 uint32_t last_version;
315 /* We are in the first round */
316 bool ram_bulk_stage;
317 /* The free page optimization is enabled */
318 bool fpo_enabled;
319 /* How many times we have dirtied too many pages */
320 int dirty_rate_high_cnt;
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
324 /* bytes transferred at start_time */
325 uint64_t bytes_xfer_prev;
326 /* number of dirty pages since start_time */
327 uint64_t num_dirty_pages_period;
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
330
331 /* compression statistics since the beginning of the period */
332 /* number of times there was no free thread to compress data */
333 uint64_t compress_thread_busy_prev;
334 /* number of bytes after compression */
335 uint64_t compressed_size_prev;
336 /* number of compressed pages */
337 uint64_t compress_pages_prev;
338
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
343 /* number of dirty bits in the bitmap */
344 uint64_t migration_dirty_pages;
345 /* Protects modification of the bitmap and migration dirty pages */
346 QemuMutex bitmap_mutex;
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354
355 static RAMState *ram_state;
356
357 static NotifierWithReturnList precopy_notifier_list;
358
359 void precopy_infrastructure_init(void)
360 {
361 notifier_with_return_list_init(&precopy_notifier_list);
362 }
363
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366 notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371 notifier_with_return_remove(n);
372 }
373
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376 PrecopyNotifyData pnd;
377 pnd.reason = reason;
378 pnd.errp = errp;
379
380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
382
383 void precopy_enable_free_page_optimization(void)
384 {
385 if (!ram_state) {
386 return;
387 }
388
389 ram_state->fpo_enabled = true;
390 }
391
392 uint64_t ram_bytes_remaining(void)
393 {
394 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395 0;
396 }
397
398 MigrationStats ram_counters;
399
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402 /* Current block being searched */
403 RAMBlock *block;
404 /* Current page to search from */
405 unsigned long page;
406 /* Set once we wrap around */
407 bool complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410
411 CompressionStats compression_counters;
412
413 struct CompressParam {
414 bool done;
415 bool quit;
416 bool zero_page;
417 QEMUFile *file;
418 QemuMutex mutex;
419 QemuCond cond;
420 RAMBlock *block;
421 ram_addr_t offset;
422
423 /* internally used fields */
424 z_stream stream;
425 uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428
429 struct DecompressParam {
430 bool done;
431 bool quit;
432 QemuMutex mutex;
433 QemuCond cond;
434 void *des;
435 uint8_t *compbuf;
436 int len;
437 z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444 * one of the compression threads has finished the compression.
445 * comp_done_lock is used together with comp_done_cond.
446 */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459 ram_addr_t offset, uint8_t *source_buf);
460
461 static void *do_data_compress(void *opaque)
462 {
463 CompressParam *param = opaque;
464 RAMBlock *block;
465 ram_addr_t offset;
466 bool zero_page;
467
468 qemu_mutex_lock(&param->mutex);
469 while (!param->quit) {
470 if (param->block) {
471 block = param->block;
472 offset = param->offset;
473 param->block = NULL;
474 qemu_mutex_unlock(&param->mutex);
475
476 zero_page = do_compress_ram_page(param->file, &param->stream,
477 block, offset, param->originbuf);
478
479 qemu_mutex_lock(&comp_done_lock);
480 param->done = true;
481 param->zero_page = zero_page;
482 qemu_cond_signal(&comp_done_cond);
483 qemu_mutex_unlock(&comp_done_lock);
484
485 qemu_mutex_lock(&param->mutex);
486 } else {
487 qemu_cond_wait(&param->cond, &param->mutex);
488 }
489 }
490 qemu_mutex_unlock(&param->mutex);
491
492 return NULL;
493 }
494
495 static void compress_threads_save_cleanup(void)
496 {
497 int i, thread_count;
498
499 if (!migrate_use_compression() || !comp_param) {
500 return;
501 }
502
503 thread_count = migrate_compress_threads();
504 for (i = 0; i < thread_count; i++) {
505 /*
506 * we use it as an indicator of whether the thread has been
507 * properly initialized
508 */
509 if (!comp_param[i].file) {
510 break;
511 }
512
513 qemu_mutex_lock(&comp_param[i].mutex);
514 comp_param[i].quit = true;
515 qemu_cond_signal(&comp_param[i].cond);
516 qemu_mutex_unlock(&comp_param[i].mutex);
517
518 qemu_thread_join(compress_threads + i);
519 qemu_mutex_destroy(&comp_param[i].mutex);
520 qemu_cond_destroy(&comp_param[i].cond);
521 deflateEnd(&comp_param[i].stream);
522 g_free(comp_param[i].originbuf);
523 qemu_fclose(comp_param[i].file);
524 comp_param[i].file = NULL;
525 }
526 qemu_mutex_destroy(&comp_done_lock);
527 qemu_cond_destroy(&comp_done_cond);
528 g_free(compress_threads);
529 g_free(comp_param);
530 compress_threads = NULL;
531 comp_param = NULL;
532 }
533
534 static int compress_threads_save_setup(void)
535 {
536 int i, thread_count;
537
538 if (!migrate_use_compression()) {
539 return 0;
540 }
541 thread_count = migrate_compress_threads();
542 compress_threads = g_new0(QemuThread, thread_count);
543 comp_param = g_new0(CompressParam, thread_count);
544 qemu_cond_init(&comp_done_cond);
545 qemu_mutex_init(&comp_done_lock);
546 for (i = 0; i < thread_count; i++) {
547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548 if (!comp_param[i].originbuf) {
549 goto exit;
550 }
551
552 if (deflateInit(&comp_param[i].stream,
553 migrate_compress_level()) != Z_OK) {
554 g_free(comp_param[i].originbuf);
555 goto exit;
556 }
557
558 /* comp_param[i].file is just used as a dummy buffer to save data,
559 * set its ops to empty.
560 */
561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562 comp_param[i].done = true;
563 comp_param[i].quit = false;
564 qemu_mutex_init(&comp_param[i].mutex);
565 qemu_cond_init(&comp_param[i].cond);
566 qemu_thread_create(compress_threads + i, "compress",
567 do_data_compress, comp_param + i,
568 QEMU_THREAD_JOINABLE);
569 }
570 return 0;
571
572 exit:
573 compress_threads_save_cleanup();
574 return -1;
575 }
576
577 /**
578 * save_page_header: write page header to wire
579 *
580 * If this is the 1st block, it also writes the block identification
581 *
582 * Returns the number of bytes written
583 *
584 * @f: QEMUFile where to send the data
585 * @block: block that contains the page we want to send
586 * @offset: offset inside the block for the page
587 * in the lower bits, it contains flags
588 */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
590 ram_addr_t offset)
591 {
592 size_t size, len;
593
594 if (block == rs->last_sent_block) {
595 offset |= RAM_SAVE_FLAG_CONTINUE;
596 }
597 qemu_put_be64(f, offset);
598 size = 8;
599
600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601 len = strlen(block->idstr);
602 qemu_put_byte(f, len);
603 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604 size += 1 + len;
605 rs->last_sent_block = block;
606 }
607 return size;
608 }
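/*
 * A rough picture of the header written above, for orientation:
 *
 *     be64: page offset within the block, ORed with RAM_SAVE_FLAG_* bits
 *     u8 + idstr: block name length and bytes, only present when
 *                 RAM_SAVE_FLAG_CONTINUE is clear (i.e. the block changed
 *                 since the last page sent)
 */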
609
610 /**
611 * mig_throttle_guest_down: throttle down the guest
612 *
613 * Reduce amount of guest cpu execution to hopefully slow down memory
614 * writes. If guest dirty memory rate is reduced below the rate at
615 * which we can transfer pages to the destination then we should be
616 * able to complete migration. Some workloads dirty memory way too
617 * fast and will not effectively converge, even with auto-converge.
618 */
619 static void mig_throttle_guest_down(void)
620 {
621 MigrationState *s = migrate_get_current();
622 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
624 int pct_max = s->parameters.max_cpu_throttle;
625
626 /* We have not started throttling yet. Let's start it. */
627 if (!cpu_throttle_active()) {
628 cpu_throttle_set(pct_initial);
629 } else {
630 /* Throttling already on, just increase the rate */
631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
632 pct_max));
633 }
634 }
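/*
 * Worked example (assuming the usual defaults of cpu_throttle_initial=20,
 * cpu_throttle_increment=10 and max_cpu_throttle=99): successive calls
 * set the CPU throttle to 20%, then 30%, 40%, ... and clamp at 99%.
 */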
635
636 /**
637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638 *
639 * @rs: current RAM state
640 * @current_addr: address for the zero page
641 *
642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
643 * The important thing is that a stale (not-yet-0'd) page be replaced
644 * by the new data.
645 * As a bonus, if the page wasn't in the cache it gets added so that
646 * when a small write is made into the 0'd page it gets XBZRLE sent.
647 */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651 return;
652 }
653
654 /* We don't care if this fails to allocate a new cache page
655 * as long as it updated an old one */
656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657 ram_counters.dirty_sync_count);
658 }
659
660 #define ENCODING_FLAG_XBZRLE 0x1
661
662 /**
663 * save_xbzrle_page: compress and send current page
664 *
665 * Returns: 1 means that we wrote the page
666 * 0 means that page is identical to the one already sent
667 * -1 means that xbzrle would be longer than normal
668 *
669 * @rs: current RAM state
670 * @current_data: pointer to the address of the page contents
671 * @current_addr: addr of the page
672 * @block: block that contains the page we want to send
673 * @offset: offset inside the block for the page
674 * @last_stage: if we are at the completion stage
675 */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677 ram_addr_t current_addr, RAMBlock *block,
678 ram_addr_t offset, bool last_stage)
679 {
680 int encoded_len = 0, bytes_xbzrle;
681 uint8_t *prev_cached_page;
682
683 if (!cache_is_cached(XBZRLE.cache, current_addr,
684 ram_counters.dirty_sync_count)) {
685 xbzrle_counters.cache_miss++;
686 if (!last_stage) {
687 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688 ram_counters.dirty_sync_count) == -1) {
689 return -1;
690 } else {
691 /* update *current_data when the page has been
692 inserted into cache */
693 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694 }
695 }
696 return -1;
697 }
698
699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700
701 /* save current buffer into memory */
702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703
704 /* XBZRLE encoding (if there is no overflow) */
705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707 TARGET_PAGE_SIZE);
708
709 /*
710 * Update the cache contents, so that it corresponds to the data
711 * sent, in all cases except where we skip the page.
712 */
713 if (!last_stage && encoded_len != 0) {
714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715 /*
716 * In the case where we couldn't compress, ensure that the caller
717 * sends the data from the cache, since the guest might have
718 * changed the RAM since we copied it.
719 */
720 *current_data = prev_cached_page;
721 }
722
723 if (encoded_len == 0) {
724 trace_save_xbzrle_page_skipping();
725 return 0;
726 } else if (encoded_len == -1) {
727 trace_save_xbzrle_page_overflow();
728 xbzrle_counters.overflow++;
729 return -1;
730 }
731
732 /* Send XBZRLE based compressed page */
733 bytes_xbzrle = save_page_header(rs, rs->f, block,
734 offset | RAM_SAVE_FLAG_XBZRLE);
735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736 qemu_put_be16(rs->f, encoded_len);
737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738 bytes_xbzrle += encoded_len + 1 + 2;
739 xbzrle_counters.pages++;
740 xbzrle_counters.bytes += bytes_xbzrle;
741 ram_counters.transferred += bytes_xbzrle;
742
743 return 1;
744 }
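/*
 * For orientation, the record produced above is:
 *
 *     page header with RAM_SAVE_FLAG_XBZRLE set
 *     u8:   ENCODING_FLAG_XBZRLE
 *     be16: encoded_len
 *     encoded_len bytes of XBZRLE delta against the cached copy
 *
 * which is where the "encoded_len + 1 + 2" accounting above comes from.
 */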
745
746 /**
747 * migration_bitmap_find_dirty: find the next dirty page from start
748 *
749 * Returns the page offset within memory region of the start of a dirty page
750 *
751 * @rs: current RAM state
752 * @rb: RAMBlock where to search for dirty pages
753 * @start: page where we start the search
754 */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757 unsigned long start)
758 {
759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760 unsigned long *bitmap = rb->bmap;
761 unsigned long next;
762
763 if (ramblock_is_ignored(rb)) {
764 return size;
765 }
766
767 /*
768 * When the free page optimization is enabled, we need to check the bitmap
769 * to send the non-free pages rather than all the pages in the bulk stage.
770 */
771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772 next = start + 1;
773 } else {
774 next = find_next_bit(bitmap, size, start);
775 }
776
777 return next;
778 }
779
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781 RAMBlock *rb,
782 unsigned long page)
783 {
784 bool ret;
785
786 qemu_mutex_lock(&rs->bitmap_mutex);
787
788 /*
789 * Clear dirty bitmap if needed. This _must_ be called before we
790 * send any of the pages in the chunk, because we need to make sure
791 * we can capture further page content changes when we sync the dirty
792 * log the next time. So as long as we are going to send any of
793 * the pages in the chunk, we clear the remote dirty bitmap for all.
794 * Clearing it earlier won't be a problem, but clearing it too late will.
795 */
796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797 uint8_t shift = rb->clear_bmap_shift;
798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800
801 /*
802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803 * can make things easier sometimes since then the start address
804 * of the small chunk will always be aligned to 64 pages, so the
805 * bitmap will always be aligned to unsigned long. We should
806 * even be able to remove this restriction but I'm simply
807 * keeping it.
808 */
809 assert(shift >= 6);
810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811 memory_region_clear_dirty_bitmap(rb->mr, start, size);
812 }
813
814 ret = test_and_clear_bit(page, rb->bmap);
815
816 if (ret) {
817 rs->migration_dirty_pages--;
818 }
819 qemu_mutex_unlock(&rs->bitmap_mutex);
820
821 return ret;
822 }
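/*
 * Worked example (assuming a 4 KiB target page and a clear_bmap_shift of
 * 18, the usual default): one clear_bmap bit covers
 * 1 << (12 + 18) = 1 GiB of guest RAM, so the remote (e.g. KVM) dirty
 * bitmap for that whole 1 GiB chunk is cleared the first time any page
 * in it is about to be sent.
 */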
823
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827 rs->migration_dirty_pages +=
828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829 &rs->num_dirty_pages_period);
830 }
831
832 /**
833 * ram_pagesize_summary: calculate all the pagesizes of a VM
834 *
835 * Returns a summary bitmap of the page sizes of all RAMBlocks
836 *
837 * For VMs with just normal pages this is equivalent to the host page
838 * size. If it's got some huge pages then it's the OR of all the
839 * different page sizes.
840 */
841 uint64_t ram_pagesize_summary(void)
842 {
843 RAMBlock *block;
844 uint64_t summary = 0;
845
846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847 summary |= block->page_size;
848 }
849
850 return summary;
851 }
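/*
 * Worked example (assumed page sizes): a guest with ordinary 4 KiB blocks
 * plus one 2 MiB hugepage-backed block yields
 *
 *     summary = 0x1000 | 0x200000 = 0x201000
 *
 * i.e. one bit per distinct page size in use.
 */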
852
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855 return ram_counters.normal + ram_counters.duplicate +
856 compression_counters.pages + xbzrle_counters.pages;
857 }
858
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862 double compressed_size;
863
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867
868 if (!page_count) {
869 return;
870 }
871
872 if (migrate_use_xbzrle()) {
873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874 rs->xbzrle_cache_miss_prev) / page_count;
875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876 }
877
878 if (migrate_use_compression()) {
879 compression_counters.busy_rate = (double)(compression_counters.busy -
880 rs->compress_thread_busy_prev) / page_count;
881 rs->compress_thread_busy_prev = compression_counters.busy;
882
883 compressed_size = compression_counters.compressed_size -
884 rs->compressed_size_prev;
885 if (compressed_size) {
886 double uncompressed_size = (compression_counters.pages -
887 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888
889 /* Compression-Ratio = Uncompressed-size / Compressed-size */
890 compression_counters.compression_rate =
891 uncompressed_size / compressed_size;
892
893 rs->compress_pages_prev = compression_counters.pages;
894 rs->compressed_size_prev = compression_counters.compressed_size;
895 }
896 }
897 }
898
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901 MigrationState *s = migrate_get_current();
902 uint64_t threshold = s->parameters.throttle_trigger_threshold;
903
904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907
908 /* During block migration the auto-converge logic incorrectly detects
909 * that ram migration makes no progress. Avoid this by disabling the
910 * throttling logic during the bulk phase of block migration. */
911 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912 /* The following detection logic can be refined later. For now:
913 Check to see if the ratio between dirtied bytes and the approx.
914 amount of bytes that just got transferred since the last time
915 we were in this routine reaches the threshold. If that happens
916 twice, start or increase throttling. */
917
918 if ((bytes_dirty_period > bytes_dirty_threshold) &&
919 (++rs->dirty_rate_high_cnt >= 2)) {
920 trace_migration_throttle();
921 rs->dirty_rate_high_cnt = 0;
922 mig_throttle_guest_down();
923 }
924 }
925 }
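/*
 * Worked example (made-up numbers): with throttle_trigger_threshold=50,
 * if 300 MB were transferred during the last sync period then
 * bytes_dirty_threshold is 150 MB; dirtying more than that in a period
 * twice (the counter is only reset once throttling kicks in) calls
 * mig_throttle_guest_down().
 */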
926
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929 RAMBlock *block;
930 int64_t end_time;
931
932 ram_counters.dirty_sync_count++;
933
934 if (!rs->time_last_bitmap_sync) {
935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936 }
937
938 trace_migration_bitmap_sync_start();
939 memory_global_dirty_log_sync();
940
941 qemu_mutex_lock(&rs->bitmap_mutex);
942 WITH_RCU_READ_LOCK_GUARD() {
943 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944 ramblock_sync_dirty_bitmap(rs, block);
945 }
946 ram_counters.remaining = ram_bytes_remaining();
947 }
948 qemu_mutex_unlock(&rs->bitmap_mutex);
949
950 memory_global_after_dirty_log_sync();
951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952
953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954
955 /* more than 1 second = 1000 milliseconds */
956 if (end_time > rs->time_last_bitmap_sync + 1000) {
957 migration_trigger_throttle(rs);
958
959 migration_update_rates(rs, end_time);
960
961 rs->target_page_count_prev = rs->target_page_count;
962
963 /* reset period counters */
964 rs->time_last_bitmap_sync = end_time;
965 rs->num_dirty_pages_period = 0;
966 rs->bytes_xfer_prev = ram_counters.transferred;
967 }
968 if (migrate_use_events()) {
969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970 }
971 }
972
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975 Error *local_err = NULL;
976
977 /*
978 * The current notifier usage is just an optimization for migration, so we
979 * don't stop the normal migration process in the error case.
980 */
981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982 error_report_err(local_err);
983 local_err = NULL;
984 }
985
986 migration_bitmap_sync(rs);
987
988 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
989 error_report_err(local_err);
990 }
991 }
992
993 /**
994 * save_zero_page_to_file: send the zero page to the file
995 *
996 * Returns the size of data written to the file, 0 means the page is not
997 * a zero page
998 *
999 * @rs: current RAM state
1000 * @file: the file where the data is saved
1001 * @block: block that contains the page we want to send
1002 * @offset: offset inside the block for the page
1003 */
1004 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005 RAMBlock *block, ram_addr_t offset)
1006 {
1007 uint8_t *p = block->host + offset;
1008 int len = 0;
1009
1010 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012 qemu_put_byte(file, 0);
1013 len += 1;
1014 }
1015 return len;
1016 }
1017
1018 /**
1019 * save_zero_page: send the zero page to the stream
1020 *
1021 * Returns the number of pages written.
1022 *
1023 * @rs: current RAM state
1024 * @block: block that contains the page we want to send
1025 * @offset: offset inside the block for the page
1026 */
1027 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1028 {
1029 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1030
1031 if (len) {
1032 ram_counters.duplicate++;
1033 ram_counters.transferred += len;
1034 return 1;
1035 }
1036 return -1;
1037 }
1038
1039 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1040 {
1041 if (!migrate_release_ram() || !migration_in_postcopy()) {
1042 return;
1043 }
1044
1045 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1046 }
1047
1048 /*
1049 * @pages: the number of pages written by the control path,
1050 * < 0 - error
1051 * > 0 - number of pages written
1052 *
1053 * Return true if the page has been saved, otherwise false is returned.
1054 */
1055 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056 int *pages)
1057 {
1058 uint64_t bytes_xmit = 0;
1059 int ret;
1060
1061 *pages = -1;
1062 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063 &bytes_xmit);
1064 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065 return false;
1066 }
1067
1068 if (bytes_xmit) {
1069 ram_counters.transferred += bytes_xmit;
1070 *pages = 1;
1071 }
1072
1073 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074 return true;
1075 }
1076
1077 if (bytes_xmit > 0) {
1078 ram_counters.normal++;
1079 } else if (bytes_xmit == 0) {
1080 ram_counters.duplicate++;
1081 }
1082
1083 return true;
1084 }
1085
1086 /*
1087 * directly send the page to the stream
1088 *
1089 * Returns the number of pages written.
1090 *
1091 * @rs: current RAM state
1092 * @block: block that contains the page we want to send
1093 * @offset: offset inside the block for the page
1094 * @buf: the page to be sent
1095 * @async: send the page asynchronously
1096 */
1097 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098 uint8_t *buf, bool async)
1099 {
1100 ram_counters.transferred += save_page_header(rs, rs->f, block,
1101 offset | RAM_SAVE_FLAG_PAGE);
1102 if (async) {
1103 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1104 migrate_release_ram() &&
1105 migration_in_postcopy());
1106 } else {
1107 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108 }
1109 ram_counters.transferred += TARGET_PAGE_SIZE;
1110 ram_counters.normal++;
1111 return 1;
1112 }
1113
1114 /**
1115 * ram_save_page: send the given page to the stream
1116 *
1117 * Returns the number of pages written.
1118 * < 0 - error
1119 * >=0 - Number of pages written - this might legally be 0
1120 * if xbzrle noticed the page was the same.
1121 *
1122 * @rs: current RAM state
1123 * @block: block that contains the page we want to send
1124 * @offset: offset inside the block for the page
1125 * @last_stage: if we are at the completion stage
1126 */
1127 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1128 {
1129 int pages = -1;
1130 uint8_t *p;
1131 bool send_async = true;
1132 RAMBlock *block = pss->block;
1133 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1134 ram_addr_t current_addr = block->offset + offset;
1135
1136 p = block->host + offset;
1137 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1138
1139 XBZRLE_cache_lock();
1140 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141 migrate_use_xbzrle()) {
1142 pages = save_xbzrle_page(rs, &p, current_addr, block,
1143 offset, last_stage);
1144 if (!last_stage) {
1145 /* Can't send this cached data async, since the cache page
1146 * might get updated before it gets to the wire
1147 */
1148 send_async = false;
1149 }
1150 }
1151
1152 /* XBZRLE overflow or normal page */
1153 if (pages == -1) {
1154 pages = save_normal_page(rs, block, offset, p, send_async);
1155 }
1156
1157 XBZRLE_cache_unlock();
1158
1159 return pages;
1160 }
1161
1162 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163 ram_addr_t offset)
1164 {
1165 if (multifd_queue_page(rs->f, block, offset) < 0) {
1166 return -1;
1167 }
1168 ram_counters.normal++;
1169
1170 return 1;
1171 }
1172
1173 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1174 ram_addr_t offset, uint8_t *source_buf)
1175 {
1176 RAMState *rs = ram_state;
1177 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1178 bool zero_page = false;
1179 int ret;
1180
1181 if (save_zero_page_to_file(rs, f, block, offset)) {
1182 zero_page = true;
1183 goto exit;
1184 }
1185
1186 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1187
1188 /*
1189 * copy it to an internal buffer to avoid it being modified by the VM,
1190 * so that we can catch any error during compression and
1191 * decompression
1192 */
1193 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1194 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195 if (ret < 0) {
1196 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1197 error_report("compressed data failed!");
1198 return false;
1199 }
1200
1201 exit:
1202 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1203 return zero_page;
1204 }
1205
1206 static void
1207 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208 {
1209 ram_counters.transferred += bytes_xmit;
1210
1211 if (param->zero_page) {
1212 ram_counters.duplicate++;
1213 return;
1214 }
1215
1216 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1217 compression_counters.compressed_size += bytes_xmit - 8;
1218 compression_counters.pages++;
1219 }
1220
1221 static bool save_page_use_compression(RAMState *rs);
1222
1223 static void flush_compressed_data(RAMState *rs)
1224 {
1225 int idx, len, thread_count;
1226
1227 if (!save_page_use_compression(rs)) {
1228 return;
1229 }
1230 thread_count = migrate_compress_threads();
1231
1232 qemu_mutex_lock(&comp_done_lock);
1233 for (idx = 0; idx < thread_count; idx++) {
1234 while (!comp_param[idx].done) {
1235 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1236 }
1237 }
1238 qemu_mutex_unlock(&comp_done_lock);
1239
1240 for (idx = 0; idx < thread_count; idx++) {
1241 qemu_mutex_lock(&comp_param[idx].mutex);
1242 if (!comp_param[idx].quit) {
1243 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1244 /*
1245 * it's safe to fetch zero_page without holding comp_done_lock
1246 * as there is no further request submitted to the thread,
1247 * i.e., the thread should be waiting for a request at this point.
1248 */
1249 update_compress_thread_counts(&comp_param[idx], len);
1250 }
1251 qemu_mutex_unlock(&comp_param[idx].mutex);
1252 }
1253 }
1254
1255 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256 ram_addr_t offset)
1257 {
1258 param->block = block;
1259 param->offset = offset;
1260 }
1261
1262 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263 ram_addr_t offset)
1264 {
1265 int idx, thread_count, bytes_xmit = -1, pages = -1;
1266 bool wait = migrate_compress_wait_thread();
1267
1268 thread_count = migrate_compress_threads();
1269 qemu_mutex_lock(&comp_done_lock);
1270 retry:
1271 for (idx = 0; idx < thread_count; idx++) {
1272 if (comp_param[idx].done) {
1273 comp_param[idx].done = false;
1274 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275 qemu_mutex_lock(&comp_param[idx].mutex);
1276 set_compress_params(&comp_param[idx], block, offset);
1277 qemu_cond_signal(&comp_param[idx].cond);
1278 qemu_mutex_unlock(&comp_param[idx].mutex);
1279 pages = 1;
1280 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1281 break;
1282 }
1283 }
1284
1285 /*
1286 * wait for the free thread if the user specifies 'compress-wait-thread',
1287 * otherwise we will post the page out in the main thread as a normal page.
1288 */
1289 if (pages < 0 && wait) {
1290 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291 goto retry;
1292 }
1293 qemu_mutex_unlock(&comp_done_lock);
1294
1295 return pages;
1296 }
1297
1298 /**
1299 * find_dirty_block: find the next dirty page and update any state
1300 * associated with the search process.
1301 *
1302 * Returns true if a page is found
1303 *
1304 * @rs: current RAM state
1305 * @pss: data about the state of the current dirty page scan
1306 * @again: set to false if the search has scanned the whole of RAM
1307 */
1308 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1309 {
1310 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1311 if (pss->complete_round && pss->block == rs->last_seen_block &&
1312 pss->page >= rs->last_page) {
1313 /*
1314 * We've been once around the RAM and haven't found anything.
1315 * Give up.
1316 */
1317 *again = false;
1318 return false;
1319 }
1320 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321 >= pss->block->used_length) {
1322 /* Didn't find anything in this RAM Block */
1323 pss->page = 0;
1324 pss->block = QLIST_NEXT_RCU(pss->block, next);
1325 if (!pss->block) {
1326 /*
1327 * If memory migration starts over, we will meet a dirtied page
1328 * which may still exists in compression threads's ring, so we
1329 * should flush the compressed data to make sure the new page
1330 * is not overwritten by the old one in the destination.
1331 *
1332 * Also If xbzrle is on, stop using the data compression at this
1333 * point. In theory, xbzrle can do better than compression.
1334 */
1335 flush_compressed_data(rs);
1336
1337 /* Hit the end of the list */
1338 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339 /* Flag that we've looped */
1340 pss->complete_round = true;
1341 rs->ram_bulk_stage = false;
1342 }
1343 /* Didn't find anything this time, but try again on the new block */
1344 *again = true;
1345 return false;
1346 } else {
1347 /* Can go around again, but... */
1348 *again = true;
1349 /* We've found something so probably don't need to */
1350 return true;
1351 }
1352 }
1353
1354 /**
1355 * unqueue_page: gets a page off the queue
1356 *
1357 * Helper for 'get_queued_page' - gets a page off the queue
1358 *
1359 * Returns the block of the page (or NULL if none available)
1360 *
1361 * @rs: current RAM state
1362 * @offset: used to return the offset within the RAMBlock
1363 */
1364 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1365 {
1366 RAMBlock *block = NULL;
1367
1368 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369 return NULL;
1370 }
1371
1372 qemu_mutex_lock(&rs->src_page_req_mutex);
1373 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374 struct RAMSrcPageRequest *entry =
1375 QSIMPLEQ_FIRST(&rs->src_page_requests);
1376 block = entry->rb;
1377 *offset = entry->offset;
1378
1379 if (entry->len > TARGET_PAGE_SIZE) {
1380 entry->len -= TARGET_PAGE_SIZE;
1381 entry->offset += TARGET_PAGE_SIZE;
1382 } else {
1383 memory_region_unref(block->mr);
1384 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1385 g_free(entry);
1386 migration_consume_urgent_request();
1387 }
1388 }
1389 qemu_mutex_unlock(&rs->src_page_req_mutex);
1390
1391 return block;
1392 }
1393
1394 /**
1395 * get_queued_page: unqueue a page from the postcopy requests
1396 *
1397 * Skips pages that are already sent (!dirty)
1398 *
1399 * Returns true if a queued page is found
1400 *
1401 * @rs: current RAM state
1402 * @pss: data about the state of the current dirty page scan
1403 */
1404 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1405 {
1406 RAMBlock *block;
1407 ram_addr_t offset;
1408 bool dirty;
1409
1410 do {
1411 block = unqueue_page(rs, &offset);
1412 /*
1413 * We're sending this page, and since it's postcopy nothing else
1414 * will dirty it, and we must make sure it doesn't get sent again
1415 * even if this queue request was received after the background
1416 * search already sent it.
1417 */
1418 if (block) {
1419 unsigned long page;
1420
1421 page = offset >> TARGET_PAGE_BITS;
1422 dirty = test_bit(page, block->bmap);
1423 if (!dirty) {
1424 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1425 page);
1426 } else {
1427 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1428 }
1429 }
1430
1431 } while (block && !dirty);
1432
1433 if (block) {
1434 /*
1435 * As soon as we start servicing pages out of order, we have
1436 * to kill the bulk stage, since the bulk stage assumes
1437 * (in migration_bitmap_find_and_reset_dirty) that every page is
1438 * dirty, and that's no longer true.
1439 */
1440 rs->ram_bulk_stage = false;
1441
1442 /*
1443 * We want the background search to continue from the queued page
1444 * since the guest is likely to want other pages near to the page
1445 * it just requested.
1446 */
1447 pss->block = block;
1448 pss->page = offset >> TARGET_PAGE_BITS;
1449
1450 /*
1451 * This unqueued page would break the "one round" check, even if
1452 * it is really rare.
1453 */
1454 pss->complete_round = false;
1455 }
1456
1457 return !!block;
1458 }
1459
1460 /**
1461 * migration_page_queue_free: drop any remaining pages in the ram
1462 * request queue
1463 *
1464 * It should be empty at the end anyway, but in error cases there may
1465 * be some left. In case any page is left, we drop it.
1466 *
1467 */
1468 static void migration_page_queue_free(RAMState *rs)
1469 {
1470 struct RAMSrcPageRequest *mspr, *next_mspr;
1471 /* This queue generally should be empty - but in the case of a failed
1472 * migration might have some droppings in.
1473 */
1474 RCU_READ_LOCK_GUARD();
1475 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1476 memory_region_unref(mspr->rb->mr);
1477 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1478 g_free(mspr);
1479 }
1480 }
1481
1482 /**
1483 * ram_save_queue_pages: queue the page for transmission
1484 *
1485 * A request from postcopy destination for example.
1486 *
1487 * Returns zero on success or negative on error
1488 *
1489 * @rbname: Name of the RAMBlock of the request. NULL means the
1490 * same as the last one.
1491 * @start: starting address from the start of the RAMBlock
1492 * @len: length (in bytes) to send
1493 */
1494 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1495 {
1496 RAMBlock *ramblock;
1497 RAMState *rs = ram_state;
1498
1499 ram_counters.postcopy_requests++;
1500 RCU_READ_LOCK_GUARD();
1501
1502 if (!rbname) {
1503 /* Reuse last RAMBlock */
1504 ramblock = rs->last_req_rb;
1505
1506 if (!ramblock) {
1507 /*
1508 * Shouldn't happen, we can't reuse the last RAMBlock if
1509 * it's the 1st request.
1510 */
1511 error_report("ram_save_queue_pages no previous block");
1512 return -1;
1513 }
1514 } else {
1515 ramblock = qemu_ram_block_by_name(rbname);
1516
1517 if (!ramblock) {
1518 /* We shouldn't be asked for a non-existent RAMBlock */
1519 error_report("ram_save_queue_pages no block '%s'", rbname);
1520 return -1;
1521 }
1522 rs->last_req_rb = ramblock;
1523 }
1524 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1525 if (start+len > ramblock->used_length) {
1526 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1527 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1528 __func__, start, len, ramblock->used_length);
1529 return -1;
1530 }
1531
1532 struct RAMSrcPageRequest *new_entry =
1533 g_malloc0(sizeof(struct RAMSrcPageRequest));
1534 new_entry->rb = ramblock;
1535 new_entry->offset = start;
1536 new_entry->len = len;
1537
1538 memory_region_ref(ramblock->mr);
1539 qemu_mutex_lock(&rs->src_page_req_mutex);
1540 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1541 migration_make_urgent_request();
1542 qemu_mutex_unlock(&rs->src_page_req_mutex);
1543
1544 return 0;
1545 }
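/*
 * A minimal usage sketch (the block name and offset are made up); this is
 * roughly how the postcopy path asks the source to prioritise a page the
 * destination just faulted on:
 *
 *     if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
 *         ... the request was rejected, the caller must handle it ...
 *     }
 */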
1546
1547 static bool save_page_use_compression(RAMState *rs)
1548 {
1549 if (!migrate_use_compression()) {
1550 return false;
1551 }
1552
1553 /*
1554 * If xbzrle is on, stop using the data compression after the first
1555 * round of migration even if compression is enabled. In theory,
1556 * xbzrle can do better than compression.
1557 */
1558 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1559 return true;
1560 }
1561
1562 return false;
1563 }
1564
1565 /*
1566 * try to compress the page before posting it out; return true if the page
1567 * has been properly handled by compression, otherwise it needs other
1568 * paths to handle it
1569 */
1570 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1571 {
1572 if (!save_page_use_compression(rs)) {
1573 return false;
1574 }
1575
1576 /*
1577 * When starting the process of a new block, the first page of
1578 * the block should be sent out before other pages in the same
1579 * block, and all the pages in the last block should have been sent
1580 * out. Keeping this order is important, because the 'cont' flag
1581 * is used to avoid resending the block name.
1582 *
1583 * We post the first page as a normal page, as compression will take
1584 * much CPU resource.
1585 */
1586 if (block != rs->last_sent_block) {
1587 flush_compressed_data(rs);
1588 return false;
1589 }
1590
1591 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1592 return true;
1593 }
1594
1595 compression_counters.busy++;
1596 return false;
1597 }
1598
1599 /**
1600 * ram_save_target_page: save one target page
1601 *
1602 * Returns the number of pages written
1603 *
1604 * @rs: current RAM state
1605 * @pss: data about the page we want to send
1606 * @last_stage: if we are at the completion stage
1607 */
1608 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1609 bool last_stage)
1610 {
1611 RAMBlock *block = pss->block;
1612 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1613 int res;
1614
1615 if (control_save_page(rs, block, offset, &res)) {
1616 return res;
1617 }
1618
1619 if (save_compress_page(rs, block, offset)) {
1620 return 1;
1621 }
1622
1623 res = save_zero_page(rs, block, offset);
1624 if (res > 0) {
1625 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1626 * page would be stale
1627 */
1628 if (!save_page_use_compression(rs)) {
1629 XBZRLE_cache_lock();
1630 xbzrle_cache_zero_page(rs, block->offset + offset);
1631 XBZRLE_cache_unlock();
1632 }
1633 ram_release_pages(block->idstr, offset, res);
1634 return res;
1635 }
1636
1637 /*
1638 * Do not use multifd for:
1639 * 1. Compression as the first page in the new block should be posted out
1640 * before sending the compressed page
1641 * 2. In postcopy as one whole host page should be placed
1642 */
1643 if (!save_page_use_compression(rs) && migrate_use_multifd()
1644 && !migration_in_postcopy()) {
1645 return ram_save_multifd_page(rs, block, offset);
1646 }
1647
1648 return ram_save_page(rs, pss, last_stage);
1649 }
1650
1651 /**
1652 * ram_save_host_page: save a whole host page
1653 *
1654 * Starting at *offset send pages up to the end of the current host
1655 * page. It's valid for the initial offset to point into the middle of
1656 * a host page in which case the remainder of the hostpage is sent.
1657 * Only dirty target pages are sent. Note that the host page size may
1658 * be a huge page for this block.
1659 * The saving stops at the boundary of the used_length of the block
1660 * if the RAMBlock isn't a multiple of the host page size.
1661 *
1662 * Returns the number of pages written or negative on error
1663 *
1664 * @rs: current RAM state
1665 * @ms: current migration state
1666 * @pss: data about the page we want to send
1667 * @last_stage: if we are at the completion stage
1668 */
1669 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1670 bool last_stage)
1671 {
1672 int tmppages, pages = 0;
1673 size_t pagesize_bits =
1674 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1675
1676 if (ramblock_is_ignored(pss->block)) {
1677 error_report("block %s should not be migrated !", pss->block->idstr);
1678 return 0;
1679 }
1680
1681 do {
1682 /* Check if the page is dirty and if it is, send it */
1683 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1684 pss->page++;
1685 continue;
1686 }
1687
1688 tmppages = ram_save_target_page(rs, pss, last_stage);
1689 if (tmppages < 0) {
1690 return tmppages;
1691 }
1692
1693 pages += tmppages;
1694 pss->page++;
1695 /* Allow rate limiting to happen in the middle of huge pages */
1696 migration_rate_limit();
1697 } while ((pss->page & (pagesize_bits - 1)) &&
1698 offset_in_ramblock(pss->block,
1699 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1700
1701 /* The offset we leave with is the last one we looked at */
1702 pss->page--;
1703 return pages;
1704 }
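/*
 * Worked example (assumed sizes): with 4 KiB target pages and a 2 MiB
 * hugepage-backed block, pagesize_bits is 512, so one call can send up to
 * 512 dirty target pages before "pss->page & (pagesize_bits - 1)" wraps
 * to zero at the next host-page boundary.
 */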
1705
1706 /**
1707 * ram_find_and_save_block: finds a dirty page and sends it to f
1708 *
1709 * Called within an RCU critical section.
1710 *
1711 * Returns the number of pages written where zero means no dirty pages,
1712 * or negative on error
1713 *
1714 * @rs: current RAM state
1715 * @last_stage: if we are at the completion stage
1716 *
1717 * On systems where host-page-size > target-page-size it will send all the
1718 * pages in a host page that are dirty.
1719 */
1720
1721 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1722 {
1723 PageSearchStatus pss;
1724 int pages = 0;
1725 bool again, found;
1726
1727 /* No dirty page as there is zero RAM */
1728 if (!ram_bytes_total()) {
1729 return pages;
1730 }
1731
1732 pss.block = rs->last_seen_block;
1733 pss.page = rs->last_page;
1734 pss.complete_round = false;
1735
1736 if (!pss.block) {
1737 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1738 }
1739
1740 do {
1741 again = true;
1742 found = get_queued_page(rs, &pss);
1743
1744 if (!found) {
1745 /* priority queue empty, so just search for something dirty */
1746 found = find_dirty_block(rs, &pss, &again);
1747 }
1748
1749 if (found) {
1750 pages = ram_save_host_page(rs, &pss, last_stage);
1751 }
1752 } while (!pages && again);
1753
1754 rs->last_seen_block = pss.block;
1755 rs->last_page = pss.page;
1756
1757 return pages;
1758 }
1759
1760 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1761 {
1762 uint64_t pages = size / TARGET_PAGE_SIZE;
1763
1764 if (zero) {
1765 ram_counters.duplicate += pages;
1766 } else {
1767 ram_counters.normal += pages;
1768 ram_counters.transferred += size;
1769 qemu_update_position(f, size);
1770 }
1771 }
1772
1773 static uint64_t ram_bytes_total_common(bool count_ignored)
1774 {
1775 RAMBlock *block;
1776 uint64_t total = 0;
1777
1778 RCU_READ_LOCK_GUARD();
1779
1780 if (count_ignored) {
1781 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1782 total += block->used_length;
1783 }
1784 } else {
1785 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1786 total += block->used_length;
1787 }
1788 }
1789 return total;
1790 }
1791
1792 uint64_t ram_bytes_total(void)
1793 {
1794 return ram_bytes_total_common(false);
1795 }
1796
1797 static void xbzrle_load_setup(void)
1798 {
1799 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1800 }
1801
1802 static void xbzrle_load_cleanup(void)
1803 {
1804 g_free(XBZRLE.decoded_buf);
1805 XBZRLE.decoded_buf = NULL;
1806 }
1807
1808 static void ram_state_cleanup(RAMState **rsp)
1809 {
1810 if (*rsp) {
1811 migration_page_queue_free(*rsp);
1812 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1813 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1814 g_free(*rsp);
1815 *rsp = NULL;
1816 }
1817 }
1818
1819 static void xbzrle_cleanup(void)
1820 {
1821 XBZRLE_cache_lock();
1822 if (XBZRLE.cache) {
1823 cache_fini(XBZRLE.cache);
1824 g_free(XBZRLE.encoded_buf);
1825 g_free(XBZRLE.current_buf);
1826 g_free(XBZRLE.zero_target_page);
1827 XBZRLE.cache = NULL;
1828 XBZRLE.encoded_buf = NULL;
1829 XBZRLE.current_buf = NULL;
1830 XBZRLE.zero_target_page = NULL;
1831 }
1832 XBZRLE_cache_unlock();
1833 }
1834
1835 static void ram_save_cleanup(void *opaque)
1836 {
1837 RAMState **rsp = opaque;
1838 RAMBlock *block;
1839
1840 /* the caller has the iothread lock held or is in a bh, so there is
1841 * no writing race against the migration bitmap
1842 */
1843 memory_global_dirty_log_stop();
1844
1845 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1846 g_free(block->clear_bmap);
1847 block->clear_bmap = NULL;
1848 g_free(block->bmap);
1849 block->bmap = NULL;
1850 }
1851
1852 xbzrle_cleanup();
1853 compress_threads_save_cleanup();
1854 ram_state_cleanup(rsp);
1855 }
1856
1857 static void ram_state_reset(RAMState *rs)
1858 {
1859 rs->last_seen_block = NULL;
1860 rs->last_sent_block = NULL;
1861 rs->last_page = 0;
1862 rs->last_version = ram_list.version;
1863 rs->ram_bulk_stage = true;
1864 rs->fpo_enabled = false;
1865 }
1866
1867 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1868
1869 /*
1870 * 'expected' is the value you expect the bitmap mostly to be full
1871 * of; it won't bother printing lines that are all this value.
1872 * If 'todump' is null the migration bitmap is dumped.
1873 */
1874 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1875 unsigned long pages)
1876 {
1877 int64_t cur;
1878 int64_t linelen = 128;
1879 char linebuf[129];
1880
1881 for (cur = 0; cur < pages; cur += linelen) {
1882 int64_t curb;
1883 bool found = false;
1884 /*
1885 * Last line; catch the case where the line length
1886 * is longer than remaining ram
1887 */
1888 if (cur + linelen > pages) {
1889 linelen = pages - cur;
1890 }
1891 for (curb = 0; curb < linelen; curb++) {
1892 bool thisbit = test_bit(cur + curb, todump);
1893 linebuf[curb] = thisbit ? '1' : '.';
1894 found = found || (thisbit != expected);
1895 }
1896 if (found) {
1897 linebuf[curb] = '\0';
1898 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1899 }
1900 }
1901 }
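/*
 * Example of the output format (made-up data, line shortened here): only
 * lines containing at least one bit different from 'expected' are
 * printed; '1' is a set bit, '.' a clear one, and the prefix is the page
 * index of the first bit on the line:
 *
 *     0x00000080 : 1.........1111..............................
 */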
1902
1903 /* **** functions for postcopy ***** */
1904
1905 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1906 {
1907 struct RAMBlock *block;
1908
1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910 unsigned long *bitmap = block->bmap;
1911 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1912 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1913
1914 while (run_start < range) {
1915 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1916 ram_discard_range(block->idstr,
1917 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1918 ((ram_addr_t)(run_end - run_start))
1919 << TARGET_PAGE_BITS);
1920 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1921 }
1922 }
1923 }
1924
1925 /**
1926 * postcopy_send_discard_bm_ram: discard a RAMBlock
1927 *
1928 * Returns zero on success
1929 *
1930 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1931 *
1932 * @ms: current migration state
1933 * @block: RAMBlock to discard
1934 */
1935 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1936 {
1937 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1938 unsigned long current;
1939 unsigned long *bitmap = block->bmap;
1940
1941 for (current = 0; current < end; ) {
1942 unsigned long one = find_next_bit(bitmap, end, current);
1943 unsigned long zero, discard_length;
1944
1945 if (one >= end) {
1946 break;
1947 }
1948
1949 zero = find_next_zero_bit(bitmap, end, one + 1);
1950
1951 if (zero >= end) {
1952 discard_length = end - one;
1953 } else {
1954 discard_length = zero - one;
1955 }
1956 postcopy_discard_send_range(ms, one, discard_length);
1957 current = one + discard_length;
1958 }
1959
1960 return 0;
1961 }
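
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the walk
 * above: extracting (start, length) runs of set bits from a dirty bitmap.
 * The naive bit helper below is an illustrative stand-in for QEMU's
 * find_next_bit()/find_next_zero_bit().
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool test_bit_simple(const unsigned char *bm, unsigned long nr)
{
    return bm[nr / 8] & (1u << (nr % 8));
}

static void dump_dirty_runs(const unsigned char *bm, unsigned long nbits)
{
    unsigned long cur = 0;

    while (cur < nbits) {
        unsigned long one, zero;

        /* find the start of the next dirty run */
        for (one = cur; one < nbits && !test_bit_simple(bm, one); one++) {
        }
        if (one >= nbits) {
            break;
        }
        /* find where the dirty run ends */
        for (zero = one + 1; zero < nbits && test_bit_simple(bm, zero); zero++) {
        }
        printf("discard run: start=%lu length=%lu\n", one, zero - one);
        cur = zero;
    }
}

int main(void)
{
    /* bits 1..3 and 6 set: expect runs (1,3) and (6,1) */
    unsigned char bitmap[1] = { 0x4e };

    dump_dirty_runs(bitmap, 8);
    return 0;
}
#endif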
1962
1963 /**
1964 * postcopy_each_ram_send_discard: discard all RAMBlocks
1965 *
1966 * Returns 0 for success or negative for error
1967 *
1968 * Utility for the outgoing postcopy code.
1969 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1970 * passing it bitmap indexes and name.
1971 * (qemu_ram_foreach_block ends up passing unscaled lengths
1972 * which would mean postcopy code would have to deal with target page)
1973 *
1974 * @ms: current migration state
1975 */
1976 static int postcopy_each_ram_send_discard(MigrationState *ms)
1977 {
1978 struct RAMBlock *block;
1979 int ret;
1980
1981 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1982 postcopy_discard_send_init(ms, block->idstr);
1983
1984 /*
1985 * Postcopy sends chunks of bitmap over the wire, but it
1986 * just needs indexes at this point; this avoids it having
1987 * target-page-specific code.
1988 */
1989 ret = postcopy_send_discard_bm_ram(ms, block);
1990 postcopy_discard_send_finish(ms);
1991 if (ret) {
1992 return ret;
1993 }
1994 }
1995
1996 return 0;
1997 }
1998
1999 /**
2000 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2001 *
2002 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2003 * bitmap of a RAMBlock so that dirtiness is tracked in whole host
2004 * pages.
2005 *
2006 * Postcopy requires that all target pages in a hostpage are dirty or
2007 * clean, not a mix. This function canonicalizes the bitmaps.
2008 *
2009 * @ms: current migration state
2010 * @block: block that contains the page we want to canonicalize
2011 */
2012 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2013 {
2014 RAMState *rs = ram_state;
2015 unsigned long *bitmap = block->bmap;
2016 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2017 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2018 unsigned long run_start;
2019
2020 if (block->page_size == TARGET_PAGE_SIZE) {
2021 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2022 return;
2023 }
2024
2025 /* Find a dirty page */
2026 run_start = find_next_bit(bitmap, pages, 0);
2027
2028 while (run_start < pages) {
2029
2030 /*
2031 * If the start of this run is host-page aligned, skip to the end of
2032 * the run; a run that starts or ends mid host page is fixed up below.
2033 */
2034 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2035 /* Find the end of this run */
2036 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2037 /*
2038 * If the end isn't at the start of a host page, then the
2039 * run doesn't finish at the end of a host page
2040 * and we need to discard.
2041 */
2042 }
2043
2044 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2045 unsigned long page;
2046 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2047 host_ratio);
2048 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2049
2050 /* Clean up the bitmap */
2051 for (page = fixup_start_addr;
2052 page < fixup_start_addr + host_ratio; page++) {
2053 /*
2054 * Remark them as dirty, updating the count for any pages
2055 * that weren't previously dirty.
2056 */
2057 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2058 }
2059 }
2060
2061 /* Find the next dirty page for the next iteration */
2062 run_start = find_next_bit(bitmap, pages, run_start);
2063 }
2064 }
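
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the host-page
 * alignment arithmetic used above. The ALIGN_DOWN/ALIGN_UP macros mirror
 * QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP(); the page sizes are only examples.
 */
#if 0
#include <stdio.h>

#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m)   ALIGN_DOWN((n) + (m) - 1, (m))

int main(void)
{
    unsigned long host_ratio = 512;   /* 2MiB hugepage / 4KiB target page */
    unsigned long run_start = 1000;   /* dirty run starts mid host page */

    unsigned long fixup_start = ALIGN_DOWN(run_start, host_ratio);
    unsigned long next_scan = ALIGN_UP(run_start, host_ratio);

    /* the whole host page [512, 1024) gets re-marked dirty */
    printf("re-dirty target pages [%lu, %lu), continue scan at %lu\n",
           fixup_start, fixup_start + host_ratio, next_scan);
    return 0;
}
#endif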
2065
2066 /**
2067 * postcopy_chunk_hostpages: discard any partially sent host page
2068 *
2069 * Utility for the outgoing postcopy code.
2070 *
2071 * Discard any partially sent host-page size chunks, mark any partially
2072 * dirty host-page size chunks as all dirty. In this case the host-page
2073 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2074 *
2075 * Returns zero on success
2076 *
2077 * @ms: current migration state
2078 * @block: block we want to work with
2079 */
2080 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2081 {
2082 postcopy_discard_send_init(ms, block->idstr);
2083
2084 /*
2085 * Ensure that all partially dirty host pages are made fully dirty.
2086 */
2087 postcopy_chunk_hostpages_pass(ms, block);
2088
2089 postcopy_discard_send_finish(ms);
2090 return 0;
2091 }
2092
2093 /**
2094 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2095 *
2096 * Returns zero on success
2097 *
2098 * Transmit the set of pages to be discarded after precopy to the target.
2099 * These are pages that:
2100 * a) have been previously transmitted but are now dirty again, or
2101 * b) have never been transmitted; this ensures that any pages on the
2102 * destination that have been mapped by background tasks get discarded
2103 * (transparent huge pages are the specific concern).
2104 * Hopefully this set is pretty sparse.
2105 *
2106 * @ms: current migration state
2107 */
2108 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2109 {
2110 RAMState *rs = ram_state;
2111 RAMBlock *block;
2112 int ret;
2113
2114 RCU_READ_LOCK_GUARD();
2115
2116 /* This should be our last sync, the src is now paused */
2117 migration_bitmap_sync(rs);
2118
2119 /* Easiest way to make sure we don't resume in the middle of a host-page */
2120 rs->last_seen_block = NULL;
2121 rs->last_sent_block = NULL;
2122 rs->last_page = 0;
2123
2124 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2125 /* Deal with TPS != HPS and huge pages */
2126 ret = postcopy_chunk_hostpages(ms, block);
2127 if (ret) {
2128 return ret;
2129 }
2130
2131 #ifdef DEBUG_POSTCOPY
2132 ram_debug_dump_bitmap(block->bmap, true,
2133 block->used_length >> TARGET_PAGE_BITS);
2134 #endif
2135 }
2136 trace_ram_postcopy_send_discard_bitmap();
2137
2138 ret = postcopy_each_ram_send_discard(ms);
2139
2140 return ret;
2141 }
2142
2143 /**
2144 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2145 *
2146 * Returns zero on success
2147 *
2148 * @rbname: name of the RAMBlock of the request. NULL means the
2149 * same as the last one.
2150 * @start: RAMBlock starting page
2151 * @length: RAMBlock size
2152 */
2153 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2154 {
2155 trace_ram_discard_range(rbname, start, length);
2156
2157 RCU_READ_LOCK_GUARD();
2158 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2159
2160 if (!rb) {
2161 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2162 return -1;
2163 }
2164
2165 /*
2166 * On source VM, we don't need to update the received bitmap since
2167 * we don't even have one.
2168 */
2169 if (rb->receivedmap) {
2170 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2171 length >> qemu_target_page_bits());
2172 }
2173
2174 return ram_block_discard_range(rb, start, length);
2175 }
2176
2177 /*
2178 * For every allocation, we try not to crash the VM if the
2179 * allocation fails.
2180 */
2181 static int xbzrle_init(void)
2182 {
2183 Error *local_err = NULL;
2184
2185 if (!migrate_use_xbzrle()) {
2186 return 0;
2187 }
2188
2189 XBZRLE_cache_lock();
2190
2191 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2192 if (!XBZRLE.zero_target_page) {
2193 error_report("%s: Error allocating zero page", __func__);
2194 goto err_out;
2195 }
2196
2197 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2198 TARGET_PAGE_SIZE, &local_err);
2199 if (!XBZRLE.cache) {
2200 error_report_err(local_err);
2201 goto free_zero_page;
2202 }
2203
2204 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2205 if (!XBZRLE.encoded_buf) {
2206 error_report("%s: Error allocating encoded_buf", __func__);
2207 goto free_cache;
2208 }
2209
2210 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2211 if (!XBZRLE.current_buf) {
2212 error_report("%s: Error allocating current_buf", __func__);
2213 goto free_encoded_buf;
2214 }
2215
2216 /* We are all good */
2217 XBZRLE_cache_unlock();
2218 return 0;
2219
2220 free_encoded_buf:
2221 g_free(XBZRLE.encoded_buf);
2222 XBZRLE.encoded_buf = NULL;
2223 free_cache:
2224 cache_fini(XBZRLE.cache);
2225 XBZRLE.cache = NULL;
2226 free_zero_page:
2227 g_free(XBZRLE.zero_target_page);
2228 XBZRLE.zero_target_page = NULL;
2229 err_out:
2230 XBZRLE_cache_unlock();
2231 return -ENOMEM;
2232 }
2233
2234 static int ram_state_init(RAMState **rsp)
2235 {
2236 *rsp = g_try_new0(RAMState, 1);
2237
2238 if (!*rsp) {
2239 error_report("%s: Init ramstate fail", __func__);
2240 return -1;
2241 }
2242
2243 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2244 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2245 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2246
2247 /*
2248 * Count the total number of pages used by ram blocks not including any
2249 * gaps due to alignment or unplugs.
2250 * This must match with the initial values of dirty bitmap.
2251 */
2252 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2253 ram_state_reset(*rsp);
2254
2255 return 0;
2256 }
2257
2258 static void ram_list_init_bitmaps(void)
2259 {
2260 MigrationState *ms = migrate_get_current();
2261 RAMBlock *block;
2262 unsigned long pages;
2263 uint8_t shift;
2264
2265 /* Skip setting bitmap if there is no RAM */
2266 if (ram_bytes_total()) {
2267 shift = ms->clear_bitmap_shift;
2268 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2269 error_report("clear_bitmap_shift (%u) too big, using "
2270 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2271 shift = CLEAR_BITMAP_SHIFT_MAX;
2272 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2273 error_report("clear_bitmap_shift (%u) too small, using "
2274 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2275 shift = CLEAR_BITMAP_SHIFT_MIN;
2276 }
2277
2278 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2279 pages = block->max_length >> TARGET_PAGE_BITS;
2280 /*
2281 * The initial dirty bitmap for migration must be set with all
2282 * ones to make sure we'll migrate every guest RAM page to
2283 * destination.
2284 * Here we set RAMBlock.bmap all to 1 because when restarting a
2285 * new migration after a failed one, ram_list.
2286 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2287 * guest memory.
2288 */
2289 block->bmap = bitmap_new(pages);
2290 bitmap_set(block->bmap, 0, pages);
2291 block->clear_bmap_shift = shift;
2292 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2293 }
2294 }
2295 }
2296
2297 static void ram_init_bitmaps(RAMState *rs)
2298 {
2299 /* For memory_global_dirty_log_start below. */
2300 qemu_mutex_lock_iothread();
2301 qemu_mutex_lock_ramlist();
2302
2303 WITH_RCU_READ_LOCK_GUARD() {
2304 ram_list_init_bitmaps();
2305 memory_global_dirty_log_start();
2306 migration_bitmap_sync_precopy(rs);
2307 }
2308 qemu_mutex_unlock_ramlist();
2309 qemu_mutex_unlock_iothread();
2310 }
2311
2312 static int ram_init_all(RAMState **rsp)
2313 {
2314 if (ram_state_init(rsp)) {
2315 return -1;
2316 }
2317
2318 if (xbzrle_init()) {
2319 ram_state_cleanup(rsp);
2320 return -1;
2321 }
2322
2323 ram_init_bitmaps(*rsp);
2324
2325 return 0;
2326 }
2327
2328 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2329 {
2330 RAMBlock *block;
2331 uint64_t pages = 0;
2332
2333 /*
2334 * Postcopy is not using xbzrle/compression, so no need for that.
2335 * Also, since the source is already halted, we don't need to care
2336 * about dirty page logging either.
2337 */
2338
2339 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2340 pages += bitmap_count_one(block->bmap,
2341 block->used_length >> TARGET_PAGE_BITS);
2342 }
2343
2344 /* This may not be aligned with current bitmaps. Recalculate. */
2345 rs->migration_dirty_pages = pages;
2346
2347 rs->last_seen_block = NULL;
2348 rs->last_sent_block = NULL;
2349 rs->last_page = 0;
2350 rs->last_version = ram_list.version;
2351 /*
2352 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2353 * matter what we have sent.
2354 */
2355 rs->ram_bulk_stage = false;
2356
2357 /* Update RAMState cache of output QEMUFile */
2358 rs->f = out;
2359
2360 trace_ram_state_resume_prepare(pages);
2361 }
2362
2363 /*
2364 * This function clears bits of the free pages reported by the caller from the
2365 * migration dirty bitmap. @addr is the host address corresponding to the
2366 * start of the continuous guest free pages, and @len is the total bytes of
2367 * those pages.
2368 */
2369 void qemu_guest_free_page_hint(void *addr, size_t len)
2370 {
2371 RAMBlock *block;
2372 ram_addr_t offset;
2373 size_t used_len, start, npages;
2374 MigrationState *s = migrate_get_current();
2375
2376 /* This function is currently expected to be used during live migration */
2377 if (!migration_is_setup_or_active(s->state)) {
2378 return;
2379 }
2380
2381 for (; len > 0; len -= used_len, addr += used_len) {
2382 block = qemu_ram_block_from_host(addr, false, &offset);
2383 if (unlikely(!block || offset >= block->used_length)) {
2384 /*
2385 * The implementation might not support RAMBlock resize during
2386 * live migration, but it could happen in theory with future
2387 * updates. So we add a check here to capture that case.
2388 */
2389 error_report_once("%s unexpected error", __func__);
2390 return;
2391 }
2392
2393 if (len <= block->used_length - offset) {
2394 used_len = len;
2395 } else {
2396 used_len = block->used_length - offset;
2397 }
2398
2399 start = offset >> TARGET_PAGE_BITS;
2400 npages = used_len >> TARGET_PAGE_BITS;
2401
2402 qemu_mutex_lock(&ram_state->bitmap_mutex);
2403 ram_state->migration_dirty_pages -=
2404 bitmap_count_one_with_offset(block->bmap, start, npages);
2405 bitmap_clear(block->bmap, start, npages);
2406 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2407 }
2408 }
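
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the
 * byte-to-page conversion done above: a free (offset, length) range inside
 * a block becomes a (start, npages) pair whose bits are cleared from the
 * dirty bitmap. The 12-bit page shift is only an example.
 */
#if 0
#include <stdio.h>

#define EXAMPLE_PAGE_BITS 12            /* 4KiB target pages */

int main(void)
{
    unsigned long offset = 0x5000;      /* free range offset in the block */
    unsigned long used_len = 0x3000;    /* bytes covered by the hint */

    unsigned long start = offset >> EXAMPLE_PAGE_BITS;
    unsigned long npages = used_len >> EXAMPLE_PAGE_BITS;

    printf("clear dirty bits [%lu, %lu)\n", start, start + npages);
    return 0;
}
#endif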
2409
2410 /*
2411 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2412 * long-running RCU critical section. When RCU reclaims in the code
2413 * start to become numerous it will be necessary to reduce the
2414 * granularity of these critical sections.
2415 */
2416
2417 /**
2418 * ram_save_setup: Setup RAM for migration
2419 *
2420 * Returns zero to indicate success and negative for error
2421 *
2422 * @f: QEMUFile where to send the data
2423 * @opaque: RAMState pointer
2424 */
2425 static int ram_save_setup(QEMUFile *f, void *opaque)
2426 {
2427 RAMState **rsp = opaque;
2428 RAMBlock *block;
2429
2430 if (compress_threads_save_setup()) {
2431 return -1;
2432 }
2433
2434 /* migration has already setup the bitmap, reuse it. */
2435 if (!migration_in_colo_state()) {
2436 if (ram_init_all(rsp) != 0) {
2437 compress_threads_save_cleanup();
2438 return -1;
2439 }
2440 }
2441 (*rsp)->f = f;
2442
2443 WITH_RCU_READ_LOCK_GUARD() {
2444 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2445
2446 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2447 qemu_put_byte(f, strlen(block->idstr));
2448 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2449 qemu_put_be64(f, block->used_length);
2450 if (migrate_postcopy_ram() && block->page_size !=
2451 qemu_host_page_size) {
2452 qemu_put_be64(f, block->page_size);
2453 }
2454 if (migrate_ignore_shared()) {
2455 qemu_put_be64(f, block->mr->addr);
2456 }
2457 }
2458 }
2459
2460 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2461 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2462
2463 multifd_send_sync_main(f);
2464 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2465 qemu_fflush(f);
2466
2467 return 0;
2468 }
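
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the
 * per-RAMBlock record that ram_save_setup() emits: a one-byte idstr length,
 * the idstr bytes, and a big-endian 64-bit used_length. It serializes into
 * a plain buffer instead of a QEMUFile and omits the optional page_size and
 * mr->addr fields.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t put_be64(uint8_t *buf, uint64_t v)
{
    for (int i = 0; i < 8; i++) {
        buf[i] = v >> (56 - 8 * i);
    }
    return 8;
}

static size_t encode_block_header(uint8_t *buf, const char *idstr,
                                  uint64_t used_length)
{
    size_t len = strlen(idstr);
    size_t pos = 0;

    buf[pos++] = (uint8_t)len;               /* idstr length */
    memcpy(buf + pos, idstr, len);           /* idstr, not NUL terminated */
    pos += len;
    pos += put_be64(buf + pos, used_length); /* used_length, big endian */
    return pos;
}

int main(void)
{
    uint8_t buf[64];
    size_t n = encode_block_header(buf, "pc.ram", 128ULL << 20);

    printf("encoded %zu bytes for block pc.ram\n", n);
    return 0;
}
#endif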
2469
2470 /**
2471 * ram_save_iterate: iterative stage for migration
2472 *
2473 * Returns zero to indicate success and negative for error
2474 *
2475 * @f: QEMUFile where to send the data
2476 * @opaque: RAMState pointer
2477 */
2478 static int ram_save_iterate(QEMUFile *f, void *opaque)
2479 {
2480 RAMState **temp = opaque;
2481 RAMState *rs = *temp;
2482 int ret = 0;
2483 int i;
2484 int64_t t0;
2485 int done = 0;
2486
2487 if (blk_mig_bulk_active()) {
2488 /* Avoid transferring ram during bulk phase of block migration as
2489 * the bulk phase will usually take a long time and transferring
2490 * ram updates during that time is pointless. */
2491 goto out;
2492 }
2493
2494 WITH_RCU_READ_LOCK_GUARD() {
2495 if (ram_list.version != rs->last_version) {
2496 ram_state_reset(rs);
2497 }
2498
2499 /* Read version before ram_list.blocks */
2500 smp_rmb();
2501
2502 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2503
2504 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2505 i = 0;
2506 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2507 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2508 int pages;
2509
2510 if (qemu_file_get_error(f)) {
2511 break;
2512 }
2513
2514 pages = ram_find_and_save_block(rs, false);
2515 /* no more pages to send */
2516 if (pages == 0) {
2517 done = 1;
2518 break;
2519 }
2520
2521 if (pages < 0) {
2522 qemu_file_set_error(f, pages);
2523 break;
2524 }
2525
2526 rs->target_page_count += pages;
2527
2528 /*
2529 * During postcopy, it is necessary to make sure one whole host
2530 * page is sent in one chunk.
2531 */
2532 if (migrate_postcopy_ram()) {
2533 flush_compressed_data(rs);
2534 }
2535
2536 /*
2537 * We want to check in the 1st loop, just in case it was the 1st
2538 * time and we had to sync the dirty bitmap.
2539 * qemu_clock_get_ns() is a bit expensive, so we only check every
2540 * few iterations.
2541 */
2542 if ((i & 63) == 0) {
2543 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2544 1000000;
2545 if (t1 > MAX_WAIT) {
2546 trace_ram_save_iterate_big_wait(t1, i);
2547 break;
2548 }
2549 }
2550 i++;
2551 }
2552 }
2553
2554 /*
2555 * Must occur before EOS (or any QEMUFile operation)
2556 * because of RDMA protocol.
2557 */
2558 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2559
2560 out:
2561 if (ret >= 0
2562 && migration_is_setup_or_active(migrate_get_current()->state)) {
2563 multifd_send_sync_main(rs->f);
2564 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2565 qemu_fflush(f);
2566 ram_counters.transferred += 8;
2567
2568 ret = qemu_file_get_error(f);
2569 }
2570 if (ret < 0) {
2571 return ret;
2572 }
2573
2574 return done;
2575 }
2576
2577 /**
2578 * ram_save_complete: function called to send the remaining amount of ram
2579 *
2580 * Returns zero to indicate success or negative on error
2581 *
2582 * Called with iothread lock
2583 *
2584 * @f: QEMUFile where to send the data
2585 * @opaque: RAMState pointer
2586 */
2587 static int ram_save_complete(QEMUFile *f, void *opaque)
2588 {
2589 RAMState **temp = opaque;
2590 RAMState *rs = *temp;
2591 int ret = 0;
2592
2593 WITH_RCU_READ_LOCK_GUARD() {
2594 if (!migration_in_postcopy()) {
2595 migration_bitmap_sync_precopy(rs);
2596 }
2597
2598 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2599
2600 /* try transferring iterative blocks of memory */
2601
2602 /* flush all remaining blocks regardless of rate limiting */
2603 while (true) {
2604 int pages;
2605
2606 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2607 /* no more blocks to send */
2608 if (pages == 0) {
2609 break;
2610 }
2611 if (pages < 0) {
2612 ret = pages;
2613 break;
2614 }
2615 }
2616
2617 flush_compressed_data(rs);
2618 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2619 }
2620
2621 if (ret >= 0) {
2622 multifd_send_sync_main(rs->f);
2623 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2624 qemu_fflush(f);
2625 }
2626
2627 return ret;
2628 }
2629
2630 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2631 uint64_t *res_precopy_only,
2632 uint64_t *res_compatible,
2633 uint64_t *res_postcopy_only)
2634 {
2635 RAMState **temp = opaque;
2636 RAMState *rs = *temp;
2637 uint64_t remaining_size;
2638
2639 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2640
2641 if (!migration_in_postcopy() &&
2642 remaining_size < max_size) {
2643 qemu_mutex_lock_iothread();
2644 WITH_RCU_READ_LOCK_GUARD() {
2645 migration_bitmap_sync_precopy(rs);
2646 }
2647 qemu_mutex_unlock_iothread();
2648 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2649 }
2650
2651 if (migrate_postcopy_ram()) {
2652 /* We can do postcopy, and all the data is postcopiable */
2653 *res_compatible += remaining_size;
2654 } else {
2655 *res_precopy_only += remaining_size;
2656 }
2657 }
2658
2659 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2660 {
2661 unsigned int xh_len;
2662 int xh_flags;
2663 uint8_t *loaded_data;
2664
2665 /* extract RLE header */
2666 xh_flags = qemu_get_byte(f);
2667 xh_len = qemu_get_be16(f);
2668
2669 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2670 error_report("Failed to load XBZRLE page - wrong compression!");
2671 return -1;
2672 }
2673
2674 if (xh_len > TARGET_PAGE_SIZE) {
2675 error_report("Failed to load XBZRLE page - len overflow!");
2676 return -1;
2677 }
2678 loaded_data = XBZRLE.decoded_buf;
2679 /* load data and decode */
2680 /* it can change loaded_data to point to an internal buffer */
2681 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2682
2683 /* decode RLE */
2684 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2685 TARGET_PAGE_SIZE) == -1) {
2686 error_report("Failed to load XBZRLE page - decode error!");
2687 return -1;
2688 }
2689
2690 return 0;
2691 }
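
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the XBZRLE
 * record header parsed above: a one-byte flags field followed by a
 * big-endian 16-bit encoded length, then the encoded data itself. The flag
 * value used here is only an illustrative assumption.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_FLAG_XBZRLE 0x1         /* assumed value, for illustration */

static int parse_xbzrle_header(const uint8_t *buf, size_t buflen,
                               unsigned int *xh_len)
{
    if (buflen < 3 || buf[0] != EXAMPLE_FLAG_XBZRLE) {
        return -1;                      /* short record or wrong flag */
    }
    *xh_len = (buf[1] << 8) | buf[2];   /* big-endian 16-bit length */
    return 0;
}

int main(void)
{
    const uint8_t record[] = { 0x1, 0x00, 0x20 };   /* flags, len = 32 */
    unsigned int len;

    if (!parse_xbzrle_header(record, sizeof(record), &len)) {
        printf("XBZRLE payload length: %u bytes\n", len);
    }
    return 0;
}
#endif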
2692
2693 /**
2694 * ram_block_from_stream: read a RAMBlock id from the migration stream
2695 *
2696 * Must be called from within a rcu critical section.
2697 *
2698 * Returns a pointer from within the RCU-protected ram_list.
2699 *
2700 * @f: QEMUFile where to read the data from
2701 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2702 */
2703 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2704 {
2705 static RAMBlock *block = NULL;
2706 char id[256];
2707 uint8_t len;
2708
2709 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2710 if (!block) {
2711 error_report("Ack, bad migration stream!");
2712 return NULL;
2713 }
2714 return block;
2715 }
2716
2717 len = qemu_get_byte(f);
2718 qemu_get_buffer(f, (uint8_t *)id, len);
2719 id[len] = 0;
2720
2721 block = qemu_ram_block_by_name(id);
2722 if (!block) {
2723 error_report("Can't find block %s", id);
2724 return NULL;
2725 }
2726
2727 if (ramblock_is_ignored(block)) {
2728 error_report("block %s should not be migrated !", id);
2729 return NULL;
2730 }
2731
2732 return block;
2733 }
2734
2735 static inline void *host_from_ram_block_offset(RAMBlock *block,
2736 ram_addr_t offset)
2737 {
2738 if (!offset_in_ramblock(block, offset)) {
2739 return NULL;
2740 }
2741
2742 return block->host + offset;
2743 }
2744
2745 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2746 ram_addr_t offset, bool record_bitmap)
2747 {
2748 if (!offset_in_ramblock(block, offset)) {
2749 return NULL;
2750 }
2751 if (!block->colo_cache) {
2752 error_report("%s: colo_cache is NULL in block :%s",
2753 __func__, block->idstr);
2754 return NULL;
2755 }
2756
2757 /*
2758 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2759 * It helps us decide which pages in the RAM cache should be flushed
2760 * into the VM's RAM later.
2761 */
2762 if (record_bitmap &&
2763 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2764 ram_state->migration_dirty_pages++;
2765 }
2766 return block->colo_cache + offset;
2767 }
2768
2769 /**
2770 * ram_handle_compressed: handle the zero page case
2771 *
2772 * If a page (or a whole RDMA chunk) has been
2773 * determined to be zero, then zap it.
2774 *
2775 * @host: host address for the zero page
2776 * @ch: what the page is filled from. We only support zero
2777 * @size: size of the zero page
2778 */
2779 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2780 {
2781 if (ch != 0 || !is_zero_range(host, size)) {
2782 memset(host, ch, size);
2783 }
2784 }
2785
2786 /* return the size after decompression, or negative value on error */
2787 static int
2788 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2789 const uint8_t *source, size_t source_len)
2790 {
2791 int err;
2792
2793 err = inflateReset(stream);
2794 if (err != Z_OK) {
2795 return -1;
2796 }
2797
2798 stream->avail_in = source_len;
2799 stream->next_in = (uint8_t *)source;
2800 stream->avail_out = dest_len;
2801 stream->next_out = dest;
2802
2803 err = inflate(stream, Z_NO_FLUSH);
2804 if (err != Z_STREAM_END) {
2805 return -1;
2806 }
2807
2808 return stream->total_out;
2809 }
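
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the zlib
 * pattern used above: one persistent z_stream is initialised once, then
 * reset and fed one whole compressed page per call, expecting Z_STREAM_END
 * for a complete page. Page size and compression level are only examples.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define EXAMPLE_PAGE_SIZE 4096

static int uncompress_page(z_stream *stream, uint8_t *dest, size_t dest_len,
                           const uint8_t *source, size_t source_len)
{
    if (inflateReset(stream) != Z_OK) {
        return -1;
    }
    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* a complete page must decode to exactly one deflate stream */
    if (inflate(stream, Z_NO_FLUSH) != Z_STREAM_END) {
        return -1;
    }
    return stream->total_out;
}

int main(void)
{
    static uint8_t page[EXAMPLE_PAGE_SIZE], out[EXAMPLE_PAGE_SIZE];
    static uint8_t comp[EXAMPLE_PAGE_SIZE + 128];
    uLongf comp_len = sizeof(comp);
    z_stream stream = { 0 };

    memset(page, 'x', sizeof(page));
    compress2(comp, &comp_len, page, EXAMPLE_PAGE_SIZE, Z_BEST_SPEED);

    if (inflateInit(&stream) != Z_OK) {
        return 1;
    }
    printf("decompressed %d bytes\n",
           uncompress_page(&stream, out, EXAMPLE_PAGE_SIZE, comp, comp_len));
    inflateEnd(&stream);
    return 0;
}
#endif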
2810
2811 static void *do_data_decompress(void *opaque)
2812 {
2813 DecompressParam *param = opaque;
2814 unsigned long pagesize;
2815 uint8_t *des;
2816 int len, ret;
2817
2818 qemu_mutex_lock(&param->mutex);
2819 while (!param->quit) {
2820 if (param->des) {
2821 des = param->des;
2822 len = param->len;
2823 param->des = 0;
2824 qemu_mutex_unlock(&param->mutex);
2825
2826 pagesize = TARGET_PAGE_SIZE;
2827
2828 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2829 param->compbuf, len);
2830 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2831 error_report("decompress data failed");
2832 qemu_file_set_error(decomp_file, ret);
2833 }
2834
2835 qemu_mutex_lock(&decomp_done_lock);
2836 param->done = true;
2837 qemu_cond_signal(&decomp_done_cond);
2838 qemu_mutex_unlock(&decomp_done_lock);
2839
2840 qemu_mutex_lock(&param->mutex);
2841 } else {
2842 qemu_cond_wait(&param->cond, &param->mutex);
2843 }
2844 }
2845 qemu_mutex_unlock(&param->mutex);
2846
2847 return NULL;
2848 }
2849
2850 static int wait_for_decompress_done(void)
2851 {
2852 int idx, thread_count;
2853
2854 if (!migrate_use_compression()) {
2855 return 0;
2856 }
2857
2858 thread_count = migrate_decompress_threads();
2859 qemu_mutex_lock(&decomp_done_lock);
2860 for (idx = 0; idx < thread_count; idx++) {
2861 while (!decomp_param[idx].done) {
2862 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2863 }
2864 }
2865 qemu_mutex_unlock(&decomp_done_lock);
2866 return qemu_file_get_error(decomp_file);
2867 }
2868
2869 static void compress_threads_load_cleanup(void)
2870 {
2871 int i, thread_count;
2872
2873 if (!migrate_use_compression()) {
2874 return;
2875 }
2876 thread_count = migrate_decompress_threads();
2877 for (i = 0; i < thread_count; i++) {
2878 /*
2879 * we use it as an indicator of whether the thread was
2880 * properly initialized or not
2881 */
2882 if (!decomp_param[i].compbuf) {
2883 break;
2884 }
2885
2886 qemu_mutex_lock(&decomp_param[i].mutex);
2887 decomp_param[i].quit = true;
2888 qemu_cond_signal(&decomp_param[i].cond);
2889 qemu_mutex_unlock(&decomp_param[i].mutex);
2890 }
2891 for (i = 0; i < thread_count; i++) {
2892 if (!decomp_param[i].compbuf) {
2893 break;
2894 }
2895
2896 qemu_thread_join(decompress_threads + i);
2897 qemu_mutex_destroy(&decomp_param[i].mutex);
2898 qemu_cond_destroy(&decomp_param[i].cond);
2899 inflateEnd(&decomp_param[i].stream);
2900 g_free(decomp_param[i].compbuf);
2901 decomp_param[i].compbuf = NULL;
2902 }
2903 g_free(decompress_threads);
2904 g_free(decomp_param);
2905 decompress_threads = NULL;
2906 decomp_param = NULL;
2907 decomp_file = NULL;
2908 }
2909
2910 static int compress_threads_load_setup(QEMUFile *f)
2911 {
2912 int i, thread_count;
2913
2914 if (!migrate_use_compression()) {
2915 return 0;
2916 }
2917
2918 thread_count = migrate_decompress_threads();
2919 decompress_threads = g_new0(QemuThread, thread_count);
2920 decomp_param = g_new0(DecompressParam, thread_count);
2921 qemu_mutex_init(&decomp_done_lock);
2922 qemu_cond_init(&decomp_done_cond);
2923 decomp_file = f;
2924 for (i = 0; i < thread_count; i++) {
2925 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2926 goto exit;
2927 }
2928
2929 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2930 qemu_mutex_init(&decomp_param[i].mutex);
2931 qemu_cond_init(&decomp_param[i].cond);
2932 decomp_param[i].done = true;
2933 decomp_param[i].quit = false;
2934 qemu_thread_create(decompress_threads + i, "decompress",
2935 do_data_decompress, decomp_param + i,
2936 QEMU_THREAD_JOINABLE);
2937 }
2938 return 0;
2939 exit:
2940 compress_threads_load_cleanup();
2941 return -1;
2942 }
2943
2944 static void decompress_data_with_multi_threads(QEMUFile *f,
2945 void *host, int len)
2946 {
2947 int idx, thread_count;
2948
2949 thread_count = migrate_decompress_threads();
2950 qemu_mutex_lock(&decomp_done_lock);
2951 while (true) {
2952 for (idx = 0; idx < thread_count; idx++) {
2953 if (decomp_param[idx].done) {
2954 decomp_param[idx].done = false;
2955 qemu_mutex_lock(&decomp_param[idx].mutex);
2956 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2957 decomp_param[idx].des = host;
2958 decomp_param[idx].len = len;
2959 qemu_cond_signal(&decomp_param[idx].cond);
2960 qemu_mutex_unlock(&decomp_param[idx].mutex);
2961 break;
2962 }
2963 }
2964 if (idx < thread_count) {
2965 break;
2966 } else {
2967 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2968 }
2969 }
2970 qemu_mutex_unlock(&decomp_done_lock);
2971 }
2972
2973 /*
2974 * colo cache: this is for the secondary VM, where we cache the whole
2975 * memory of the secondary VM. The caller needs to hold the global
2976 * lock to call this helper.
2977 */
2978 int colo_init_ram_cache(void)
2979 {
2980 RAMBlock *block;
2981
2982 WITH_RCU_READ_LOCK_GUARD() {
2983 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2984 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2985 NULL,
2986 false);
2987 if (!block->colo_cache) {
2988 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2989 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2990 block->used_length);
2991 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2992 if (block->colo_cache) {
2993 qemu_anon_ram_free(block->colo_cache, block->used_length);
2994 block->colo_cache = NULL;
2995 }
2996 }
2997 return -errno;
2998 }
2999 }
3000 }
3001
3002 /*
3003 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3004 * to decide which pages in the cache should be flushed into the SVM's
3005 * RAM. Here we use the same name 'ram_bitmap' as for migration.
3006 */
3007 if (ram_bytes_total()) {
3008 RAMBlock *block;
3009
3010 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3011 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3012 block->bmap = bitmap_new(pages);
3013 }
3014 }
3015
3016 ram_state_init(&ram_state);
3017 return 0;
3018 }
3019
3020 /* TODO: duplicated with ram_init_bitmaps */
3021 void colo_incoming_start_dirty_log(void)
3022 {
3023 RAMBlock *block = NULL;
3024 /* For memory_global_dirty_log_start below. */
3025 qemu_mutex_lock_iothread();
3026 qemu_mutex_lock_ramlist();
3027
3028 memory_global_dirty_log_sync();
3029 WITH_RCU_READ_LOCK_GUARD() {
3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031 ramblock_sync_dirty_bitmap(ram_state, block);
3032 /* Discard this dirty bitmap record */
3033 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3034 }
3035 memory_global_dirty_log_start();
3036 }
3037 ram_state->migration_dirty_pages = 0;
3038 qemu_mutex_unlock_ramlist();
3039 qemu_mutex_unlock_iothread();
3040 }
3041
3042 /* The caller needs to hold the global lock to call this helper */
3043 void colo_release_ram_cache(void)
3044 {
3045 RAMBlock *block;
3046
3047 memory_global_dirty_log_stop();
3048 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3049 g_free(block->bmap);
3050 block->bmap = NULL;
3051 }
3052
3053 WITH_RCU_READ_LOCK_GUARD() {
3054 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3055 if (block->colo_cache) {
3056 qemu_anon_ram_free(block->colo_cache, block->used_length);
3057 block->colo_cache = NULL;
3058 }
3059 }
3060 }
3061 ram_state_cleanup(&ram_state);
3062 }
3063
3064 /**
3065 * ram_load_setup: Setup RAM for migration incoming side
3066 *
3067 * Returns zero to indicate success and negative for error
3068 *
3069 * @f: QEMUFile where to receive the data
3070 * @opaque: RAMState pointer
3071 */
3072 static int ram_load_setup(QEMUFile *f, void *opaque)
3073 {
3074 if (compress_threads_load_setup(f)) {
3075 return -1;
3076 }
3077
3078 xbzrle_load_setup();
3079 ramblock_recv_map_init();
3080
3081 return 0;
3082 }
3083
3084 static int ram_load_cleanup(void *opaque)
3085 {
3086 RAMBlock *rb;
3087
3088 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3089 qemu_ram_block_writeback(rb);
3090 }
3091
3092 xbzrle_load_cleanup();
3093 compress_threads_load_cleanup();
3094
3095 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3096 g_free(rb->receivedmap);
3097 rb->receivedmap = NULL;
3098 }
3099
3100 return 0;
3101 }
3102
3103 /**
3104 * ram_postcopy_incoming_init: allocate postcopy data structures
3105 *
3106 * Returns 0 for success and negative if there was one error
3107 *
3108 * @mis: current migration incoming state
3109 *
3110 * Allocate data structures etc needed by incoming migration with
3111 * postcopy-ram. postcopy-ram's similarly named
3112 * postcopy_ram_incoming_init does the work.
3113 */
3114 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3115 {
3116 return postcopy_ram_incoming_init(mis);
3117 }
3118
3119 /**
3120 * ram_load_postcopy: load a page in postcopy case
3121 *
3122 * Returns 0 for success or -errno in case of error
3123 *
3124 * Called in postcopy mode by ram_load().
3125 * rcu_read_lock is taken prior to this being called.
3126 *
3127 * @f: QEMUFile where to send the data
3128 */
3129 static int ram_load_postcopy(QEMUFile *f)
3130 {
3131 int flags = 0, ret = 0;
3132 bool place_needed = false;
3133 bool matches_target_page_size = false;
3134 MigrationIncomingState *mis = migration_incoming_get_current();
3135 /* Temporary page that is later 'placed' */
3136 void *postcopy_host_page = mis->postcopy_tmp_page;
3137 void *this_host = NULL;
3138 bool all_zero = false;
3139 int target_pages = 0;
3140
3141 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3142 ram_addr_t addr;
3143 void *host = NULL;
3144 void *page_buffer = NULL;
3145 void *place_source = NULL;
3146 RAMBlock *block = NULL;
3147 uint8_t ch;
3148 int len;
3149
3150 addr = qemu_get_be64(f);
3151
3152 /*
3153 * If there is a qemu file error, we should stop here; "addr"
3154 * may be invalid.
3155 */
3156 ret = qemu_file_get_error(f);
3157 if (ret) {
3158 break;
3159 }
3160
3161 flags = addr & ~TARGET_PAGE_MASK;
3162 addr &= TARGET_PAGE_MASK;
3163
3164 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3165 place_needed = false;
3166 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3167 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3168 block = ram_block_from_stream(f, flags);
3169
3170 host = host_from_ram_block_offset(block, addr);
3171 if (!host) {
3172 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3173 ret = -EINVAL;
3174 break;
3175 }
3176 target_pages++;
3177 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3178 /*
3179 * Postcopy requires that we place whole host pages atomically;
3180 * these may be huge pages for RAMBlocks that are backed by
3181 * hugetlbfs.
3182 * To make it atomic, the data is read into a temporary page
3183 * that's moved into place later.
3184 * The migration protocol uses, possibly smaller, target pages;
3185 * however, the source ensures it always sends all the components
3186 * of a host page in one chunk.
3187 */
3188 page_buffer = postcopy_host_page +
3189 ((uintptr_t)host & (block->page_size - 1));
3190 /* If all TP are zero then we can optimise the place */
3191 if (target_pages == 1) {
3192 all_zero = true;
3193 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3194 block->page_size);
3195 } else {
3196 /* not the 1st TP within the HP */
3197 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3198 (uintptr_t)this_host) {
3199 error_report("Non-same host page %p/%p",
3200 host, this_host);
3201 ret = -EINVAL;
3202 break;
3203 }
3204 }
3205
3206 /*
3207 * If it's the last part of a host page then we place the host
3208 * page
3209 */
3210 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3211 place_needed = true;
3212 target_pages = 0;
3213 }
3214 place_source = postcopy_host_page;
3215 }
3216
3217 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3218 case RAM_SAVE_FLAG_ZERO:
3219 ch = qemu_get_byte(f);
3220 /*
3221 * We can skip setting page_buffer when
3222 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3223 */
3224 if (ch || !matches_target_page_size) {
3225 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3226 }
3227 if (ch) {
3228 all_zero = false;
3229 }
3230 break;
3231
3232 case RAM_SAVE_FLAG_PAGE:
3233 all_zero = false;
3234 if (!matches_target_page_size) {
3235 /* For huge pages, we always use temporary buffer */
3236 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3237 } else {
3238 /*
3239 * For small pages that matches target page size, we
3240 * avoid the qemu_file copy. Instead we directly use
3241 * the buffer of QEMUFile to place the page. Note: we
3242 * cannot do any QEMUFile operation before using that
3243 * buffer to make sure the buffer is valid when
3244 * placing the page.
3245 */
3246 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3247 TARGET_PAGE_SIZE);
3248 }
3249 break;
3250 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3251 all_zero = false;
3252 len = qemu_get_be32(f);
3253 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3254 error_report("Invalid compressed data length: %d", len);
3255 ret = -EINVAL;
3256 break;
3257 }
3258 decompress_data_with_multi_threads(f, page_buffer, len);
3259 break;
3260
3261 case RAM_SAVE_FLAG_EOS:
3262 /* normal exit */
3263 multifd_recv_sync_main();
3264 break;
3265 default:
3266 error_report("Unknown combination of migration flags: %#x"
3267 " (postcopy mode)", flags);
3268 ret = -EINVAL;
3269 break;
3270 }
3271
3272 /* Got the whole host page, wait for decompress before placing. */
3273 if (place_needed) {
3274 ret |= wait_for_decompress_done();
3275 }
3276
3277 /* Detect for any possible file errors */
3278 if (!ret && qemu_file_get_error(f)) {
3279 ret = qemu_file_get_error(f);
3280 }
3281
3282 if (!ret && place_needed) {
3283 /* This gets called at the last target page in the host page */
3284 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3285 block->page_size);
3286
3287 if (all_zero) {
3288 ret = postcopy_place_page_zero(mis, place_dest,
3289 block);
3290 } else {
3291 ret = postcopy_place_page(mis, place_dest,
3292 place_source, block);
3293 }
3294 }
3295 }
3296
3297 return ret;
3298 }
3299
3300 static bool postcopy_is_advised(void)
3301 {
3302 PostcopyState ps = postcopy_state_get();
3303 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3304 }
3305
3306 static bool postcopy_is_running(void)
3307 {
3308 PostcopyState ps = postcopy_state_get();
3309 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3310 }
3311
3312 /*
3313 * Flush content of RAM cache into SVM's memory.
3314 * Only flush the pages that have been dirtied by the PVM or SVM or both.
3315 */
3316 static void colo_flush_ram_cache(void)
3317 {
3318 RAMBlock *block = NULL;
3319 void *dst_host;
3320 void *src_host;
3321 unsigned long offset = 0;
3322
3323 memory_global_dirty_log_sync();
3324 WITH_RCU_READ_LOCK_GUARD() {
3325 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3326 ramblock_sync_dirty_bitmap(ram_state, block);
3327 }
3328 }
3329
3330 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3331 WITH_RCU_READ_LOCK_GUARD() {
3332 block = QLIST_FIRST_RCU(&ram_list.blocks);
3333
3334 while (block) {
3335 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3336
3337 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3338 >= block->used_length) {
3339 offset = 0;
3340 block = QLIST_NEXT_RCU(block, next);
3341 } else {
3342 migration_bitmap_clear_dirty(ram_state, block, offset);
3343 dst_host = block->host
3344 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3345 src_host = block->colo_cache
3346 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3347 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3348 }
3349 }
3350 }
3351 trace_colo_flush_ram_cache_end();
3352 }
3353
3354 /**
3355 * ram_load_precopy: load pages in precopy case
3356 *
3357 * Returns 0 for success or -errno in case of error
3358 *
3359 * Called in precopy mode by ram_load().
3360 * rcu_read_lock is taken prior to this being called.
3361 *
3362 * @f: QEMUFile where to send the data
3363 */
3364 static int ram_load_precopy(QEMUFile *f)
3365 {
3366 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3367 /* ADVISE is sent earlier; it shows that the source has the postcopy capability on */
3368 bool postcopy_advised = postcopy_is_advised();
3369 if (!migrate_use_compression()) {
3370 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3371 }
3372
3373 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3374 ram_addr_t addr, total_ram_bytes;
3375 void *host = NULL, *host_bak = NULL;
3376 uint8_t ch;
3377
3378 /*
3379 * Yield periodically to let the main loop run, but an iteration of
3380 * the main loop is expensive, so only do it every few iterations.
3381 */
3382 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3383 aio_co_schedule(qemu_get_current_aio_context(),
3384 qemu_coroutine_self());
3385 qemu_coroutine_yield();
3386 }
3387 i++;
3388
3389 addr = qemu_get_be64(f);
3390 flags = addr & ~TARGET_PAGE_MASK;
3391 addr &= TARGET_PAGE_MASK;
3392
3393 if (flags & invalid_flags) {
3394 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3395 error_report("Received an unexpected compressed page");
3396 }
3397
3398 ret = -EINVAL;
3399 break;
3400 }
3401
3402 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3403 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3404 RAMBlock *block = ram_block_from_stream(f, flags);
3405
3406 host = host_from_ram_block_offset(block, addr);
3407 /*
3408 * After going into the COLO stage, we should not load pages
3409 * into the SVM's memory directly; we put them into colo_cache first.
3410 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3411 * Previously, we copied all this memory in the COLO preparing stage
3412 * while the VM was stopped, which is a time-consuming process.
3413 * Here we optimize it by backing up every page during the migration
3414 * process while COLO is enabled. Though it affects the speed of the
3415 * migration, it obviously reduces the downtime caused by backing up
3416 * all of the SVM's memory in the COLO preparing stage.
3417 */
3418 if (migration_incoming_colo_enabled()) {
3419 if (migration_incoming_in_colo_state()) {
3420 /* In COLO stage, put all pages into cache temporarily */
3421 host = colo_cache_from_block_offset(block, addr, true);
3422 } else {
3423 /*
3424 * In migration stage but before COLO stage,
3425 * put all pages into both the cache and the SVM's memory.
3426 */
3427 host_bak = colo_cache_from_block_offset(block, addr, false);
3428 }
3429 }
3430 if (!host) {
3431 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3432 ret = -EINVAL;
3433 break;
3434 }
3435 if (!migration_incoming_in_colo_state()) {
3436 ramblock_recv_bitmap_set(block, host);
3437 }
3438
3439 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3440 }
3441
3442 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3443 case RAM_SAVE_FLAG_MEM_SIZE:
3444 /* Synchronize RAM block list */
3445 total_ram_bytes = addr;
3446 while (!ret && total_ram_bytes) {
3447 RAMBlock *block;
3448 char id[256];
3449 ram_addr_t length;
3450
3451 len = qemu_get_byte(f);
3452 qemu_get_buffer(f, (uint8_t *)id, len);
3453 id[len] = 0;
3454 length = qemu_get_be64(f);
3455
3456 block = qemu_ram_block_by_name(id);
3457 if (block && !qemu_ram_is_migratable(block)) {
3458 error_report("block %s should not be migrated !", id);
3459 ret = -EINVAL;
3460 } else if (block) {
3461 if (length != block->used_length) {
3462 Error *local_err = NULL;
3463
3464 ret = qemu_ram_resize(block, length,
3465 &local_err);
3466 if (local_err) {
3467 error_report_err(local_err);
3468 }
3469 }
3470 /* For postcopy we need to check hugepage sizes match */
3471 if (postcopy_advised &&
3472 block->page_size != qemu_host_page_size) {
3473 uint64_t remote_page_size = qemu_get_be64(f);
3474 if (remote_page_size != block->page_size) {
3475 error_report("Mismatched RAM page size %s "
3476 "(local) %zd != %" PRId64,
3477 id, block->page_size,
3478 remote_page_size);
3479 ret = -EINVAL;
3480 }
3481 }
3482 if (migrate_ignore_shared()) {
3483 hwaddr addr = qemu_get_be64(f);
3484 if (ramblock_is_ignored(block) &&
3485 block->mr->addr != addr) {
3486 error_report("Mismatched GPAs for block %s "
3487 "%" PRId64 "!= %" PRId64,
3488 id, (uint64_t)addr,
3489 (uint64_t)block->mr->addr);
3490 ret = -EINVAL;
3491 }
3492 }
3493 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3494 block->idstr);
3495 } else {
3496 error_report("Unknown ramblock \"%s\", cannot "
3497 "accept migration", id);
3498 ret = -EINVAL;
3499 }
3500
3501 total_ram_bytes -= length;
3502 }
3503 break;
3504
3505 case RAM_SAVE_FLAG_ZERO:
3506 ch = qemu_get_byte(f);
3507 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3508 break;
3509
3510 case RAM_SAVE_FLAG_PAGE:
3511 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3512 break;
3513
3514 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3515 len = qemu_get_be32(f);
3516 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3517 error_report("Invalid compressed data length: %d", len);
3518 ret = -EINVAL;
3519 break;
3520 }
3521 decompress_data_with_multi_threads(f, host, len);
3522 break;
3523
3524 case RAM_SAVE_FLAG_XBZRLE:
3525 if (load_xbzrle(f, addr, host) < 0) {
3526 error_report("Failed to decompress XBZRLE page at "
3527 RAM_ADDR_FMT, addr);
3528 ret = -EINVAL;
3529 break;
3530 }
3531 break;
3532 case RAM_SAVE_FLAG_EOS:
3533 /* normal exit */
3534 multifd_recv_sync_main();
3535 break;
3536 default:
3537 if (flags & RAM_SAVE_FLAG_HOOK) {
3538 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3539 } else {
3540 error_report("Unknown combination of migration flags: %#x",
3541 flags);
3542 ret = -EINVAL;
3543 }
3544 }
3545 if (!ret) {
3546 ret = qemu_file_get_error(f);
3547 }
3548 if (!ret && host_bak) {
3549 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3550 }
3551 }
3552
3553 ret |= wait_for_decompress_done();
3554 return ret;
3555 }
3556
3557 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3558 {
3559 int ret = 0;
3560 static uint64_t seq_iter;
3561 /*
3562 * If system is running in postcopy mode, page inserts to host memory must
3563 * be atomic
3564 */
3565 bool postcopy_running = postcopy_is_running();
3566
3567 seq_iter++;
3568
3569 if (version_id != 4) {
3570 return -EINVAL;
3571 }
3572
3573 /*
3574 * This RCU critical section can be very long running.
3575 * When RCU reclaims in the code start to become numerous,
3576 * it will be necessary to reduce the granularity of this
3577 * critical section.
3578 */
3579 WITH_RCU_READ_LOCK_GUARD() {
3580 if (postcopy_running) {
3581 ret = ram_load_postcopy(f);
3582 } else {
3583 ret = ram_load_precopy(f);
3584 }
3585 }
3586 trace_ram_load_complete(ret, seq_iter);
3587
3588 if (!ret && migration_incoming_in_colo_state()) {
3589 colo_flush_ram_cache();
3590 }
3591 return ret;
3592 }
3593
3594 static bool ram_has_postcopy(void *opaque)
3595 {
3596 RAMBlock *rb;
3597 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3598 if (ramblock_is_pmem(rb)) {
3599 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
3600 "is not supported now!", rb->idstr, rb->host);
3601 return false;
3602 }
3603 }
3604
3605 return migrate_postcopy_ram();
3606 }
3607
3608 /* Sync all the dirty bitmap with destination VM. */
3609 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3610 {
3611 RAMBlock *block;
3612 QEMUFile *file = s->to_dst_file;
3613 int ramblock_count = 0;
3614
3615 trace_ram_dirty_bitmap_sync_start();
3616
3617 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3618 qemu_savevm_send_recv_bitmap(file, block->idstr);
3619 trace_ram_dirty_bitmap_request(block->idstr);
3620 ramblock_count++;
3621 }
3622
3623 trace_ram_dirty_bitmap_sync_wait();
3624
3625 /* Wait until all the ramblocks' dirty bitmap synced */
3626 while (ramblock_count--) {
3627 qemu_sem_wait(&s->rp_state.rp_sem);
3628 }
3629
3630 trace_ram_dirty_bitmap_sync_complete();
3631
3632 return 0;
3633 }
3634
3635 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3636 {
3637 qemu_sem_post(&s->rp_state.rp_sem);
3638 }
3639
3640 /*
3641 * Read the received bitmap, revert it as the initial dirty bitmap.
3642 * This is only used when the postcopy migration is paused but wants
3643 * to resume from a middle point.
3644 */
3645 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3646 {
3647 int ret = -EINVAL;
3648 QEMUFile *file = s->rp_state.from_dst_file;
3649 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3650 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3651 uint64_t size, end_mark;
3652
3653 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3654
3655 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3656 error_report("%s: incorrect state %s", __func__,
3657 MigrationStatus_str(s->state));
3658 return -EINVAL;
3659 }
3660
3661 /*
3662 * Note: see comments in ramblock_recv_bitmap_send() on why we
3663 * need the endianness conversion, and the paddings.
3664 */
3665 local_size = ROUND_UP(local_size, 8);
3666
3667 /* Add paddings */
3668 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3669
3670 size = qemu_get_be64(file);
3671
3672 /* The size of the bitmap should match with our ramblock */
3673 if (size != local_size) {
3674 error_report("%s: ramblock '%s' bitmap size mismatch "
3675 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3676 block->idstr, size, local_size);
3677 ret = -EINVAL;
3678 goto out;
3679 }
3680
3681 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3682 end_mark = qemu_get_be64(file);
3683
3684 ret = qemu_file_get_error(file);
3685 if (ret || size != local_size) {
3686 error_report("%s: read bitmap failed for ramblock '%s': %d"
3687 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3688 __func__, block->idstr, ret, local_size, size);
3689 ret = -EIO;
3690 goto out;
3691 }
3692
3693 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3694 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3695 __func__, block->idstr, end_mark);
3696 ret = -EINVAL;
3697 goto out;
3698 }
3699
3700 /*
3701 * Endianness conversion. We are in postcopy (though paused).
3702 * The dirty bitmap won't change. We can directly modify it.
3703 */
3704 bitmap_from_le(block->bmap, le_bitmap, nbits);
3705
3706 /*
3707 * What we received is "received bitmap". Revert it as the initial
3708 * dirty bitmap for this ramblock.
3709 */
3710 bitmap_complement(block->bmap, block->bmap, nbits);
3711
3712 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3713
3714 /*
3715 * We succeeded in syncing the bitmap for the current ramblock. If this is
3716 * the last one to sync, we need to notify the main send thread.
3717 */
3718 ram_dirty_bitmap_reload_notify(s);
3719
3720 ret = 0;
3721 out:
3722 g_free(le_bitmap);
3723 return ret;
3724 }
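
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the "revert"
 * step above: pages the destination reports as received are exactly the
 * pages that are no longer dirty, so complementing the received bitmap
 * yields the initial dirty bitmap for the resumed postcopy migration.
 */
#if 0
#include <stdio.h>

int main(void)
{
    /* destination reports pages 0, 1, 2 and 5 as received */
    unsigned char received = 0x27;              /* 0b00100111 */
    unsigned char dirty = (unsigned char)~received;

    for (int i = 0; i < 8; i++) {
        printf("page %d: received=%d -> dirty=%d\n",
               i, (received >> i) & 1, (dirty >> i) & 1);
    }
    return 0;
}
#endif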
3725
3726 static int ram_resume_prepare(MigrationState *s, void *opaque)
3727 {
3728 RAMState *rs = *(RAMState **)opaque;
3729 int ret;
3730
3731 ret = ram_dirty_bitmap_sync_all(s, rs);
3732 if (ret) {
3733 return ret;
3734 }
3735
3736 ram_state_resume_prepare(rs, s->to_dst_file);
3737
3738 return 0;
3739 }
3740
3741 static SaveVMHandlers savevm_ram_handlers = {
3742 .save_setup = ram_save_setup,
3743 .save_live_iterate = ram_save_iterate,
3744 .save_live_complete_postcopy = ram_save_complete,
3745 .save_live_complete_precopy = ram_save_complete,
3746 .has_postcopy = ram_has_postcopy,
3747 .save_live_pending = ram_save_pending,
3748 .load_state = ram_load,
3749 .save_cleanup = ram_save_cleanup,
3750 .load_setup = ram_load_setup,
3751 .load_cleanup = ram_load_cleanup,
3752 .resume_prepare = ram_resume_prepare,
3753 };
3754
3755 void ram_mig_init(void)
3756 {
3757 qemu_mutex_init(&XBZRLE.lock);
3758 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3759 }