1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58
59 /***********************************************************/
60 /* ram save/restore */
61
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66 */
67
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80 return buffer_is_zero(p, size);
81 }
82
83 XBZRLECacheStats xbzrle_counters;
84
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
100
101 static void XBZRLE_cache_lock(void)
102 {
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105 }
106
107 static void XBZRLE_cache_unlock(void)
108 {
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111 }
112
113 /**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
121 * Returns 0 for success or -1 for error
122 *
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with reason
125 */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128 PageCache *new_cache;
129 int64_t ret = 0;
130
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
141 }
142
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
158 }
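/*
 * Illustrative usage sketch (not part of the original file): how a
 * QMP-level caller might resize the XBZRLE cache.  The 64 MiB value and
 * the function name below are hypothetical examples; error handling just
 * reports whatever xbzrle_cache_resize() put into *errp.
 */
#if 0   /* example only, not compiled */
static void example_resize_xbzrle_cache(void)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
        error_report_err(err);
    }
}
#endif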
159
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162 return !qemu_ram_is_migratable(block) ||
163 (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
168 INTERNAL_RAMBLOCK_FOREACH(block) \
169 if (ramblock_is_ignored(block)) {} else
170
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
172 INTERNAL_RAMBLOCK_FOREACH(block) \
173 if (!qemu_ram_is_migratable(block)) {} else
174
175 #undef RAMBLOCK_FOREACH
176
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179 RAMBlock *block;
180 int ret = 0;
181
182 RCU_READ_LOCK_GUARD();
183
184 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185 ret = func(block, opaque);
186 if (ret) {
187 break;
188 }
189 }
190 return ret;
191 }
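/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller of foreach_not_ignored_block() that sums the used length of all
 * blocks taking part in migration.  Names are made up for the example.
 */
#if 0   /* example only, not compiled */
static int example_add_block_size(RAMBlock *rb, void *opaque)
{
    uint64_t *total = opaque;

    *total += rb->used_length;
    return 0;               /* returning non-zero stops the iteration */
}

static uint64_t example_total_migratable_ram(void)
{
    uint64_t total = 0;

    foreach_not_ignored_block(example_add_block_size, &total);
    return total;
}
#endif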
192
193 static void ramblock_recv_map_init(void)
194 {
195 RAMBlock *rb;
196
197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198 assert(!rb->receivedmap);
199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 }
201 }
202
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206 rb->receivedmap);
207 }
208
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220 size_t nr)
221 {
222 bitmap_set_atomic(rb->receivedmap,
223 ramblock_recv_bitmap_offset(host_addr, rb),
224 nr);
225 }
226
227 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
228
229 /*
230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 *
232 * Returns >0 if success with sent bytes, or <0 if error.
233 */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235 const char *block_name)
236 {
237 RAMBlock *block = qemu_ram_block_by_name(block_name);
238 unsigned long *le_bitmap, nbits;
239 uint64_t size;
240
241 if (!block) {
242 error_report("%s: invalid block name: %s", __func__, block_name);
243 return -1;
244 }
245
246 nbits = block->used_length >> TARGET_PAGE_BITS;
247
248 /*
249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250 * machines we may need 4 more bytes for padding (see below
251 * comment). So extend it a bit beforehand.
252 */
253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254
255 /*
256 * Always use little endian when sending the bitmap. This is
257 * required so that it works even when source and destination VMs do
258 * not use the same endianness. (Note: big endian won't work.)
259 */
260 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261
262 /* Size of the bitmap, in bytes */
263 size = DIV_ROUND_UP(nbits, 8);
264
265 /*
266 * size is always aligned to 8 bytes for 64bit machines, but it
267 * may not be true for 32bit machines. We need this padding to
268 * make sure the migration can survive even between 32bit and
269 * 64bit machines.
270 */
271 size = ROUND_UP(size, 8);
272
273 qemu_put_be64(file, size);
274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 /*
276 * Mark as an end, in case the middle part is screwed up due to
277 * some "misterious" reason.
278 */
279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280 qemu_fflush(file);
281
282 g_free(le_bitmap);
283
284 if (qemu_file_get_error(file)) {
285 return qemu_file_get_error(file);
286 }
287
288 return size + sizeof(size);
289 }
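/*
 * Illustrative sketch (not part of the original file): what a reader of
 * the stream produced above would have to do.  The real consumer lives in
 * the postcopy-recovery path on the source side; this hypothetical helper
 * only documents the wire layout: be64 size, `size` bytes of little-endian
 * bitmap, then the RAMBLOCK_RECV_BITMAP_ENDING marker as be64.
 */
#if 0   /* example only, not compiled */
static int example_read_recv_bitmap(QEMUFile *file, uint8_t *buf,
                                    uint64_t expected_size)
{
    uint64_t size = qemu_get_be64(file);

    if (size != expected_size) {
        return -1;
    }
    qemu_get_buffer(file, buf, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -1;
    }
    return qemu_file_get_error(file);
}
#endif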
290
291 /*
292 * An outstanding page request, on the source, having been received
293 * and queued
294 */
295 struct RAMSrcPageRequest {
296 RAMBlock *rb;
297 hwaddr offset;
298 hwaddr len;
299
300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302
303 /* State of RAM for migration */
304 struct RAMState {
305 /* QEMUFile used for this migration */
306 QEMUFile *f;
307 /* Last block that we have visited searching for dirty pages */
308 RAMBlock *last_seen_block;
309 /* Last block from where we have sent data */
310 RAMBlock *last_sent_block;
311 /* Last dirty target page we have sent */
312 ram_addr_t last_page;
313 /* last ram version we have seen */
314 uint32_t last_version;
315 /* We are in the first round */
316 bool ram_bulk_stage;
317 /* The free page optimization is enabled */
318 bool fpo_enabled;
319 /* How many times we have dirtied too many pages */
320 int dirty_rate_high_cnt;
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
324 /* bytes transferred at start_time */
325 uint64_t bytes_xfer_prev;
326 /* number of dirty pages since start_time */
327 uint64_t num_dirty_pages_period;
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
330
331 /* compression statistics since the beginning of the period */
332 /* number of times there was no free thread to compress data */
333 uint64_t compress_thread_busy_prev;
334 /* amount of bytes after compression */
335 uint64_t compressed_size_prev;
336 /* number of compressed pages */
337 uint64_t compress_pages_prev;
338
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
343 /* number of dirty bits in the bitmap */
344 uint64_t migration_dirty_pages;
345 /* Protects modification of the bitmap and migration dirty pages */
346 QemuMutex bitmap_mutex;
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354
355 static RAMState *ram_state;
356
357 static NotifierWithReturnList precopy_notifier_list;
358
359 void precopy_infrastructure_init(void)
360 {
361 notifier_with_return_list_init(&precopy_notifier_list);
362 }
363
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366 notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371 notifier_with_return_remove(n);
372 }
373
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376 PrecopyNotifyData pnd;
377 pnd.reason = reason;
378 pnd.errp = errp;
379
380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
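/*
 * Illustrative sketch (not part of the original file): how a device model
 * might hook into the precopy notifier chain, in the style of the
 * free-page-hinting user.  All names below are hypothetical.
 */
#if 0   /* example only, not compiled */
static int example_precopy_notify(NotifierWithReturn *n, void *opaque)
{
    PrecopyNotifyData *pnd = opaque;

    if (pnd->reason == PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC) {
        /* e.g. stop reporting free pages while the bitmap is synced */
    }
    return 0;               /* non-zero would abort the notification */
}

static NotifierWithReturn example_notifier = {
    .notify = example_precopy_notify,
};

static void example_register_precopy_notifier(void)
{
    precopy_add_notifier(&example_notifier);
}
#endif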
382
383 void precopy_enable_free_page_optimization(void)
384 {
385 if (!ram_state) {
386 return;
387 }
388
389 ram_state->fpo_enabled = true;
390 }
391
392 uint64_t ram_bytes_remaining(void)
393 {
394 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395 0;
396 }
397
398 MigrationStats ram_counters;
399
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402 /* Current block being searched */
403 RAMBlock *block;
404 /* Current page to search from */
405 unsigned long page;
406 /* Set once we wrap around */
407 bool complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410
411 CompressionStats compression_counters;
412
413 struct CompressParam {
414 bool done;
415 bool quit;
416 bool zero_page;
417 QEMUFile *file;
418 QemuMutex mutex;
419 QemuCond cond;
420 RAMBlock *block;
421 ram_addr_t offset;
422
423 /* internally used fields */
424 z_stream stream;
425 uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428
429 struct DecompressParam {
430 bool done;
431 bool quit;
432 QemuMutex mutex;
433 QemuCond cond;
434 void *des;
435 uint8_t *compbuf;
436 int len;
437 z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444 * one of the compression threads has finished the compression.
445 * comp_done_lock is used together with comp_done_cond.
446 */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459 ram_addr_t offset, uint8_t *source_buf);
460
461 static void *do_data_compress(void *opaque)
462 {
463 CompressParam *param = opaque;
464 RAMBlock *block;
465 ram_addr_t offset;
466 bool zero_page;
467
468 qemu_mutex_lock(&param->mutex);
469 while (!param->quit) {
470 if (param->block) {
471 block = param->block;
472 offset = param->offset;
473 param->block = NULL;
474 qemu_mutex_unlock(&param->mutex);
475
476 zero_page = do_compress_ram_page(param->file, &param->stream,
477 block, offset, param->originbuf);
478
479 qemu_mutex_lock(&comp_done_lock);
480 param->done = true;
481 param->zero_page = zero_page;
482 qemu_cond_signal(&comp_done_cond);
483 qemu_mutex_unlock(&comp_done_lock);
484
485 qemu_mutex_lock(&param->mutex);
486 } else {
487 qemu_cond_wait(&param->cond, &param->mutex);
488 }
489 }
490 qemu_mutex_unlock(&param->mutex);
491
492 return NULL;
493 }
494
495 static void compress_threads_save_cleanup(void)
496 {
497 int i, thread_count;
498
499 if (!migrate_use_compression() || !comp_param) {
500 return;
501 }
502
503 thread_count = migrate_compress_threads();
504 for (i = 0; i < thread_count; i++) {
505 /*
506 * we use it as an indicator of whether the thread is
507 * properly initialized or not
508 */
509 if (!comp_param[i].file) {
510 break;
511 }
512
513 qemu_mutex_lock(&comp_param[i].mutex);
514 comp_param[i].quit = true;
515 qemu_cond_signal(&comp_param[i].cond);
516 qemu_mutex_unlock(&comp_param[i].mutex);
517
518 qemu_thread_join(compress_threads + i);
519 qemu_mutex_destroy(&comp_param[i].mutex);
520 qemu_cond_destroy(&comp_param[i].cond);
521 deflateEnd(&comp_param[i].stream);
522 g_free(comp_param[i].originbuf);
523 qemu_fclose(comp_param[i].file);
524 comp_param[i].file = NULL;
525 }
526 qemu_mutex_destroy(&comp_done_lock);
527 qemu_cond_destroy(&comp_done_cond);
528 g_free(compress_threads);
529 g_free(comp_param);
530 compress_threads = NULL;
531 comp_param = NULL;
532 }
533
534 static int compress_threads_save_setup(void)
535 {
536 int i, thread_count;
537
538 if (!migrate_use_compression()) {
539 return 0;
540 }
541 thread_count = migrate_compress_threads();
542 compress_threads = g_new0(QemuThread, thread_count);
543 comp_param = g_new0(CompressParam, thread_count);
544 qemu_cond_init(&comp_done_cond);
545 qemu_mutex_init(&comp_done_lock);
546 for (i = 0; i < thread_count; i++) {
547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548 if (!comp_param[i].originbuf) {
549 goto exit;
550 }
551
552 if (deflateInit(&comp_param[i].stream,
553 migrate_compress_level()) != Z_OK) {
554 g_free(comp_param[i].originbuf);
555 goto exit;
556 }
557
558 /* comp_param[i].file is just used as a dummy buffer to save data,
559 * set its ops to empty.
560 */
561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562 comp_param[i].done = true;
563 comp_param[i].quit = false;
564 qemu_mutex_init(&comp_param[i].mutex);
565 qemu_cond_init(&comp_param[i].cond);
566 qemu_thread_create(compress_threads + i, "compress",
567 do_data_compress, comp_param + i,
568 QEMU_THREAD_JOINABLE);
569 }
570 return 0;
571
572 exit:
573 compress_threads_save_cleanup();
574 return -1;
575 }
576
577 /**
578 * save_page_header: write page header to wire
579 *
580 * If this is the 1st block, it also writes the block identification
581 *
582 * Returns the number of bytes written
583 *
584 * @f: QEMUFile where to send the data
585 * @block: block that contains the page we want to send
586 * @offset: offset inside the block for the page
587 * in the lower bits, it contains flags
588 */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
590 ram_addr_t offset)
591 {
592 size_t size, len;
593
594 if (block == rs->last_sent_block) {
595 offset |= RAM_SAVE_FLAG_CONTINUE;
596 }
597 qemu_put_be64(f, offset);
598 size = 8;
599
600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601 len = strlen(block->idstr);
602 qemu_put_byte(f, len);
603 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604 size += 1 + len;
605 rs->last_sent_block = block;
606 }
607 return size;
608 }
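/*
 * Illustrative sketch (not part of the original file): the wire layout
 * written by save_page_header() and how a hypothetical reader would
 * consume it.  The 256-byte buffer mirrors the limit implied by the
 * one-byte length field.
 *
 *   be64  offset | flags            (flags live in the low bits)
 *   [ u8 len, u8 idstr[len] ]       only when RAM_SAVE_FLAG_CONTINUE is clear
 */
#if 0   /* example only, not compiled */
static void example_read_page_header(QEMUFile *f)
{
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK;

    if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
        char idstr[256];
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = 0;
    }
}
#endif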
609
610 /**
611 * mig_throttle_guest_down: throttle down the guest
612 *
613 * Reduce the amount of guest CPU execution to hopefully slow down memory
614 * writes. If guest dirty memory rate is reduced below the rate at
615 * which we can transfer pages to the destination then we should be
616 * able to complete migration. Some workloads dirty memory way too
617 * fast and will not effectively converge, even with auto-converge.
618 */
619 static void mig_throttle_guest_down(void)
620 {
621 MigrationState *s = migrate_get_current();
622 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
624 int pct_max = s->parameters.max_cpu_throttle;
625
626 /* We have not started throttling yet. Let's start it. */
627 if (!cpu_throttle_active()) {
628 cpu_throttle_set(pct_initial);
629 } else {
630 /* Throttling already on, just increase the rate */
631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
632 pct_max));
633 }
634 }
635
636 /**
637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638 *
639 * @rs: current RAM state
640 * @current_addr: address for the zero page
641 *
642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
643 * The important thing is that a stale (not-yet-0'd) page be replaced
644 * by the new data.
645 * As a bonus, if the page wasn't in the cache it gets added so that
646 * when a small write is made into the 0'd page it gets XBZRLE sent.
647 */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651 return;
652 }
653
654 /* We don't care if this fails to allocate a new cache page
655 * as long as it updated an old one */
656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657 ram_counters.dirty_sync_count);
658 }
659
660 #define ENCODING_FLAG_XBZRLE 0x1
661
662 /**
663 * save_xbzrle_page: compress and send current page
664 *
665 * Returns: 1 means that we wrote the page
666 * 0 means that page is identical to the one already sent
667 * -1 means that xbzrle would be longer than normal
668 *
669 * @rs: current RAM state
670 * @current_data: pointer to the address of the page contents
671 * @current_addr: addr of the page
672 * @block: block that contains the page we want to send
673 * @offset: offset inside the block for the page
674 * @last_stage: if we are at the completion stage
675 */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677 ram_addr_t current_addr, RAMBlock *block,
678 ram_addr_t offset, bool last_stage)
679 {
680 int encoded_len = 0, bytes_xbzrle;
681 uint8_t *prev_cached_page;
682
683 if (!cache_is_cached(XBZRLE.cache, current_addr,
684 ram_counters.dirty_sync_count)) {
685 xbzrle_counters.cache_miss++;
686 if (!last_stage) {
687 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688 ram_counters.dirty_sync_count) == -1) {
689 return -1;
690 } else {
691 /* update *current_data when the page has been
692 inserted into cache */
693 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694 }
695 }
696 return -1;
697 }
698
699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700
701 /* save current buffer into memory */
702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703
704 /* XBZRLE encoding (if there is no overflow) */
705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707 TARGET_PAGE_SIZE);
708
709 /*
710 * Update the cache contents, so that it corresponds to the data
711 * sent, in all cases except where we skip the page.
712 */
713 if (!last_stage && encoded_len != 0) {
714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715 /*
716 * In the case where we couldn't compress, ensure that the caller
717 * sends the data from the cache, since the guest might have
718 * changed the RAM since we copied it.
719 */
720 *current_data = prev_cached_page;
721 }
722
723 if (encoded_len == 0) {
724 trace_save_xbzrle_page_skipping();
725 return 0;
726 } else if (encoded_len == -1) {
727 trace_save_xbzrle_page_overflow();
728 xbzrle_counters.overflow++;
729 return -1;
730 }
731
732 /* Send XBZRLE based compressed page */
733 bytes_xbzrle = save_page_header(rs, rs->f, block,
734 offset | RAM_SAVE_FLAG_XBZRLE);
735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736 qemu_put_be16(rs->f, encoded_len);
737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738 bytes_xbzrle += encoded_len + 1 + 2;
739 xbzrle_counters.pages++;
740 xbzrle_counters.bytes += bytes_xbzrle;
741 ram_counters.transferred += bytes_xbzrle;
742
743 return 1;
744 }
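/*
 * Illustrative sketch (not part of the original file): the matching
 * decode step for the XBZRLE record written above (one flag byte, a be16
 * length, then the encoded delta).  The real load-side counterpart lives
 * further down in this file; this hypothetical helper only shows the shape.
 */
#if 0   /* example only, not compiled */
static int example_load_xbzrle(QEMUFile *f, void *host)
{
    unsigned int xh_len;
    int xh_flags = qemu_get_byte(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        return -1;
    }
    xh_len = qemu_get_be16(f);
    if (xh_len > TARGET_PAGE_SIZE) {
        return -1;
    }
    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);
    /* apply the delta on top of the previously received copy of the page */
    return xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host,
                                TARGET_PAGE_SIZE) == -1 ? -1 : 0;
}
#endif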
745
746 /**
747 * migration_bitmap_find_dirty: find the next dirty page from start
748 *
749 * Returns the page offset within memory region of the start of a dirty page
750 *
751 * @rs: current RAM state
752 * @rb: RAMBlock where to search for dirty pages
753 * @start: page where we start the search
754 */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757 unsigned long start)
758 {
759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760 unsigned long *bitmap = rb->bmap;
761 unsigned long next;
762
763 if (ramblock_is_ignored(rb)) {
764 return size;
765 }
766
767 /*
768 * When the free page optimization is enabled, we need to check the bitmap
769 * to send the non-free pages rather than all the pages in the bulk stage.
770 */
771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772 next = start + 1;
773 } else {
774 next = find_next_bit(bitmap, size, start);
775 }
776
777 return next;
778 }
779
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781 RAMBlock *rb,
782 unsigned long page)
783 {
784 bool ret;
785
786 qemu_mutex_lock(&rs->bitmap_mutex);
787
788 /*
789 * Clear dirty bitmap if needed. This _must_ be called before we
790 * send any of the page in the chunk because we need to make sure
791 * we can capture further page content changes when we sync dirty
792 * log the next time. So as long as we are going to send any of
793 * the page in the chunk we clear the remote dirty bitmap for all.
794 * Clearing it earlier won't be a problem, but too late will.
795 */
796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797 uint8_t shift = rb->clear_bmap_shift;
798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800
801 /*
802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803 * can make things easier sometimes since then start address
804 * of the small chunk will always be 64 pages aligned so the
805 * bitmap will always be aligned to unsigned long. We should
806 * even be able to remove this restriction but I'm simply
807 * keeping it.
808 */
809 assert(shift >= 6);
810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811 memory_region_clear_dirty_bitmap(rb->mr, start, size);
812 }
813
814 ret = test_and_clear_bit(page, rb->bmap);
815
816 if (ret) {
817 rs->migration_dirty_pages--;
818 }
819 qemu_mutex_unlock(&rs->bitmap_mutex);
820
821 return ret;
822 }
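/*
 * Worked example (not part of the original file) of the chunk arithmetic
 * above, using made-up values: a clear_bmap_shift of 18 with 4 KiB target
 * pages gives 1 GiB chunks, so one clear_bmap bit covers 2^18 target pages.
 */
#if 0   /* example only, not compiled */
static void example_clear_bmap_chunk_math(void)
{
    uint8_t shift = 18;                                /* hypothetical value */
    hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);  /* 1 GiB with 4K pages */
    hwaddr start = (((ram_addr_t)0x12345) << TARGET_PAGE_BITS) & (-size);

    /* the start of the chunk is always aligned to the chunk size */
    assert(start % size == 0);
}
#endif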
823
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827 rs->migration_dirty_pages +=
828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829 &rs->num_dirty_pages_period);
830 }
831
832 /**
833 * ram_pagesize_summary: calculate all the pagesizes of a VM
834 *
835 * Returns a summary bitmap of the page sizes of all RAMBlocks
836 *
837 * For VMs with just normal pages this is equivalent to the host page
838 * size. If it's got some huge pages then it's the OR of all the
839 * different page sizes.
840 */
841 uint64_t ram_pagesize_summary(void)
842 {
843 RAMBlock *block;
844 uint64_t summary = 0;
845
846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847 summary |= block->page_size;
848 }
849
850 return summary;
851 }
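/*
 * Illustrative sketch (not part of the original file): one hypothetical
 * way to use the summary bitmap, assuming the common case where normal
 * pages equal TARGET_PAGE_SIZE on the host.
 */
#if 0   /* example only, not compiled */
static bool example_vm_has_larger_pages(void)
{
    /* any bit above TARGET_PAGE_SIZE means some block uses bigger pages */
    return ram_pagesize_summary() > TARGET_PAGE_SIZE;
}
#endif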
852
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855 return ram_counters.normal + ram_counters.duplicate +
856 compression_counters.pages + xbzrle_counters.pages;
857 }
858
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862 double compressed_size;
863
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867
868 if (!page_count) {
869 return;
870 }
871
872 if (migrate_use_xbzrle()) {
873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874 rs->xbzrle_cache_miss_prev) / page_count;
875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876 }
877
878 if (migrate_use_compression()) {
879 compression_counters.busy_rate = (double)(compression_counters.busy -
880 rs->compress_thread_busy_prev) / page_count;
881 rs->compress_thread_busy_prev = compression_counters.busy;
882
883 compressed_size = compression_counters.compressed_size -
884 rs->compressed_size_prev;
885 if (compressed_size) {
886 double uncompressed_size = (compression_counters.pages -
887 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888
889 /* Compression-Ratio = Uncompressed-size / Compressed-size */
890 compression_counters.compression_rate =
891 uncompressed_size / compressed_size;
892
893 rs->compress_pages_prev = compression_counters.pages;
894 rs->compressed_size_prev = compression_counters.compressed_size;
895 }
896 }
897 }
898
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901 MigrationState *s = migrate_get_current();
902 uint64_t threshold = s->parameters.throttle_trigger_threshold;
903
904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907
908 /* During block migration the auto-converge logic incorrectly detects
909 * that ram migration makes no progress. Avoid this by disabling the
910 * throttling logic during the bulk phase of block migration. */
911 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912 /* The following detection logic can be refined later. For now:
913 Check to see if the ratio between dirtied bytes and the approx.
914 amount of bytes that just got transferred since the last time
915 we were in this routine reaches the threshold. If that happens
916 twice, start or increase throttling. */
917
918 if ((bytes_dirty_period > bytes_dirty_threshold) &&
919 (++rs->dirty_rate_high_cnt >= 2)) {
920 trace_migration_throttle();
921 rs->dirty_rate_high_cnt = 0;
922 mig_throttle_guest_down();
923 }
924 }
925 }
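/*
 * Worked example (not part of the original file) of the trigger condition
 * above, with made-up numbers: threshold 50%, 100 MiB sent and 60 MiB
 * dirtied in the same period means the guest is dirtying faster than we
 * can send, so two such periods in a row start (or increase) throttling.
 */
#if 0   /* example only, not compiled */
static void example_throttle_trigger_math(void)
{
    uint64_t threshold = 50;                         /* percent, hypothetical */
    uint64_t bytes_xfer_period = 100 * 1024 * 1024;  /* sent this period */
    uint64_t bytes_dirty_period = 60 * 1024 * 1024;  /* dirtied this period */
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    assert(bytes_dirty_period > bytes_dirty_threshold);
}
#endif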
926
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929 RAMBlock *block;
930 int64_t end_time;
931
932 ram_counters.dirty_sync_count++;
933
934 if (!rs->time_last_bitmap_sync) {
935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936 }
937
938 trace_migration_bitmap_sync_start();
939 memory_global_dirty_log_sync();
940
941 qemu_mutex_lock(&rs->bitmap_mutex);
942 WITH_RCU_READ_LOCK_GUARD() {
943 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944 ramblock_sync_dirty_bitmap(rs, block);
945 }
946 ram_counters.remaining = ram_bytes_remaining();
947 }
948 qemu_mutex_unlock(&rs->bitmap_mutex);
949
950 memory_global_after_dirty_log_sync();
951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952
953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954
955 /* more than 1 second = 1000 milliseconds */
956 if (end_time > rs->time_last_bitmap_sync + 1000) {
957 migration_trigger_throttle(rs);
958
959 migration_update_rates(rs, end_time);
960
961 rs->target_page_count_prev = rs->target_page_count;
962
963 /* reset period counters */
964 rs->time_last_bitmap_sync = end_time;
965 rs->num_dirty_pages_period = 0;
966 rs->bytes_xfer_prev = ram_counters.transferred;
967 }
968 if (migrate_use_events()) {
969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970 }
971 }
972
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975 Error *local_err = NULL;
976
977 /*
978 * The current notifier usage is just an optimization to migration, so we
979 * don't stop the normal migration process in the error case.
980 */
981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982 error_report_err(local_err);
983 }
984
985 migration_bitmap_sync(rs);
986
987 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
988 error_report_err(local_err);
989 }
990 }
991
992 /**
993 * save_zero_page_to_file: send the zero page to the file
994 *
995 * Returns the size of data written to the file, 0 means the page is not
996 * a zero page
997 *
998 * @rs: current RAM state
999 * @file: the file where the data is saved
1000 * @block: block that contains the page we want to send
1001 * @offset: offset inside the block for the page
1002 */
1003 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1004 RAMBlock *block, ram_addr_t offset)
1005 {
1006 uint8_t *p = block->host + offset;
1007 int len = 0;
1008
1009 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1010 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1011 qemu_put_byte(file, 0);
1012 len += 1;
1013 }
1014 return len;
1015 }
1016
1017 /**
1018 * save_zero_page: send the zero page to the stream
1019 *
1020 * Returns the number of pages written.
1021 *
1022 * @rs: current RAM state
1023 * @block: block that contains the page we want to send
1024 * @offset: offset inside the block for the page
1025 */
1026 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1027 {
1028 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1029
1030 if (len) {
1031 ram_counters.duplicate++;
1032 ram_counters.transferred += len;
1033 return 1;
1034 }
1035 return -1;
1036 }
1037
1038 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1039 {
1040 if (!migrate_release_ram() || !migration_in_postcopy()) {
1041 return;
1042 }
1043
1044 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1045 }
1046
1047 /*
1048 * @pages: the number of pages written by the control path,
1049 * < 0 - error
1050 * > 0 - number of pages written
1051 *
1052 * Return true if the page has been saved, otherwise false is returned.
1053 */
1054 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1055 int *pages)
1056 {
1057 uint64_t bytes_xmit = 0;
1058 int ret;
1059
1060 *pages = -1;
1061 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1062 &bytes_xmit);
1063 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1064 return false;
1065 }
1066
1067 if (bytes_xmit) {
1068 ram_counters.transferred += bytes_xmit;
1069 *pages = 1;
1070 }
1071
1072 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1073 return true;
1074 }
1075
1076 if (bytes_xmit > 0) {
1077 ram_counters.normal++;
1078 } else if (bytes_xmit == 0) {
1079 ram_counters.duplicate++;
1080 }
1081
1082 return true;
1083 }
1084
1085 /*
1086 * directly send the page to the stream
1087 *
1088 * Returns the number of pages written.
1089 *
1090 * @rs: current RAM state
1091 * @block: block that contains the page we want to send
1092 * @offset: offset inside the block for the page
1093 * @buf: the page to be sent
1094 * @async: send the page asynchronously
1095 */
1096 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1097 uint8_t *buf, bool async)
1098 {
1099 ram_counters.transferred += save_page_header(rs, rs->f, block,
1100 offset | RAM_SAVE_FLAG_PAGE);
1101 if (async) {
1102 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1103 migrate_release_ram() &&
1104 migration_in_postcopy());
1105 } else {
1106 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1107 }
1108 ram_counters.transferred += TARGET_PAGE_SIZE;
1109 ram_counters.normal++;
1110 return 1;
1111 }
1112
1113 /**
1114 * ram_save_page: send the given page to the stream
1115 *
1116 * Returns the number of pages written.
1117 * < 0 - error
1118 * >=0 - Number of pages written - this might legally be 0
1119 * if xbzrle noticed the page was the same.
1120 *
1121 * @rs: current RAM state
1122 * @block: block that contains the page we want to send
1123 * @offset: offset inside the block for the page
1124 * @last_stage: if we are at the completion stage
1125 */
1126 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1127 {
1128 int pages = -1;
1129 uint8_t *p;
1130 bool send_async = true;
1131 RAMBlock *block = pss->block;
1132 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1133 ram_addr_t current_addr = block->offset + offset;
1134
1135 p = block->host + offset;
1136 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1137
1138 XBZRLE_cache_lock();
1139 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1140 migrate_use_xbzrle()) {
1141 pages = save_xbzrle_page(rs, &p, current_addr, block,
1142 offset, last_stage);
1143 if (!last_stage) {
1144 /* Can't send this cached data async, since the cache page
1145 * might get updated before it gets to the wire
1146 */
1147 send_async = false;
1148 }
1149 }
1150
1151 /* XBZRLE overflow or normal page */
1152 if (pages == -1) {
1153 pages = save_normal_page(rs, block, offset, p, send_async);
1154 }
1155
1156 XBZRLE_cache_unlock();
1157
1158 return pages;
1159 }
1160
1161 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1162 ram_addr_t offset)
1163 {
1164 if (multifd_queue_page(rs->f, block, offset) < 0) {
1165 return -1;
1166 }
1167 ram_counters.normal++;
1168
1169 return 1;
1170 }
1171
1172 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1173 ram_addr_t offset, uint8_t *source_buf)
1174 {
1175 RAMState *rs = ram_state;
1176 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1177 bool zero_page = false;
1178 int ret;
1179
1180 if (save_zero_page_to_file(rs, f, block, offset)) {
1181 zero_page = true;
1182 goto exit;
1183 }
1184
1185 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1186
1187 /*
1188 * copy it to an internal buffer to avoid it being modified by the VM
1189 * so that we can catch any error during compression and
1190 * decompression
1191 */
1192 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1193 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1194 if (ret < 0) {
1195 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1196 error_report("compressed data failed!");
1197 return false;
1198 }
1199
1200 exit:
1201 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1202 return zero_page;
1203 }
1204
1205 static void
1206 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1207 {
1208 ram_counters.transferred += bytes_xmit;
1209
1210 if (param->zero_page) {
1211 ram_counters.duplicate++;
1212 return;
1213 }
1214
1215 /* 8 is the size of the header when RAM_SAVE_FLAG_CONTINUE is set. */
1216 compression_counters.compressed_size += bytes_xmit - 8;
1217 compression_counters.pages++;
1218 }
1219
1220 static bool save_page_use_compression(RAMState *rs);
1221
1222 static void flush_compressed_data(RAMState *rs)
1223 {
1224 int idx, len, thread_count;
1225
1226 if (!save_page_use_compression(rs)) {
1227 return;
1228 }
1229 thread_count = migrate_compress_threads();
1230
1231 qemu_mutex_lock(&comp_done_lock);
1232 for (idx = 0; idx < thread_count; idx++) {
1233 while (!comp_param[idx].done) {
1234 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1235 }
1236 }
1237 qemu_mutex_unlock(&comp_done_lock);
1238
1239 for (idx = 0; idx < thread_count; idx++) {
1240 qemu_mutex_lock(&comp_param[idx].mutex);
1241 if (!comp_param[idx].quit) {
1242 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1243 /*
1244 * it's safe to fetch zero_page without holding comp_done_lock
1245 * as there is no further request submitted to the thread,
1246 * i.e., the thread should be waiting for a request at this point.
1247 */
1248 update_compress_thread_counts(&comp_param[idx], len);
1249 }
1250 qemu_mutex_unlock(&comp_param[idx].mutex);
1251 }
1252 }
1253
1254 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1255 ram_addr_t offset)
1256 {
1257 param->block = block;
1258 param->offset = offset;
1259 }
1260
1261 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1262 ram_addr_t offset)
1263 {
1264 int idx, thread_count, bytes_xmit = -1, pages = -1;
1265 bool wait = migrate_compress_wait_thread();
1266
1267 thread_count = migrate_compress_threads();
1268 qemu_mutex_lock(&comp_done_lock);
1269 retry:
1270 for (idx = 0; idx < thread_count; idx++) {
1271 if (comp_param[idx].done) {
1272 comp_param[idx].done = false;
1273 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1274 qemu_mutex_lock(&comp_param[idx].mutex);
1275 set_compress_params(&comp_param[idx], block, offset);
1276 qemu_cond_signal(&comp_param[idx].cond);
1277 qemu_mutex_unlock(&comp_param[idx].mutex);
1278 pages = 1;
1279 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1280 break;
1281 }
1282 }
1283
1284 /*
1285 * wait for the free thread if the user specifies 'compress-wait-thread',
1286 * otherwise we will post the page out in the main thread as a normal page.
1287 */
1288 if (pages < 0 && wait) {
1289 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1290 goto retry;
1291 }
1292 qemu_mutex_unlock(&comp_done_lock);
1293
1294 return pages;
1295 }
1296
1297 /**
1298 * find_dirty_block: find the next dirty page and update any state
1299 * associated with the search process.
1300 *
1301 * Returns true if a page is found
1302 *
1303 * @rs: current RAM state
1304 * @pss: data about the state of the current dirty page scan
1305 * @again: set to false if the search has scanned the whole of RAM
1306 */
1307 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1308 {
1309 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1310 if (pss->complete_round && pss->block == rs->last_seen_block &&
1311 pss->page >= rs->last_page) {
1312 /*
1313 * We've been once around the RAM and haven't found anything.
1314 * Give up.
1315 */
1316 *again = false;
1317 return false;
1318 }
1319 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1320 >= pss->block->used_length) {
1321 /* Didn't find anything in this RAM Block */
1322 pss->page = 0;
1323 pss->block = QLIST_NEXT_RCU(pss->block, next);
1324 if (!pss->block) {
1325 /*
1326 * If memory migration starts over, we will meet a dirtied page
1327 * which may still exist in a compression thread's ring, so we
1328 * should flush the compressed data to make sure the new page
1329 * is not overwritten by the old one in the destination.
1330 *
1331 * Also, if xbzrle is on, stop using the data compression at this
1332 * point. In theory, xbzrle can do better than compression.
1333 */
1334 flush_compressed_data(rs);
1335
1336 /* Hit the end of the list */
1337 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1338 /* Flag that we've looped */
1339 pss->complete_round = true;
1340 rs->ram_bulk_stage = false;
1341 }
1342 /* Didn't find anything this time, but try again on the new block */
1343 *again = true;
1344 return false;
1345 } else {
1346 /* Can go around again, but... */
1347 *again = true;
1348 /* We've found something so probably don't need to */
1349 return true;
1350 }
1351 }
1352
1353 /**
1354 * unqueue_page: gets a page of the queue
1355 *
1356 * Helper for 'get_queued_page' - gets a page off the queue
1357 *
1358 * Returns the block of the page (or NULL if none available)
1359 *
1360 * @rs: current RAM state
1361 * @offset: used to return the offset within the RAMBlock
1362 */
1363 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1364 {
1365 RAMBlock *block = NULL;
1366
1367 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1368 return NULL;
1369 }
1370
1371 qemu_mutex_lock(&rs->src_page_req_mutex);
1372 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1373 struct RAMSrcPageRequest *entry =
1374 QSIMPLEQ_FIRST(&rs->src_page_requests);
1375 block = entry->rb;
1376 *offset = entry->offset;
1377
1378 if (entry->len > TARGET_PAGE_SIZE) {
1379 entry->len -= TARGET_PAGE_SIZE;
1380 entry->offset += TARGET_PAGE_SIZE;
1381 } else {
1382 memory_region_unref(block->mr);
1383 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1384 g_free(entry);
1385 migration_consume_urgent_request();
1386 }
1387 }
1388 qemu_mutex_unlock(&rs->src_page_req_mutex);
1389
1390 return block;
1391 }
1392
1393 /**
1394 * get_queued_page: unqueue a page from the postcopy requests
1395 *
1396 * Skips pages that are already sent (!dirty)
1397 *
1398 * Returns true if a queued page is found
1399 *
1400 * @rs: current RAM state
1401 * @pss: data about the state of the current dirty page scan
1402 */
1403 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1404 {
1405 RAMBlock *block;
1406 ram_addr_t offset;
1407 bool dirty;
1408
1409 do {
1410 block = unqueue_page(rs, &offset);
1411 /*
1412 * We're sending this page, and since it's postcopy nothing else
1413 * will dirty it, and we must make sure it doesn't get sent again
1414 * even if this queue request was received after the background
1415 * search already sent it.
1416 */
1417 if (block) {
1418 unsigned long page;
1419
1420 page = offset >> TARGET_PAGE_BITS;
1421 dirty = test_bit(page, block->bmap);
1422 if (!dirty) {
1423 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1424 page);
1425 } else {
1426 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1427 }
1428 }
1429
1430 } while (block && !dirty);
1431
1432 if (block) {
1433 /*
1434 * As soon as we start servicing pages out of order, we have
1435 * to kill the bulk stage, since the bulk stage assumes
1436 * in (migration_bitmap_find_and_reset_dirty) that every page is
1437 * dirty; that's no longer true.
1438 */
1439 rs->ram_bulk_stage = false;
1440
1441 /*
1442 * We want the background search to continue from the queued page
1443 * since the guest is likely to want other pages near to the page
1444 * it just requested.
1445 */
1446 pss->block = block;
1447 pss->page = offset >> TARGET_PAGE_BITS;
1448
1449 /*
1450 * This unqueued page would break the "one round" check, even if
1451 * it is really rare.
1452 */
1453 pss->complete_round = false;
1454 }
1455
1456 return !!block;
1457 }
1458
1459 /**
1460 * migration_page_queue_free: drop any remaining pages in the ram
1461 * request queue
1462 *
1463 * It should be empty at the end anyway, but in error cases there may
1464 * be some left. In case there are any pages left, we drop them.
1465 *
1466 */
1467 static void migration_page_queue_free(RAMState *rs)
1468 {
1469 struct RAMSrcPageRequest *mspr, *next_mspr;
1470 /* This queue generally should be empty - but in the case of a failed
1471 * migration it might have some leftover entries.
1472 */
1473 RCU_READ_LOCK_GUARD();
1474 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1475 memory_region_unref(mspr->rb->mr);
1476 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1477 g_free(mspr);
1478 }
1479 }
1480
1481 /**
1482 * ram_save_queue_pages: queue the page for transmission
1483 *
1484 * A request from postcopy destination for example.
1485 *
1486 * Returns zero on success or negative on error
1487 *
1488 * @rbname: Name of the RAMBlock of the request. NULL means the
1489 * same as the last one.
1490 * @start: starting address from the start of the RAMBlock
1491 * @len: length (in bytes) to send
1492 */
1493 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1494 {
1495 RAMBlock *ramblock;
1496 RAMState *rs = ram_state;
1497
1498 ram_counters.postcopy_requests++;
1499 RCU_READ_LOCK_GUARD();
1500
1501 if (!rbname) {
1502 /* Reuse last RAMBlock */
1503 ramblock = rs->last_req_rb;
1504
1505 if (!ramblock) {
1506 /*
1507 * Shouldn't happen, we can't reuse the last RAMBlock if
1508 * it's the 1st request.
1509 */
1510 error_report("ram_save_queue_pages no previous block");
1511 return -1;
1512 }
1513 } else {
1514 ramblock = qemu_ram_block_by_name(rbname);
1515
1516 if (!ramblock) {
1517 /* We shouldn't be asked for a non-existent RAMBlock */
1518 error_report("ram_save_queue_pages no block '%s'", rbname);
1519 return -1;
1520 }
1521 rs->last_req_rb = ramblock;
1522 }
1523 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1524 if (start + len > ramblock->used_length) {
1525 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1526 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1527 __func__, start, len, ramblock->used_length);
1528 return -1;
1529 }
1530
1531 struct RAMSrcPageRequest *new_entry =
1532 g_malloc0(sizeof(struct RAMSrcPageRequest));
1533 new_entry->rb = ramblock;
1534 new_entry->offset = start;
1535 new_entry->len = len;
1536
1537 memory_region_ref(ramblock->mr);
1538 qemu_mutex_lock(&rs->src_page_req_mutex);
1539 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1540 migration_make_urgent_request();
1541 qemu_mutex_unlock(&rs->src_page_req_mutex);
1542
1543 return 0;
1544 }
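/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller queueing one urgent page, the way the postcopy fault path does.
 * The block name and offset are made-up example values.
 */
#if 0   /* example only, not compiled */
static void example_request_urgent_page(void)
{
    if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
        error_report("failed to queue postcopy page request");
    }
}
#endif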
1545
1546 static bool save_page_use_compression(RAMState *rs)
1547 {
1548 if (!migrate_use_compression()) {
1549 return false;
1550 }
1551
1552 /*
1553 * If xbzrle is on, stop using the data compression after the first
1554 * round of migration even if compression is enabled. In theory,
1555 * xbzrle can do better than compression.
1556 */
1557 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1558 return true;
1559 }
1560
1561 return false;
1562 }
1563
1564 /*
1565 * try to compress the page before posting it out, return true if the page
1566 * has been properly handled by compression, otherwise it needs other
1567 * paths to handle it
1568 */
1569 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1570 {
1571 if (!save_page_use_compression(rs)) {
1572 return false;
1573 }
1574
1575 /*
1576 * When starting the process of a new block, the first page of
1577 * the block should be sent out before other pages in the same
1578 * block, and all the pages in the last block should have been sent
1579 * out. Keeping this order is important, because the 'cont' flag
1580 * is used to avoid resending the block name.
1581 *
1582 * We post the first page as a normal page as compression will take
1583 * much CPU resource.
1584 */
1585 if (block != rs->last_sent_block) {
1586 flush_compressed_data(rs);
1587 return false;
1588 }
1589
1590 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1591 return true;
1592 }
1593
1594 compression_counters.busy++;
1595 return false;
1596 }
1597
1598 /**
1599 * ram_save_target_page: save one target page
1600 *
1601 * Returns the number of pages written
1602 *
1603 * @rs: current RAM state
1604 * @pss: data about the page we want to send
1605 * @last_stage: if we are at the completion stage
1606 */
1607 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1608 bool last_stage)
1609 {
1610 RAMBlock *block = pss->block;
1611 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1612 int res;
1613
1614 if (control_save_page(rs, block, offset, &res)) {
1615 return res;
1616 }
1617
1618 if (save_compress_page(rs, block, offset)) {
1619 return 1;
1620 }
1621
1622 res = save_zero_page(rs, block, offset);
1623 if (res > 0) {
1624 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1625 * page would be stale
1626 */
1627 if (!save_page_use_compression(rs)) {
1628 XBZRLE_cache_lock();
1629 xbzrle_cache_zero_page(rs, block->offset + offset);
1630 XBZRLE_cache_unlock();
1631 }
1632 ram_release_pages(block->idstr, offset, res);
1633 return res;
1634 }
1635
1636 /*
1637 * Do not use multifd for:
1638 * 1. Compression as the first page in the new block should be posted out
1639 * before sending the compressed page
1640 * 2. In postcopy as one whole host page should be placed
1641 */
1642 if (!save_page_use_compression(rs) && migrate_use_multifd()
1643 && !migration_in_postcopy()) {
1644 return ram_save_multifd_page(rs, block, offset);
1645 }
1646
1647 return ram_save_page(rs, pss, last_stage);
1648 }
1649
1650 /**
1651 * ram_save_host_page: save a whole host page
1652 *
1653 * Starting at *offset send pages up to the end of the current host
1654 * page. It's valid for the initial offset to point into the middle of
1655 * a host page in which case the remainder of the hostpage is sent.
1656 * Only dirty target pages are sent. Note that the host page size may
1657 * be a huge page for this block.
1658 * The saving stops at the boundary of the used_length of the block
1659 * if the RAMBlock isn't a multiple of the host page size.
1660 *
1661 * Returns the number of pages written or negative on error
1662 *
1663 * @rs: current RAM state
1664 * @ms: current migration state
1665 * @pss: data about the page we want to send
1666 * @last_stage: if we are at the completion stage
1667 */
1668 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1669 bool last_stage)
1670 {
1671 int tmppages, pages = 0;
1672 size_t pagesize_bits =
1673 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1674
1675 if (ramblock_is_ignored(pss->block)) {
1676 error_report("block %s should not be migrated !", pss->block->idstr);
1677 return 0;
1678 }
1679
1680 do {
1681 /* Check if the page is dirty and if so, send it */
1682 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1683 pss->page++;
1684 continue;
1685 }
1686
1687 tmppages = ram_save_target_page(rs, pss, last_stage);
1688 if (tmppages < 0) {
1689 return tmppages;
1690 }
1691
1692 pages += tmppages;
1693 pss->page++;
1694 /* Allow rate limiting to happen in the middle of huge pages */
1695 migration_rate_limit();
1696 } while ((pss->page & (pagesize_bits - 1)) &&
1697 offset_in_ramblock(pss->block,
1698 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1699
1700 /* The offset we leave with is the last one we looked at */
1701 pss->page--;
1702 return pages;
1703 }
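/*
 * Worked example (not part of the original file) of the host-page loop
 * above, with made-up sizes: a 2 MiB hugepage-backed block and 4 KiB
 * target pages give pagesize_bits = 512, so up to 512 consecutive target
 * pages are considered before control returns to the caller.
 */
#if 0   /* example only, not compiled */
static void example_hostpage_math(void)
{
    size_t block_page_size = 2 * 1024 * 1024;   /* hypothetical 2 MiB hugepage */
    size_t pagesize_bits = block_page_size >> TARGET_PAGE_BITS;

    assert(pagesize_bits == 512);               /* assumes 4 KiB target pages */
}
#endif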
1704
1705 /**
1706 * ram_find_and_save_block: finds a dirty page and sends it to f
1707 *
1708 * Called within an RCU critical section.
1709 *
1710 * Returns the number of pages written where zero means no dirty pages,
1711 * or negative on error
1712 *
1713 * @rs: current RAM state
1714 * @last_stage: if we are at the completion stage
1715 *
1716 * On systems where host-page-size > target-page-size it will send all the
1717 * pages in a host page that are dirty.
1718 */
1719
1720 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1721 {
1722 PageSearchStatus pss;
1723 int pages = 0;
1724 bool again, found;
1725
1726 /* No dirty page as there is zero RAM */
1727 if (!ram_bytes_total()) {
1728 return pages;
1729 }
1730
1731 pss.block = rs->last_seen_block;
1732 pss.page = rs->last_page;
1733 pss.complete_round = false;
1734
1735 if (!pss.block) {
1736 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1737 }
1738
1739 do {
1740 again = true;
1741 found = get_queued_page(rs, &pss);
1742
1743 if (!found) {
1744 /* priority queue empty, so just search for something dirty */
1745 found = find_dirty_block(rs, &pss, &again);
1746 }
1747
1748 if (found) {
1749 pages = ram_save_host_page(rs, &pss, last_stage);
1750 }
1751 } while (!pages && again);
1752
1753 rs->last_seen_block = pss.block;
1754 rs->last_page = pss.page;
1755
1756 return pages;
1757 }
1758
1759 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1760 {
1761 uint64_t pages = size / TARGET_PAGE_SIZE;
1762
1763 if (zero) {
1764 ram_counters.duplicate += pages;
1765 } else {
1766 ram_counters.normal += pages;
1767 ram_counters.transferred += size;
1768 qemu_update_position(f, size);
1769 }
1770 }
1771
1772 static uint64_t ram_bytes_total_common(bool count_ignored)
1773 {
1774 RAMBlock *block;
1775 uint64_t total = 0;
1776
1777 RCU_READ_LOCK_GUARD();
1778
1779 if (count_ignored) {
1780 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1781 total += block->used_length;
1782 }
1783 } else {
1784 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785 total += block->used_length;
1786 }
1787 }
1788 return total;
1789 }
1790
1791 uint64_t ram_bytes_total(void)
1792 {
1793 return ram_bytes_total_common(false);
1794 }
1795
1796 static void xbzrle_load_setup(void)
1797 {
1798 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1799 }
1800
1801 static void xbzrle_load_cleanup(void)
1802 {
1803 g_free(XBZRLE.decoded_buf);
1804 XBZRLE.decoded_buf = NULL;
1805 }
1806
1807 static void ram_state_cleanup(RAMState **rsp)
1808 {
1809 if (*rsp) {
1810 migration_page_queue_free(*rsp);
1811 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1812 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1813 g_free(*rsp);
1814 *rsp = NULL;
1815 }
1816 }
1817
1818 static void xbzrle_cleanup(void)
1819 {
1820 XBZRLE_cache_lock();
1821 if (XBZRLE.cache) {
1822 cache_fini(XBZRLE.cache);
1823 g_free(XBZRLE.encoded_buf);
1824 g_free(XBZRLE.current_buf);
1825 g_free(XBZRLE.zero_target_page);
1826 XBZRLE.cache = NULL;
1827 XBZRLE.encoded_buf = NULL;
1828 XBZRLE.current_buf = NULL;
1829 XBZRLE.zero_target_page = NULL;
1830 }
1831 XBZRLE_cache_unlock();
1832 }
1833
1834 static void ram_save_cleanup(void *opaque)
1835 {
1836 RAMState **rsp = opaque;
1837 RAMBlock *block;
1838
1839 /* The caller must hold the iothread lock or be in a bh, so there is
1840 * no writing race against the migration bitmap
1841 */
1842 memory_global_dirty_log_stop();
1843
1844 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1845 g_free(block->clear_bmap);
1846 block->clear_bmap = NULL;
1847 g_free(block->bmap);
1848 block->bmap = NULL;
1849 }
1850
1851 xbzrle_cleanup();
1852 compress_threads_save_cleanup();
1853 ram_state_cleanup(rsp);
1854 }
1855
1856 static void ram_state_reset(RAMState *rs)
1857 {
1858 rs->last_seen_block = NULL;
1859 rs->last_sent_block = NULL;
1860 rs->last_page = 0;
1861 rs->last_version = ram_list.version;
1862 rs->ram_bulk_stage = true;
1863 rs->fpo_enabled = false;
1864 }
1865
1866 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1867
1868 /*
1869 * 'expected' is the value you expect the bitmap mostly to be full
1870 * of; it won't bother printing lines that are all this value.
1871 * If 'todump' is null the migration bitmap is dumped.
1872 */
1873 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1874 unsigned long pages)
1875 {
1876 int64_t cur;
1877 int64_t linelen = 128;
1878 char linebuf[129];
1879
1880 for (cur = 0; cur < pages; cur += linelen) {
1881 int64_t curb;
1882 bool found = false;
1883 /*
1884 * Last line; catch the case where the line length
1885 * is longer than remaining ram
1886 */
1887 if (cur + linelen > pages) {
1888 linelen = pages - cur;
1889 }
1890 for (curb = 0; curb < linelen; curb++) {
1891 bool thisbit = test_bit(cur + curb, todump);
1892 linebuf[curb] = thisbit ? '1' : '.';
1893 found = found || (thisbit != expected);
1894 }
1895 if (found) {
1896 linebuf[curb] = '\0';
1897 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1898 }
1899 }
1900 }
1901
1902 /* **** functions for postcopy ***** */
1903
1904 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1905 {
1906 struct RAMBlock *block;
1907
1908 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1909 unsigned long *bitmap = block->bmap;
1910 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1911 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1912
1913 while (run_start < range) {
1914 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1915 ram_discard_range(block->idstr,
1916 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1917 ((ram_addr_t)(run_end - run_start))
1918 << TARGET_PAGE_BITS);
1919 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1920 }
1921 }
1922 }
1923
1924 /**
1925 * postcopy_send_discard_bm_ram: discard a RAMBlock
1926 *
1927 * Returns zero on success
1928 *
1929 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1930 *
1931 * @ms: current migration state
1932 * @block: RAMBlock to discard
1933 */
1934 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1935 {
1936 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1937 unsigned long current;
1938 unsigned long *bitmap = block->bmap;
1939
1940 for (current = 0; current < end; ) {
1941 unsigned long one = find_next_bit(bitmap, end, current);
1942 unsigned long zero, discard_length;
1943
1944 if (one >= end) {
1945 break;
1946 }
1947
1948 zero = find_next_zero_bit(bitmap, end, one + 1);
1949
1950 if (zero >= end) {
1951 discard_length = end - one;
1952 } else {
1953 discard_length = zero - one;
1954 }
1955 postcopy_discard_send_range(ms, one, discard_length);
1956 current = one + discard_length;
1957 }
1958
1959 return 0;
1960 }
1961
1962 /**
1963 * postcopy_each_ram_send_discard: discard all RAMBlocks
1964 *
1965 * Returns 0 for success or negative for error
1966 *
1967 * Utility for the outgoing postcopy code.
1968 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1969 * passing it bitmap indexes and name.
1970 * (qemu_ram_foreach_block ends up passing unscaled lengths
1971 * which would mean postcopy code would have to deal with target page)
1972 *
1973 * @ms: current migration state
1974 */
1975 static int postcopy_each_ram_send_discard(MigrationState *ms)
1976 {
1977 struct RAMBlock *block;
1978 int ret;
1979
1980 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981 postcopy_discard_send_init(ms, block->idstr);
1982
1983 /*
1984 * Postcopy sends chunks of bitmap over the wire, but it
1985 * just needs indexes at this point, avoids it having
1986 * target page specific code.
1987 */
1988 ret = postcopy_send_discard_bm_ram(ms, block);
1989 postcopy_discard_send_finish(ms);
1990 if (ret) {
1991 return ret;
1992 }
1993 }
1994
1995 return 0;
1996 }
1997
1998 /**
1999 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2000 *
2001 * Helper for postcopy_chunk_hostpages; it canonicalizes the block's
2002 * dirty bitmap.
2003 *
2004 * Postcopy requires that all target pages in a hostpage are dirty or
2005 * clean, not a mix. This function canonicalizes the bitmap by marking
2006 * any partially dirty host page as fully dirty.
2007 *
2008 * @ms: current migration state
2009 * @block: block that contains the page we want to canonicalize
2010 */
2011 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2012 {
2013 RAMState *rs = ram_state;
2014 unsigned long *bitmap = block->bmap;
2015 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2016 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2017 unsigned long run_start;
2018
2019 if (block->page_size == TARGET_PAGE_SIZE) {
2020 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2021 return;
2022 }
2023
2024 /* Find a dirty page */
2025 run_start = find_next_bit(bitmap, pages, 0);
2026
2027 while (run_start < pages) {
2028
2029 /*
2030 * If the start of this run of pages is in the middle of a host
2031 * page, then we need to fixup this host page.
2032 */
2033 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2034 /* Find the end of this run */
2035 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2036 /*
2037 * If the end isn't at the start of a host page, then the
2038 * run doesn't finish at the end of a host page
2039 * and we need to discard.
2040 */
2041 }
2042
2043 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2044 unsigned long page;
2045 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2046 host_ratio);
2047 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2048
2049 /* Clean up the bitmap */
2050 for (page = fixup_start_addr;
2051 page < fixup_start_addr + host_ratio; page++) {
2052 /*
2053 * Remark them as dirty, updating the count for any pages
2054 * that weren't previously dirty.
2055 */
2056 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2057 }
2058 }
2059
2060 /* Find the next dirty page for the next iteration */
2061 run_start = find_next_bit(bitmap, pages, run_start);
2062 }
2063 }
2064
2065 /**
2066 * postcopy_chunk_hostpages: discard any partially sent host page
2067 *
2068 * Utility for the outgoing postcopy code.
2069 *
2070 * Discard any partially sent host-page size chunks, mark any partially
2071 * dirty host-page size chunks as all dirty. In this case the host-page
2072 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2073 *
2074 * Returns zero on success
2075 *
2076 * @ms: current migration state
2077 * @block: block we want to work with
2078 */
2079 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2080 {
2081 postcopy_discard_send_init(ms, block->idstr);
2082
2083 /*
2084 * Ensure that all partially dirty host pages are made fully dirty.
2085 */
2086 postcopy_chunk_hostpages_pass(ms, block);
2087
2088 postcopy_discard_send_finish(ms);
2089 return 0;
2090 }
2091
2092 /**
2093 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2094 *
2095 * Returns zero on success
2096 *
2097 * Transmit the set of pages to be discarded after precopy to the target;
2098 * these are pages that:
2099 * a) Have been previously transmitted but are now dirty again
2100 * b) Have never been transmitted; this ensures that
2101 * any pages on the destination that have been mapped by background
2102 * tasks get discarded (transparent huge pages are the specific concern)
2103 * Hopefully this is pretty sparse
2104 *
2105 * @ms: current migration state
2106 */
2107 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2108 {
2109 RAMState *rs = ram_state;
2110 RAMBlock *block;
2111 int ret;
2112
2113 RCU_READ_LOCK_GUARD();
2114
2115 /* This should be our last sync, the src is now paused */
2116 migration_bitmap_sync(rs);
2117
2118 /* Easiest way to make sure we don't resume in the middle of a host-page */
2119 rs->last_seen_block = NULL;
2120 rs->last_sent_block = NULL;
2121 rs->last_page = 0;
2122
2123 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2124 /* Deal with TPS != HPS and huge pages */
2125 ret = postcopy_chunk_hostpages(ms, block);
2126 if (ret) {
2127 return ret;
2128 }
2129
2130 #ifdef DEBUG_POSTCOPY
2131 ram_debug_dump_bitmap(block->bmap, true,
2132 block->used_length >> TARGET_PAGE_BITS);
2133 #endif
2134 }
2135 trace_ram_postcopy_send_discard_bitmap();
2136
2137 ret = postcopy_each_ram_send_discard(ms);
2138
2139 return ret;
2140 }
2141
2142 /**
2143 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2144 *
2145 * Returns zero on success
2146 *
2147 * @rbname: name of the RAMBlock of the request. NULL means the
2148 * same as the last one.
2149 * @start: starting offset within the RAMBlock, in bytes
2150 * @length: length of the range to discard, in bytes
2151 */
2152 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2153 {
2154 trace_ram_discard_range(rbname, start, length);
2155
2156 RCU_READ_LOCK_GUARD();
2157 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2158
2159 if (!rb) {
2160 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2161 return -1;
2162 }
2163
2164 /*
2165 * On source VM, we don't need to update the received bitmap since
2166 * we don't even have one.
2167 */
2168 if (rb->receivedmap) {
2169 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2170 length >> qemu_target_page_bits());
2171 }
2172
2173 return ram_block_discard_range(rb, start, length);
2174 }
2175
2176 /*
2177 * For every allocation, we try not to crash the VM if the
2178 * allocation fails.
2179 */
2180 static int xbzrle_init(void)
2181 {
2182 Error *local_err = NULL;
2183
2184 if (!migrate_use_xbzrle()) {
2185 return 0;
2186 }
2187
2188 XBZRLE_cache_lock();
2189
2190 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2191 if (!XBZRLE.zero_target_page) {
2192 error_report("%s: Error allocating zero page", __func__);
2193 goto err_out;
2194 }
2195
2196 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2197 TARGET_PAGE_SIZE, &local_err);
2198 if (!XBZRLE.cache) {
2199 error_report_err(local_err);
2200 goto free_zero_page;
2201 }
2202
2203 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2204 if (!XBZRLE.encoded_buf) {
2205 error_report("%s: Error allocating encoded_buf", __func__);
2206 goto free_cache;
2207 }
2208
2209 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2210 if (!XBZRLE.current_buf) {
2211 error_report("%s: Error allocating current_buf", __func__);
2212 goto free_encoded_buf;
2213 }
2214
2215 /* We are all good */
2216 XBZRLE_cache_unlock();
2217 return 0;
2218
2219 free_encoded_buf:
2220 g_free(XBZRLE.encoded_buf);
2221 XBZRLE.encoded_buf = NULL;
2222 free_cache:
2223 cache_fini(XBZRLE.cache);
2224 XBZRLE.cache = NULL;
2225 free_zero_page:
2226 g_free(XBZRLE.zero_target_page);
2227 XBZRLE.zero_target_page = NULL;
2228 err_out:
2229 XBZRLE_cache_unlock();
2230 return -ENOMEM;
2231 }
2232
2233 static int ram_state_init(RAMState **rsp)
2234 {
2235 *rsp = g_try_new0(RAMState, 1);
2236
2237 if (!*rsp) {
2238 error_report("%s: Init ramstate fail", __func__);
2239 return -1;
2240 }
2241
2242 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2243 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2244 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2245
2246 /*
2247 * Count the total number of pages used by ram blocks not including any
2248 * gaps due to alignment or unplugs.
2249 * This must match the initial value of the dirty bitmap.
2250 */
2251 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2252 ram_state_reset(*rsp);
2253
2254 return 0;
2255 }
2256
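/*
 * Allocate and fill the per-RAMBlock dirty bitmaps (bmap and clear_bmap)
 * for every block that takes part in migration. Each bmap starts with
 * all bits set so that every guest RAM page is migrated at least once.
 */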
2257 static void ram_list_init_bitmaps(void)
2258 {
2259 MigrationState *ms = migrate_get_current();
2260 RAMBlock *block;
2261 unsigned long pages;
2262 uint8_t shift;
2263
2264 /* Skip setting bitmap if there is no RAM */
2265 if (ram_bytes_total()) {
2266 shift = ms->clear_bitmap_shift;
2267 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2268 error_report("clear_bitmap_shift (%u) too big, using "
2269 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2270 shift = CLEAR_BITMAP_SHIFT_MAX;
2271 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2272 error_report("clear_bitmap_shift (%u) too small, using "
2273 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2274 shift = CLEAR_BITMAP_SHIFT_MIN;
2275 }
2276
2277 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2278 pages = block->max_length >> TARGET_PAGE_BITS;
2279 /*
2280 * The initial dirty bitmap for migration must be set with all
2281 * ones to make sure we'll migrate every guest RAM page to the
2282 * destination.
2283 * Here we set RAMBlock.bmap all to 1 because when restarting a
2284 * new migration after a failed one, ram_list.
2285 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2286 * guest memory.
2287 */
2288 block->bmap = bitmap_new(pages);
2289 bitmap_set(block->bmap, 0, pages);
2290 block->clear_bmap_shift = shift;
2291 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2292 }
2293 }
2294 }
2295
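/*
 * Initialize the dirty bitmaps, start dirty page logging and do a first
 * bitmap sync. Takes the iothread lock, the ramlist lock and the RCU
 * read lock for the duration of the setup.
 */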
2296 static void ram_init_bitmaps(RAMState *rs)
2297 {
2298 /* For memory_global_dirty_log_start below. */
2299 qemu_mutex_lock_iothread();
2300 qemu_mutex_lock_ramlist();
2301
2302 WITH_RCU_READ_LOCK_GUARD() {
2303 ram_list_init_bitmaps();
2304 memory_global_dirty_log_start();
2305 migration_bitmap_sync_precopy(rs);
2306 }
2307 qemu_mutex_unlock_ramlist();
2308 qemu_mutex_unlock_iothread();
2309 }
2310
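/*
 * Set up everything RAM migration needs on the source side: the
 * RAMState, the XBZRLE cache (when enabled) and the dirty bitmaps.
 * Returns 0 on success, -1 on error.
 */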
2311 static int ram_init_all(RAMState **rsp)
2312 {
2313 if (ram_state_init(rsp)) {
2314 return -1;
2315 }
2316
2317 if (xbzrle_init()) {
2318 ram_state_cleanup(rsp);
2319 return -1;
2320 }
2321
2322 ram_init_bitmaps(*rsp);
2323
2324 return 0;
2325 }
2326
2327 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2328 {
2329 RAMBlock *block;
2330 uint64_t pages = 0;
2331
2332 /*
2333 * Postcopy is not using xbzrle/compression, so no need for that.
2334 * Also, since the source is already halted, we don't need to care
2335 * about dirty page logging either.
2336 */
2337
2338 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2339 pages += bitmap_count_one(block->bmap,
2340 block->used_length >> TARGET_PAGE_BITS);
2341 }
2342
2343 /* This may not be aligned with current bitmaps. Recalculate. */
2344 rs->migration_dirty_pages = pages;
2345
2346 rs->last_seen_block = NULL;
2347 rs->last_sent_block = NULL;
2348 rs->last_page = 0;
2349 rs->last_version = ram_list.version;
2350 /*
2351 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2352 * matter what we have sent.
2353 */
2354 rs->ram_bulk_stage = false;
2355
2356 /* Update RAMState cache of output QEMUFile */
2357 rs->f = out;
2358
2359 trace_ram_state_resume_prepare(pages);
2360 }
2361
2362 /*
2363 * This function clears bits of the free pages reported by the caller from the
2364 * migration dirty bitmap. @addr is the host address corresponding to the
2365 * start of the continuous guest free pages, and @len is the total bytes of
2366 * those pages.
2367 */
2368 void qemu_guest_free_page_hint(void *addr, size_t len)
2369 {
2370 RAMBlock *block;
2371 ram_addr_t offset;
2372 size_t used_len, start, npages;
2373 MigrationState *s = migrate_get_current();
2374
2375 /* This function is currently expected to be used during live migration */
2376 if (!migration_is_setup_or_active(s->state)) {
2377 return;
2378 }
2379
2380 for (; len > 0; len -= used_len, addr += used_len) {
2381 block = qemu_ram_block_from_host(addr, false, &offset);
2382 if (unlikely(!block || offset >= block->used_length)) {
2383 /*
2384 * The implementation might not support RAMBlock resize during
2385 * live migration, but it could happen in theory with future
2386 * updates. So we add a check here to capture that case.
2387 */
2388 error_report_once("%s unexpected error", __func__);
2389 return;
2390 }
2391
2392 if (len <= block->used_length - offset) {
2393 used_len = len;
2394 } else {
2395 used_len = block->used_length - offset;
2396 }
2397
2398 start = offset >> TARGET_PAGE_BITS;
2399 npages = used_len >> TARGET_PAGE_BITS;
2400
2401 qemu_mutex_lock(&ram_state->bitmap_mutex);
2402 ram_state->migration_dirty_pages -=
2403 bitmap_count_one_with_offset(block->bmap, start, npages);
2404 bitmap_clear(block->bmap, start, npages);
2405 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2406 }
2407 }
2408
2409 /*
2410 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2411 * a long-running RCU critical section. When RCU reclaims in the code
2412 * start to become numerous it will be necessary to reduce the
2413 * granularity of these critical sections.
2414 */
2415
2416 /**
2417 * ram_save_setup: Setup RAM for migration
2418 *
2419 * Returns zero to indicate success and negative for error
2420 *
2421 * @f: QEMUFile where to send the data
2422 * @opaque: RAMState pointer
2423 */
2424 static int ram_save_setup(QEMUFile *f, void *opaque)
2425 {
2426 RAMState **rsp = opaque;
2427 RAMBlock *block;
2428
2429 if (compress_threads_save_setup()) {
2430 return -1;
2431 }
2432
2433 /* migration has already setup the bitmap, reuse it. */
2434 if (!migration_in_colo_state()) {
2435 if (ram_init_all(rsp) != 0) {
2436 compress_threads_save_cleanup();
2437 return -1;
2438 }
2439 }
2440 (*rsp)->f = f;
2441
2442 WITH_RCU_READ_LOCK_GUARD() {
2443 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2444
2445 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2446 qemu_put_byte(f, strlen(block->idstr));
2447 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2448 qemu_put_be64(f, block->used_length);
2449 if (migrate_postcopy_ram() && block->page_size !=
2450 qemu_host_page_size) {
2451 qemu_put_be64(f, block->page_size);
2452 }
2453 if (migrate_ignore_shared()) {
2454 qemu_put_be64(f, block->mr->addr);
2455 }
2456 }
2457 }
2458
2459 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2460 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2461
2462 multifd_send_sync_main(f);
2463 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2464 qemu_fflush(f);
2465
2466 return 0;
2467 }
2468
2469 /**
2470 * ram_save_iterate: iterative stage for migration
2471 *
2472 * Returns zero to indicate success and negative for error
2473 *
2474 * @f: QEMUFile where to send the data
2475 * @opaque: RAMState pointer
2476 */
2477 static int ram_save_iterate(QEMUFile *f, void *opaque)
2478 {
2479 RAMState **temp = opaque;
2480 RAMState *rs = *temp;
2481 int ret = 0;
2482 int i;
2483 int64_t t0;
2484 int done = 0;
2485
2486 if (blk_mig_bulk_active()) {
2487 /* Avoid transferring ram during bulk phase of block migration as
2488 * the bulk phase will usually take a long time and transferring
2489 * ram updates during that time is pointless. */
2490 goto out;
2491 }
2492
2493 WITH_RCU_READ_LOCK_GUARD() {
2494 if (ram_list.version != rs->last_version) {
2495 ram_state_reset(rs);
2496 }
2497
2498 /* Read version before ram_list.blocks */
2499 smp_rmb();
2500
2501 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2502
2503 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2504 i = 0;
2505 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2506 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2507 int pages;
2508
2509 if (qemu_file_get_error(f)) {
2510 break;
2511 }
2512
2513 pages = ram_find_and_save_block(rs, false);
2514 /* no more pages to send */
2515 if (pages == 0) {
2516 done = 1;
2517 break;
2518 }
2519
2520 if (pages < 0) {
2521 qemu_file_set_error(f, pages);
2522 break;
2523 }
2524
2525 rs->target_page_count += pages;
2526
2527 /*
2528 * During postcopy, it is necessary to make sure one whole host
2529 * page is sent in one chunk.
2530 */
2531 if (migrate_postcopy_ram()) {
2532 flush_compressed_data(rs);
2533 }
2534
2535 /*
2536 * We want to check on the first iteration, just in case it was the
2537 * first time and we had to sync the dirty bitmap.
2538 * qemu_clock_get_ns() is a bit expensive, so we only check every
2539 * few iterations.
2540 */
2541 if ((i & 63) == 0) {
2542 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2543 1000000;
2544 if (t1 > MAX_WAIT) {
2545 trace_ram_save_iterate_big_wait(t1, i);
2546 break;
2547 }
2548 }
2549 i++;
2550 }
2551 }
2552
2553 /*
2554 * Must occur before EOS (or any QEMUFile operation)
2555 * because of RDMA protocol.
2556 */
2557 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2558
2559 out:
2560 if (ret >= 0
2561 && migration_is_setup_or_active(migrate_get_current()->state)) {
2562 multifd_send_sync_main(rs->f);
2563 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2564 qemu_fflush(f);
2565 ram_counters.transferred += 8;
2566
2567 ret = qemu_file_get_error(f);
2568 }
2569 if (ret < 0) {
2570 return ret;
2571 }
2572
2573 return done;
2574 }
2575
2576 /**
2577 * ram_save_complete: function called to send the remaining amount of ram
2578 *
2579 * Returns zero to indicate success or negative on error
2580 *
2581 * Called with iothread lock
2582 *
2583 * @f: QEMUFile where to send the data
2584 * @opaque: RAMState pointer
2585 */
2586 static int ram_save_complete(QEMUFile *f, void *opaque)
2587 {
2588 RAMState **temp = opaque;
2589 RAMState *rs = *temp;
2590 int ret = 0;
2591
2592 WITH_RCU_READ_LOCK_GUARD() {
2593 if (!migration_in_postcopy()) {
2594 migration_bitmap_sync_precopy(rs);
2595 }
2596
2597 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2598
2599 /* try transferring iterative blocks of memory */
2600
2601 /* flush all remaining blocks regardless of rate limiting */
2602 while (true) {
2603 int pages;
2604
2605 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2606 /* no more blocks to send */
2607 if (pages == 0) {
2608 break;
2609 }
2610 if (pages < 0) {
2611 ret = pages;
2612 break;
2613 }
2614 }
2615
2616 flush_compressed_data(rs);
2617 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2618 }
2619
2620 if (ret >= 0) {
2621 multifd_send_sync_main(rs->f);
2622 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2623 qemu_fflush(f);
2624 }
2625
2626 return ret;
2627 }
2628
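/*
 * Report how much data is still pending. When the remaining size drops
 * below @max_size and we are not in postcopy, the dirty bitmap is
 * resynced first so the estimate is up to date. Postcopiable RAM is
 * reported as res_compatible, otherwise as res_precopy_only.
 */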
2629 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2630 uint64_t *res_precopy_only,
2631 uint64_t *res_compatible,
2632 uint64_t *res_postcopy_only)
2633 {
2634 RAMState **temp = opaque;
2635 RAMState *rs = *temp;
2636 uint64_t remaining_size;
2637
2638 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2639
2640 if (!migration_in_postcopy() &&
2641 remaining_size < max_size) {
2642 qemu_mutex_lock_iothread();
2643 WITH_RCU_READ_LOCK_GUARD() {
2644 migration_bitmap_sync_precopy(rs);
2645 }
2646 qemu_mutex_unlock_iothread();
2647 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2648 }
2649
2650 if (migrate_postcopy_ram()) {
2651 /* We can do postcopy, and all the data is postcopiable */
2652 *res_compatible += remaining_size;
2653 } else {
2654 *res_precopy_only += remaining_size;
2655 }
2656 }
2657
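/*
 * Load one XBZRLE-encoded page from the stream and decode it on top of
 * the existing contents of @host. Returns 0 on success, -1 on error.
 */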
2658 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2659 {
2660 unsigned int xh_len;
2661 int xh_flags;
2662 uint8_t *loaded_data;
2663
2664 /* extract RLE header */
2665 xh_flags = qemu_get_byte(f);
2666 xh_len = qemu_get_be16(f);
2667
2668 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2669 error_report("Failed to load XBZRLE page - wrong compression!");
2670 return -1;
2671 }
2672
2673 if (xh_len > TARGET_PAGE_SIZE) {
2674 error_report("Failed to load XBZRLE page - len overflow!");
2675 return -1;
2676 }
2677 loaded_data = XBZRLE.decoded_buf;
2678 /* load data and decode */
2679 /* it can change loaded_data to point to an internal buffer */
2680 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2681
2682 /* decode RLE */
2683 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2684 TARGET_PAGE_SIZE) == -1) {
2685 error_report("Failed to load XBZRLE page - decode error!");
2686 return -1;
2687 }
2688
2689 return 0;
2690 }
2691
2692 /**
2693 * ram_block_from_stream: read a RAMBlock id from the migration stream
2694 *
2695 * Must be called from within an RCU critical section.
2696 *
2697 * Returns a pointer from within the RCU-protected ram_list.
2698 *
2699 * @f: QEMUFile where to read the data from
2700 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2701 */
2702 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2703 {
2704 static RAMBlock *block = NULL;
2705 char id[256];
2706 uint8_t len;
2707
2708 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2709 if (!block) {
2710 error_report("Ack, bad migration stream!");
2711 return NULL;
2712 }
2713 return block;
2714 }
2715
2716 len = qemu_get_byte(f);
2717 qemu_get_buffer(f, (uint8_t *)id, len);
2718 id[len] = 0;
2719
2720 block = qemu_ram_block_by_name(id);
2721 if (!block) {
2722 error_report("Can't find block %s", id);
2723 return NULL;
2724 }
2725
2726 if (ramblock_is_ignored(block)) {
2727 error_report("block %s should not be migrated !", id);
2728 return NULL;
2729 }
2730
2731 return block;
2732 }
2733
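/*
 * Return the host address for @offset within @block, or NULL when the
 * offset is outside the block's used length.
 */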
2734 static inline void *host_from_ram_block_offset(RAMBlock *block,
2735 ram_addr_t offset)
2736 {
2737 if (!offset_in_ramblock(block, offset)) {
2738 return NULL;
2739 }
2740
2741 return block->host + offset;
2742 }
2743
2744 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2745 ram_addr_t offset, bool record_bitmap)
2746 {
2747 if (!offset_in_ramblock(block, offset)) {
2748 return NULL;
2749 }
2750 if (!block->colo_cache) {
2751 error_report("%s: colo_cache is NULL in block :%s",
2752 __func__, block->idstr);
2753 return NULL;
2754 }
2755
2756 /*
2757 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2758 * It helps us decide which pages in the RAM cache should be flushed
2759 * into the VM's RAM later.
2760 */
2761 if (record_bitmap &&
2762 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2763 ram_state->migration_dirty_pages++;
2764 }
2765 return block->colo_cache + offset;
2766 }
2767
2768 /**
2769 * ram_handle_compressed: handle the zero page case
2770 *
2771 * If a page (or a whole RDMA chunk) has been
2772 * determined to be zero, then zap it.
2773 *
2774 * @host: host address for the zero page
2775 * @ch: what the page is filled from. We only support zero
2776 * @size: size of the zero page
2777 */
2778 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2779 {
2780 if (ch != 0 || !is_zero_range(host, size)) {
2781 memset(host, ch, size);
2782 }
2783 }
2784
2785 /* return the size after decompression, or negative value on error */
2786 static int
2787 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2788 const uint8_t *source, size_t source_len)
2789 {
2790 int err;
2791
2792 err = inflateReset(stream);
2793 if (err != Z_OK) {
2794 return -1;
2795 }
2796
2797 stream->avail_in = source_len;
2798 stream->next_in = (uint8_t *)source;
2799 stream->avail_out = dest_len;
2800 stream->next_out = dest;
2801
2802 err = inflate(stream, Z_NO_FLUSH);
2803 if (err != Z_STREAM_END) {
2804 return -1;
2805 }
2806
2807 return stream->total_out;
2808 }
2809
2810 static void *do_data_decompress(void *opaque)
2811 {
2812 DecompressParam *param = opaque;
2813 unsigned long pagesize;
2814 uint8_t *des;
2815 int len, ret;
2816
2817 qemu_mutex_lock(&param->mutex);
2818 while (!param->quit) {
2819 if (param->des) {
2820 des = param->des;
2821 len = param->len;
2822 param->des = 0;
2823 qemu_mutex_unlock(&param->mutex);
2824
2825 pagesize = TARGET_PAGE_SIZE;
2826
2827 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2828 param->compbuf, len);
2829 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2830 error_report("decompress data failed");
2831 qemu_file_set_error(decomp_file, ret);
2832 }
2833
2834 qemu_mutex_lock(&decomp_done_lock);
2835 param->done = true;
2836 qemu_cond_signal(&decomp_done_cond);
2837 qemu_mutex_unlock(&decomp_done_lock);
2838
2839 qemu_mutex_lock(&param->mutex);
2840 } else {
2841 qemu_cond_wait(&param->cond, &param->mutex);
2842 }
2843 }
2844 qemu_mutex_unlock(&param->mutex);
2845
2846 return NULL;
2847 }
2848
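/*
 * Wait until every decompression thread has finished its current page.
 * Returns any error recorded on the decompression QEMUFile.
 */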
2849 static int wait_for_decompress_done(void)
2850 {
2851 int idx, thread_count;
2852
2853 if (!migrate_use_compression()) {
2854 return 0;
2855 }
2856
2857 thread_count = migrate_decompress_threads();
2858 qemu_mutex_lock(&decomp_done_lock);
2859 for (idx = 0; idx < thread_count; idx++) {
2860 while (!decomp_param[idx].done) {
2861 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2862 }
2863 }
2864 qemu_mutex_unlock(&decomp_done_lock);
2865 return qemu_file_get_error(decomp_file);
2866 }
2867
2868 static void compress_threads_load_cleanup(void)
2869 {
2870 int i, thread_count;
2871
2872 if (!migrate_use_compression()) {
2873 return;
2874 }
2875 thread_count = migrate_decompress_threads();
2876 for (i = 0; i < thread_count; i++) {
2877 /*
2878 * we use it as an indicator of whether the thread was
2879 * properly initialized or not
2880 */
2881 if (!decomp_param[i].compbuf) {
2882 break;
2883 }
2884
2885 qemu_mutex_lock(&decomp_param[i].mutex);
2886 decomp_param[i].quit = true;
2887 qemu_cond_signal(&decomp_param[i].cond);
2888 qemu_mutex_unlock(&decomp_param[i].mutex);
2889 }
2890 for (i = 0; i < thread_count; i++) {
2891 if (!decomp_param[i].compbuf) {
2892 break;
2893 }
2894
2895 qemu_thread_join(decompress_threads + i);
2896 qemu_mutex_destroy(&decomp_param[i].mutex);
2897 qemu_cond_destroy(&decomp_param[i].cond);
2898 inflateEnd(&decomp_param[i].stream);
2899 g_free(decomp_param[i].compbuf);
2900 decomp_param[i].compbuf = NULL;
2901 }
2902 g_free(decompress_threads);
2903 g_free(decomp_param);
2904 decompress_threads = NULL;
2905 decomp_param = NULL;
2906 decomp_file = NULL;
2907 }
2908
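/*
 * Create the decompression threads and their per-thread state (zlib
 * stream, buffer, mutex and condvar). Returns 0 on success, or -1 after
 * cleaning up any partially initialized threads.
 */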
2909 static int compress_threads_load_setup(QEMUFile *f)
2910 {
2911 int i, thread_count;
2912
2913 if (!migrate_use_compression()) {
2914 return 0;
2915 }
2916
2917 thread_count = migrate_decompress_threads();
2918 decompress_threads = g_new0(QemuThread, thread_count);
2919 decomp_param = g_new0(DecompressParam, thread_count);
2920 qemu_mutex_init(&decomp_done_lock);
2921 qemu_cond_init(&decomp_done_cond);
2922 decomp_file = f;
2923 for (i = 0; i < thread_count; i++) {
2924 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2925 goto exit;
2926 }
2927
2928 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2929 qemu_mutex_init(&decomp_param[i].mutex);
2930 qemu_cond_init(&decomp_param[i].cond);
2931 decomp_param[i].done = true;
2932 decomp_param[i].quit = false;
2933 qemu_thread_create(decompress_threads + i, "decompress",
2934 do_data_decompress, decomp_param + i,
2935 QEMU_THREAD_JOINABLE);
2936 }
2937 return 0;
2938 exit:
2939 compress_threads_load_cleanup();
2940 return -1;
2941 }
2942
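/*
 * Hand one compressed page to an idle decompression thread, waiting on
 * decomp_done_cond until a thread becomes available.
 */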
2943 static void decompress_data_with_multi_threads(QEMUFile *f,
2944 void *host, int len)
2945 {
2946 int idx, thread_count;
2947
2948 thread_count = migrate_decompress_threads();
2949 qemu_mutex_lock(&decomp_done_lock);
2950 while (true) {
2951 for (idx = 0; idx < thread_count; idx++) {
2952 if (decomp_param[idx].done) {
2953 decomp_param[idx].done = false;
2954 qemu_mutex_lock(&decomp_param[idx].mutex);
2955 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2956 decomp_param[idx].des = host;
2957 decomp_param[idx].len = len;
2958 qemu_cond_signal(&decomp_param[idx].cond);
2959 qemu_mutex_unlock(&decomp_param[idx].mutex);
2960 break;
2961 }
2962 }
2963 if (idx < thread_count) {
2964 break;
2965 } else {
2966 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2967 }
2968 }
2969 qemu_mutex_unlock(&decomp_done_lock);
2970 }
2971
2972 /*
2973 * colo cache: this is for the secondary VM. We cache the whole
2974 * memory of the secondary VM; the global lock must be held
2975 * to call this helper.
2976 */
2977 int colo_init_ram_cache(void)
2978 {
2979 RAMBlock *block;
2980
2981 WITH_RCU_READ_LOCK_GUARD() {
2982 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2983 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2984 NULL,
2985 false);
2986 if (!block->colo_cache) {
2987 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2988 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2989 block->used_length);
2990 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2991 if (block->colo_cache) {
2992 qemu_anon_ram_free(block->colo_cache, block->used_length);
2993 block->colo_cache = NULL;
2994 }
2995 }
2996 return -errno;
2997 }
2998 }
2999 }
3000
3001 /*
3002 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3003 * to decide which pages in the cache should be flushed into the SVM's RAM.
3004 * Here we use the same name 'ram_bitmap' as for migration.
3005 */
3006 if (ram_bytes_total()) {
3007 RAMBlock *block;
3008
3009 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3010 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3011 block->bmap = bitmap_new(pages);
3012 }
3013 }
3014
3015 ram_state_init(&ram_state);
3016 return 0;
3017 }
3018
3019 /* TODO: duplicated with ram_init_bitmaps */
3020 void colo_incoming_start_dirty_log(void)
3021 {
3022 RAMBlock *block = NULL;
3023 /* For memory_global_dirty_log_start below. */
3024 qemu_mutex_lock_iothread();
3025 qemu_mutex_lock_ramlist();
3026
3027 memory_global_dirty_log_sync();
3028 WITH_RCU_READ_LOCK_GUARD() {
3029 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3030 ramblock_sync_dirty_bitmap(ram_state, block);
3031 /* Discard this dirty bitmap record */
3032 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3033 }
3034 memory_global_dirty_log_start();
3035 }
3036 ram_state->migration_dirty_pages = 0;
3037 qemu_mutex_unlock_ramlist();
3038 qemu_mutex_unlock_iothread();
3039 }
3040
3041 /* The global lock must be held to call this helper */
3042 void colo_release_ram_cache(void)
3043 {
3044 RAMBlock *block;
3045
3046 memory_global_dirty_log_stop();
3047 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3048 g_free(block->bmap);
3049 block->bmap = NULL;
3050 }
3051
3052 WITH_RCU_READ_LOCK_GUARD() {
3053 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3054 if (block->colo_cache) {
3055 qemu_anon_ram_free(block->colo_cache, block->used_length);
3056 block->colo_cache = NULL;
3057 }
3058 }
3059 }
3060 ram_state_cleanup(&ram_state);
3061 }
3062
3063 /**
3064 * ram_load_setup: Setup RAM for migration incoming side
3065 *
3066 * Returns zero to indicate success and negative for error
3067 *
3068 * @f: QEMUFile where to receive the data
3069 * @opaque: RAMState pointer
3070 */
3071 static int ram_load_setup(QEMUFile *f, void *opaque)
3072 {
3073 if (compress_threads_load_setup(f)) {
3074 return -1;
3075 }
3076
3077 xbzrle_load_setup();
3078 ramblock_recv_map_init();
3079
3080 return 0;
3081 }
3082
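/*
 * Incoming-side cleanup: write RAM contents back to any backing storage,
 * free the XBZRLE decode buffer, the decompression threads and the
 * per-block received-page bitmaps.
 */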
3083 static int ram_load_cleanup(void *opaque)
3084 {
3085 RAMBlock *rb;
3086
3087 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3088 qemu_ram_block_writeback(rb);
3089 }
3090
3091 xbzrle_load_cleanup();
3092 compress_threads_load_cleanup();
3093
3094 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3095 g_free(rb->receivedmap);
3096 rb->receivedmap = NULL;
3097 }
3098
3099 return 0;
3100 }
3101
3102 /**
3103 * ram_postcopy_incoming_init: allocate postcopy data structures
3104 *
3105 * Returns 0 for success and negative if there was one error
3106 *
3107 * @mis: current migration incoming state
3108 *
3109 * Allocate the data structures etc. needed by incoming migration with
3110 * postcopy-ram. postcopy-ram's similarly named
3111 * postcopy_ram_incoming_init does the work.
3112 */
3113 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3114 {
3115 return postcopy_ram_incoming_init(mis);
3116 }
3117
3118 /**
3119 * ram_load_postcopy: load a page in postcopy case
3120 *
3121 * Returns 0 for success or -errno in case of error
3122 *
3123 * Called in postcopy mode by ram_load().
3124 * rcu_read_lock is taken prior to this being called.
3125 *
3126 * @f: QEMUFile where to send the data
3127 */
3128 static int ram_load_postcopy(QEMUFile *f)
3129 {
3130 int flags = 0, ret = 0;
3131 bool place_needed = false;
3132 bool matches_target_page_size = false;
3133 MigrationIncomingState *mis = migration_incoming_get_current();
3134 /* Temporary page that is later 'placed' */
3135 void *postcopy_host_page = mis->postcopy_tmp_page;
3136 void *this_host = NULL;
3137 bool all_zero = false;
3138 int target_pages = 0;
3139
3140 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3141 ram_addr_t addr;
3142 void *host = NULL;
3143 void *page_buffer = NULL;
3144 void *place_source = NULL;
3145 RAMBlock *block = NULL;
3146 uint8_t ch;
3147 int len;
3148
3149 addr = qemu_get_be64(f);
3150
3151 /*
3152 * If there is a QEMUFile error, we should stop here; "addr"
3153 * may be invalid.
3154 */
3155 ret = qemu_file_get_error(f);
3156 if (ret) {
3157 break;
3158 }
3159
3160 flags = addr & ~TARGET_PAGE_MASK;
3161 addr &= TARGET_PAGE_MASK;
3162
3163 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3164 place_needed = false;
3165 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3166 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3167 block = ram_block_from_stream(f, flags);
3168
3169 host = host_from_ram_block_offset(block, addr);
3170 if (!host) {
3171 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3172 ret = -EINVAL;
3173 break;
3174 }
3175 target_pages++;
3176 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3177 /*
3178 * Postcopy requires that we place whole host pages atomically;
3179 * these may be huge pages for RAMBlocks that are backed by
3180 * hugetlbfs.
3181 * To make it atomic, the data is read into a temporary page
3182 * that's moved into place later.
3183 * The migration protocol uses, possibly smaller, target pages;
3184 * however, the source ensures it always sends all the components
3185 * of a host page in one chunk.
3186 */
3187 page_buffer = postcopy_host_page +
3188 ((uintptr_t)host & (block->page_size - 1));
3189 /* If all TP are zero then we can optimise the place */
3190 if (target_pages == 1) {
3191 all_zero = true;
3192 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3193 block->page_size);
3194 } else {
3195 /* not the 1st TP within the HP */
3196 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3197 (uintptr_t)this_host) {
3198 error_report("Non-same host page %p/%p",
3199 host, this_host);
3200 ret = -EINVAL;
3201 break;
3202 }
3203 }
3204
3205 /*
3206 * If it's the last part of a host page then we place the host
3207 * page
3208 */
3209 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3210 place_needed = true;
3211 target_pages = 0;
3212 }
3213 place_source = postcopy_host_page;
3214 }
3215
3216 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3217 case RAM_SAVE_FLAG_ZERO:
3218 ch = qemu_get_byte(f);
3219 /*
3220 * We can skip setting page_buffer when
3221 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3222 */
3223 if (ch || !matches_target_page_size) {
3224 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3225 }
3226 if (ch) {
3227 all_zero = false;
3228 }
3229 break;
3230
3231 case RAM_SAVE_FLAG_PAGE:
3232 all_zero = false;
3233 if (!matches_target_page_size) {
3234 /* For huge pages, we always use temporary buffer */
3235 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3236 } else {
3237 /*
3238 * For small pages that matches target page size, we
3239 * avoid the qemu_file copy. Instead we directly use
3240 * the buffer of QEMUFile to place the page. Note: we
3241 * cannot do any QEMUFile operation before using that
3242 * buffer to make sure the buffer is valid when
3243 * placing the page.
3244 */
3245 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3246 TARGET_PAGE_SIZE);
3247 }
3248 break;
3249 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3250 all_zero = false;
3251 len = qemu_get_be32(f);
3252 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3253 error_report("Invalid compressed data length: %d", len);
3254 ret = -EINVAL;
3255 break;
3256 }
3257 decompress_data_with_multi_threads(f, page_buffer, len);
3258 break;
3259
3260 case RAM_SAVE_FLAG_EOS:
3261 /* normal exit */
3262 multifd_recv_sync_main();
3263 break;
3264 default:
3265 error_report("Unknown combination of migration flags: %#x"
3266 " (postcopy mode)", flags);
3267 ret = -EINVAL;
3268 break;
3269 }
3270
3271 /* Got the whole host page, wait for decompress before placing. */
3272 if (place_needed) {
3273 ret |= wait_for_decompress_done();
3274 }
3275
3276 /* Detect for any possible file errors */
3277 if (!ret && qemu_file_get_error(f)) {
3278 ret = qemu_file_get_error(f);
3279 }
3280
3281 if (!ret && place_needed) {
3282 /* This gets called at the last target page in the host page */
3283 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3284 block->page_size);
3285
3286 if (all_zero) {
3287 ret = postcopy_place_page_zero(mis, place_dest,
3288 block);
3289 } else {
3290 ret = postcopy_place_page(mis, place_dest,
3291 place_source, block);
3292 }
3293 }
3294 }
3295
3296 return ret;
3297 }
3298
3299 static bool postcopy_is_advised(void)
3300 {
3301 PostcopyState ps = postcopy_state_get();
3302 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3303 }
3304
3305 static bool postcopy_is_running(void)
3306 {
3307 PostcopyState ps = postcopy_state_get();
3308 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3309 }
3310
3311 /*
3312 * Flush content of RAM cache into SVM's memory.
3313 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3314 */
3315 static void colo_flush_ram_cache(void)
3316 {
3317 RAMBlock *block = NULL;
3318 void *dst_host;
3319 void *src_host;
3320 unsigned long offset = 0;
3321
3322 memory_global_dirty_log_sync();
3323 WITH_RCU_READ_LOCK_GUARD() {
3324 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3325 ramblock_sync_dirty_bitmap(ram_state, block);
3326 }
3327 }
3328
3329 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3330 WITH_RCU_READ_LOCK_GUARD() {
3331 block = QLIST_FIRST_RCU(&ram_list.blocks);
3332
3333 while (block) {
3334 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3335
3336 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3337 >= block->used_length) {
3338 offset = 0;
3339 block = QLIST_NEXT_RCU(block, next);
3340 } else {
3341 migration_bitmap_clear_dirty(ram_state, block, offset);
3342 dst_host = block->host
3343 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3344 src_host = block->colo_cache
3345 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3346 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3347 }
3348 }
3349 }
3350 trace_colo_flush_ram_cache_end();
3351 }
3352
3353 /**
3354 * ram_load_precopy: load pages in precopy case
3355 *
3356 * Returns 0 for success or -errno in case of error
3357 *
3358 * Called in precopy mode by ram_load().
3359 * rcu_read_lock is taken prior to this being called.
3360 *
3361 * @f: QEMUFile where to send the data
3362 */
3363 static int ram_load_precopy(QEMUFile *f)
3364 {
3365 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3366 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3367 bool postcopy_advised = postcopy_is_advised();
3368 if (!migrate_use_compression()) {
3369 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3370 }
3371
3372 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3373 ram_addr_t addr, total_ram_bytes;
3374 void *host = NULL, *host_bak = NULL;
3375 uint8_t ch;
3376
3377 /*
3378 * Yield periodically to let the main loop run, but an iteration of
3379 * the main loop is expensive, so only do it every few iterations.
3380 */
3381 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3382 aio_co_schedule(qemu_get_current_aio_context(),
3383 qemu_coroutine_self());
3384 qemu_coroutine_yield();
3385 }
3386 i++;
3387
3388 addr = qemu_get_be64(f);
3389 flags = addr & ~TARGET_PAGE_MASK;
3390 addr &= TARGET_PAGE_MASK;
3391
3392 if (flags & invalid_flags) {
3393 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3394 error_report("Received an unexpected compressed page");
3395 }
3396
3397 ret = -EINVAL;
3398 break;
3399 }
3400
3401 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3402 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3403 RAMBlock *block = ram_block_from_stream(f, flags);
3404
3405 host = host_from_ram_block_offset(block, addr);
3406 /*
3407 * After entering the COLO stage, we should not load pages
3408 * into the SVM's memory directly; we put them into colo_cache first.
3409 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3410 * Previously, we copied all of this memory in the COLO preparing
3411 * stage while the VM was stopped, which is a time-consuming process.
3412 * Here we optimize it by backing up every page during the migration
3413 * process while COLO is enabled. Although this affects the speed of
3414 * the migration, it clearly reduces the downtime of backing up all
3415 * of the SVM's memory in the COLO preparing stage.
3416 */
3417 if (migration_incoming_colo_enabled()) {
3418 if (migration_incoming_in_colo_state()) {
3419 /* In COLO stage, put all pages into cache temporarily */
3420 host = colo_cache_from_block_offset(block, addr, true);
3421 } else {
3422 /*
3423 * In migration stage but before COLO stage,
3424 * Put all pages into both cache and SVM's memory.
3425 */
3426 host_bak = colo_cache_from_block_offset(block, addr, false);
3427 }
3428 }
3429 if (!host) {
3430 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3431 ret = -EINVAL;
3432 break;
3433 }
3434 if (!migration_incoming_in_colo_state()) {
3435 ramblock_recv_bitmap_set(block, host);
3436 }
3437
3438 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3439 }
3440
3441 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3442 case RAM_SAVE_FLAG_MEM_SIZE:
3443 /* Synchronize RAM block list */
3444 total_ram_bytes = addr;
3445 while (!ret && total_ram_bytes) {
3446 RAMBlock *block;
3447 char id[256];
3448 ram_addr_t length;
3449
3450 len = qemu_get_byte(f);
3451 qemu_get_buffer(f, (uint8_t *)id, len);
3452 id[len] = 0;
3453 length = qemu_get_be64(f);
3454
3455 block = qemu_ram_block_by_name(id);
3456 if (block && !qemu_ram_is_migratable(block)) {
3457 error_report("block %s should not be migrated !", id);
3458 ret = -EINVAL;
3459 } else if (block) {
3460 if (length != block->used_length) {
3461 Error *local_err = NULL;
3462
3463 ret = qemu_ram_resize(block, length,
3464 &local_err);
3465 if (local_err) {
3466 error_report_err(local_err);
3467 }
3468 }
3469 /* For postcopy we need to check hugepage sizes match */
3470 if (postcopy_advised &&
3471 block->page_size != qemu_host_page_size) {
3472 uint64_t remote_page_size = qemu_get_be64(f);
3473 if (remote_page_size != block->page_size) {
3474 error_report("Mismatched RAM page size %s "
3475 "(local) %zd != %" PRId64,
3476 id, block->page_size,
3477 remote_page_size);
3478 ret = -EINVAL;
3479 }
3480 }
3481 if (migrate_ignore_shared()) {
3482 hwaddr addr = qemu_get_be64(f);
3483 if (ramblock_is_ignored(block) &&
3484 block->mr->addr != addr) {
3485 error_report("Mismatched GPAs for block %s "
3486 "%" PRId64 "!= %" PRId64,
3487 id, (uint64_t)addr,
3488 (uint64_t)block->mr->addr);
3489 ret = -EINVAL;
3490 }
3491 }
3492 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3493 block->idstr);
3494 } else {
3495 error_report("Unknown ramblock \"%s\", cannot "
3496 "accept migration", id);
3497 ret = -EINVAL;
3498 }
3499
3500 total_ram_bytes -= length;
3501 }
3502 break;
3503
3504 case RAM_SAVE_FLAG_ZERO:
3505 ch = qemu_get_byte(f);
3506 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3507 break;
3508
3509 case RAM_SAVE_FLAG_PAGE:
3510 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3511 break;
3512
3513 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3514 len = qemu_get_be32(f);
3515 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3516 error_report("Invalid compressed data length: %d", len);
3517 ret = -EINVAL;
3518 break;
3519 }
3520 decompress_data_with_multi_threads(f, host, len);
3521 break;
3522
3523 case RAM_SAVE_FLAG_XBZRLE:
3524 if (load_xbzrle(f, addr, host) < 0) {
3525 error_report("Failed to decompress XBZRLE page at "
3526 RAM_ADDR_FMT, addr);
3527 ret = -EINVAL;
3528 break;
3529 }
3530 break;
3531 case RAM_SAVE_FLAG_EOS:
3532 /* normal exit */
3533 multifd_recv_sync_main();
3534 break;
3535 default:
3536 if (flags & RAM_SAVE_FLAG_HOOK) {
3537 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3538 } else {
3539 error_report("Unknown combination of migration flags: %#x",
3540 flags);
3541 ret = -EINVAL;
3542 }
3543 }
3544 if (!ret) {
3545 ret = qemu_file_get_error(f);
3546 }
3547 if (!ret && host_bak) {
3548 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3549 }
3550 }
3551
3552 ret |= wait_for_decompress_done();
3553 return ret;
3554 }
3555
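/*
 * Entry point for loading RAM state (stream version 4 only). Dispatches
 * to ram_load_postcopy() or ram_load_precopy() depending on whether the
 * destination is already running in postcopy mode, and flushes the COLO
 * RAM cache when in COLO state.
 */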
3556 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3557 {
3558 int ret = 0;
3559 static uint64_t seq_iter;
3560 /*
3561 * If the system is running in postcopy mode, page inserts into host
3562 * memory must be atomic.
3563 */
3564 bool postcopy_running = postcopy_is_running();
3565
3566 seq_iter++;
3567
3568 if (version_id != 4) {
3569 return -EINVAL;
3570 }
3571
3572 /*
3573 * This RCU critical section can be very long running.
3574 * When RCU reclaims in the code start to become numerous,
3575 * it will be necessary to reduce the granularity of this
3576 * critical section.
3577 */
3578 WITH_RCU_READ_LOCK_GUARD() {
3579 if (postcopy_running) {
3580 ret = ram_load_postcopy(f);
3581 } else {
3582 ret = ram_load_precopy(f);
3583 }
3584 }
3585 trace_ram_load_complete(ret, seq_iter);
3586
3587 if (!ret && migration_incoming_in_colo_state()) {
3588 colo_flush_ram_cache();
3589 }
3590 return ret;
3591 }
3592
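/*
 * Postcopy is refused when any migratable RAMBlock is backed by
 * persistent memory; otherwise follow the postcopy-ram capability.
 */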
3593 static bool ram_has_postcopy(void *opaque)
3594 {
3595 RAMBlock *rb;
3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597 if (ramblock_is_pmem(rb)) {
3598 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3599 "is not supported now!", rb->idstr, rb->host);
3600 return false;
3601 }
3602 }
3603
3604 return migrate_postcopy_ram();
3605 }
3606
3607 /* Sync all the dirty bitmap with destination VM. */
3608 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3609 {
3610 RAMBlock *block;
3611 QEMUFile *file = s->to_dst_file;
3612 int ramblock_count = 0;
3613
3614 trace_ram_dirty_bitmap_sync_start();
3615
3616 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3617 qemu_savevm_send_recv_bitmap(file, block->idstr);
3618 trace_ram_dirty_bitmap_request(block->idstr);
3619 ramblock_count++;
3620 }
3621
3622 trace_ram_dirty_bitmap_sync_wait();
3623
3624 /* Wait until all the ramblocks' dirty bitmap synced */
3625 while (ramblock_count--) {
3626 qemu_sem_wait(&s->rp_state.rp_sem);
3627 }
3628
3629 trace_ram_dirty_bitmap_sync_complete();
3630
3631 return 0;
3632 }
3633
3634 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3635 {
3636 qemu_sem_post(&s->rp_state.rp_sem);
3637 }
3638
3639 /*
3640 * Read the received bitmap, revert it as the initial dirty bitmap.
3641 * This is only used when the postcopy migration is paused but wants
3642 * to resume from a middle point.
3643 */
3644 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3645 {
3646 int ret = -EINVAL;
3647 QEMUFile *file = s->rp_state.from_dst_file;
3648 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3649 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3650 uint64_t size, end_mark;
3651
3652 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3653
3654 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3655 error_report("%s: incorrect state %s", __func__,
3656 MigrationStatus_str(s->state));
3657 return -EINVAL;
3658 }
3659
3660 /*
3661 * Note: see comments in ramblock_recv_bitmap_send() on why we
3662 * need the endianness conversion, and the padding.
3663 */
3664 local_size = ROUND_UP(local_size, 8);
3665
3666 /* Add padding */
3667 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3668
3669 size = qemu_get_be64(file);
3670
3671 /* The size of the bitmap should match with our ramblock */
3672 if (size != local_size) {
3673 error_report("%s: ramblock '%s' bitmap size mismatch "
3674 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3675 block->idstr, size, local_size);
3676 ret = -EINVAL;
3677 goto out;
3678 }
3679
3680 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3681 end_mark = qemu_get_be64(file);
3682
3683 ret = qemu_file_get_error(file);
3684 if (ret || size != local_size) {
3685 error_report("%s: read bitmap failed for ramblock '%s': %d"
3686 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3687 __func__, block->idstr, ret, local_size, size);
3688 ret = -EIO;
3689 goto out;
3690 }
3691
3692 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3693 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3694 __func__, block->idstr, end_mark);
3695 ret = -EINVAL;
3696 goto out;
3697 }
3698
3699 /*
3700 * Endianness conversion. We are in postcopy (though paused).
3701 * The dirty bitmap won't change, so we can modify it directly.
3702 */
3703 bitmap_from_le(block->bmap, le_bitmap, nbits);
3704
3705 /*
3706 * What we received is "received bitmap". Revert it as the initial
3707 * dirty bitmap for this ramblock.
3708 */
3709 bitmap_complement(block->bmap, block->bmap, nbits);
3710
3711 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3712
3713 /*
3714 * We succeeded in syncing the bitmap for the current ramblock. If this is
3715 * the last one to sync, we need to notify the main send thread.
3716 */
3717 ram_dirty_bitmap_reload_notify(s);
3718
3719 ret = 0;
3720 out:
3721 g_free(le_bitmap);
3722 return ret;
3723 }
3724
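/*
 * Resume preparation: re-sync the dirty bitmaps with the destination and
 * rewind the RAMState so a recovered postcopy migration can continue.
 */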
3725 static int ram_resume_prepare(MigrationState *s, void *opaque)
3726 {
3727 RAMState *rs = *(RAMState **)opaque;
3728 int ret;
3729
3730 ret = ram_dirty_bitmap_sync_all(s, rs);
3731 if (ret) {
3732 return ret;
3733 }
3734
3735 ram_state_resume_prepare(rs, s->to_dst_file);
3736
3737 return 0;
3738 }
3739
3740 static SaveVMHandlers savevm_ram_handlers = {
3741 .save_setup = ram_save_setup,
3742 .save_live_iterate = ram_save_iterate,
3743 .save_live_complete_postcopy = ram_save_complete,
3744 .save_live_complete_precopy = ram_save_complete,
3745 .has_postcopy = ram_has_postcopy,
3746 .save_live_pending = ram_save_pending,
3747 .load_state = ram_load,
3748 .save_cleanup = ram_save_cleanup,
3749 .load_setup = ram_load_setup,
3750 .load_cleanup = ram_load_cleanup,
3751 .resume_prepare = ram_resume_prepare,
3752 };
3753
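/*
 * Register the "ram" savevm handlers (stream version 4) and initialize
 * the XBZRLE lock.
 */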
3754 void ram_mig_init(void)
3755 {
3756 qemu_mutex_init(&XBZRLE.lock);
3757 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3758 }