1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58
59 /***********************************************************/
60 /* ram save/restore */
61
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66 */
67
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80 return buffer_is_zero(p, size);
81 }
82
83 XBZRLECacheStats xbzrle_counters;
84
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
100
101 static void XBZRLE_cache_lock(void)
102 {
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105 }
106
107 static void XBZRLE_cache_unlock(void)
108 {
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111 }
112
113 /**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
121 * Returns 0 for success or -1 for error
122 *
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with reason
125 */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128 PageCache *new_cache;
129 int64_t ret = 0;
130
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
141 }
142
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
158 }
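/*
 * Illustrative usage sketch (not part of the original file): how a
 * QMP-level caller might resize the XBZRLE cache.  The 64 MiB value and
 * the function name below are hypothetical examples; error handling just
 * reports whatever xbzrle_cache_resize() put into *errp.
 */
#if 0   /* example only, not compiled */
static void example_resize_xbzrle_cache(void)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
        error_report_err(err);
    }
}
#endif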
159
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162 return !qemu_ram_is_migratable(block) ||
163 (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
168 INTERNAL_RAMBLOCK_FOREACH(block) \
169 if (ramblock_is_ignored(block)) {} else
170
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
172 INTERNAL_RAMBLOCK_FOREACH(block) \
173 if (!qemu_ram_is_migratable(block)) {} else
174
175 #undef RAMBLOCK_FOREACH
176
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179 RAMBlock *block;
180 int ret = 0;
181
182 RCU_READ_LOCK_GUARD();
183
184 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185 ret = func(block, opaque);
186 if (ret) {
187 break;
188 }
189 }
190 return ret;
191 }
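/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller of foreach_not_ignored_block() that sums the used length of all
 * blocks taking part in migration.  Names are made up for the example.
 */
#if 0   /* example only, not compiled */
static int example_add_block_size(RAMBlock *rb, void *opaque)
{
    uint64_t *total = opaque;

    *total += rb->used_length;
    return 0;               /* returning non-zero stops the iteration */
}

static uint64_t example_total_migratable_ram(void)
{
    uint64_t total = 0;

    foreach_not_ignored_block(example_add_block_size, &total);
    return total;
}
#endif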
192
193 static void ramblock_recv_map_init(void)
194 {
195 RAMBlock *rb;
196
197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198 assert(!rb->receivedmap);
199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 }
201 }
202
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206 rb->receivedmap);
207 }
208
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220 size_t nr)
221 {
222 bitmap_set_atomic(rb->receivedmap,
223 ramblock_recv_bitmap_offset(host_addr, rb),
224 nr);
225 }
226
227 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
228
229 /*
230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 *
232 * Returns >0 if success with sent bytes, or <0 if error.
233 */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235 const char *block_name)
236 {
237 RAMBlock *block = qemu_ram_block_by_name(block_name);
238 unsigned long *le_bitmap, nbits;
239 uint64_t size;
240
241 if (!block) {
242 error_report("%s: invalid block name: %s", __func__, block_name);
243 return -1;
244 }
245
246 nbits = block->used_length >> TARGET_PAGE_BITS;
247
248 /*
249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250 * machines we may need 4 more bytes for padding (see below
251 * comment). So extend it a bit beforehand.
252 */
253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254
255 /*
256 * Always use little endian when sending the bitmap. This is
257 * required so that it works even when source and destination VMs do
258 * not use the same endianness. (Note: big endian won't work.)
259 */
260 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261
262 /* Size of the bitmap, in bytes */
263 size = DIV_ROUND_UP(nbits, 8);
264
265 /*
266 * size is always aligned to 8 bytes for 64bit machines, but it
267 * may not be true for 32bit machines. We need this padding to
268 * make sure the migration can survive even between 32bit and
269 * 64bit machines.
270 */
271 size = ROUND_UP(size, 8);
272
273 qemu_put_be64(file, size);
274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 /*
276 * Mark as an end, in case the middle part is screwed up due to
277 * some "misterious" reason.
278 */
279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280 qemu_fflush(file);
281
282 g_free(le_bitmap);
283
284 if (qemu_file_get_error(file)) {
285 return qemu_file_get_error(file);
286 }
287
288 return size + sizeof(size);
289 }
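/*
 * Illustrative sketch (not part of the original file): what a reader of
 * the stream produced above would have to do.  The real consumer lives in
 * the postcopy-recovery path on the source side; this hypothetical helper
 * only documents the wire layout: be64 size, `size` bytes of little-endian
 * bitmap, then the RAMBLOCK_RECV_BITMAP_ENDING marker as be64.
 */
#if 0   /* example only, not compiled */
static int example_read_recv_bitmap(QEMUFile *file, uint8_t *buf,
                                    uint64_t expected_size)
{
    uint64_t size = qemu_get_be64(file);

    if (size != expected_size) {
        return -1;
    }
    qemu_get_buffer(file, buf, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -1;
    }
    return qemu_file_get_error(file);
}
#endif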
290
291 /*
292 * An outstanding page request, on the source, having been received
293 * and queued
294 */
295 struct RAMSrcPageRequest {
296 RAMBlock *rb;
297 hwaddr offset;
298 hwaddr len;
299
300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302
303 /* State of RAM for migration */
304 struct RAMState {
305 /* QEMUFile used for this migration */
306 QEMUFile *f;
307 /* Last block that we have visited searching for dirty pages */
308 RAMBlock *last_seen_block;
309 /* Last block from where we have sent data */
310 RAMBlock *last_sent_block;
311 /* Last dirty target page we have sent */
312 ram_addr_t last_page;
313 /* last ram version we have seen */
314 uint32_t last_version;
315 /* We are in the first round */
316 bool ram_bulk_stage;
317 /* The free page optimization is enabled */
318 bool fpo_enabled;
319 /* How many times we have dirtied too many pages */
320 int dirty_rate_high_cnt;
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
324 /* bytes transferred at start_time */
325 uint64_t bytes_xfer_prev;
326 /* number of dirty pages since start_time */
327 uint64_t num_dirty_pages_period;
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
330
331 /* compression statistics since the beginning of the period */
332 /* number of times there was no free thread to compress data */
333 uint64_t compress_thread_busy_prev;
334 /* amount of bytes after compression */
335 uint64_t compressed_size_prev;
336 /* number of compressed pages */
337 uint64_t compress_pages_prev;
338
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
343 /* number of dirty bits in the bitmap */
344 uint64_t migration_dirty_pages;
345 /* Protects modification of the bitmap and migration dirty pages */
346 QemuMutex bitmap_mutex;
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354
355 static RAMState *ram_state;
356
357 static NotifierWithReturnList precopy_notifier_list;
358
359 void precopy_infrastructure_init(void)
360 {
361 notifier_with_return_list_init(&precopy_notifier_list);
362 }
363
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366 notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371 notifier_with_return_remove(n);
372 }
373
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376 PrecopyNotifyData pnd;
377 pnd.reason = reason;
378 pnd.errp = errp;
379
380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
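/*
 * Illustrative sketch (not part of the original file): how a device model
 * might hook into the precopy notifier chain, in the style of the
 * free-page-hinting user.  All names below are hypothetical.
 */
#if 0   /* example only, not compiled */
static int example_precopy_notify(NotifierWithReturn *n, void *opaque)
{
    PrecopyNotifyData *pnd = opaque;

    if (pnd->reason == PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC) {
        /* e.g. stop reporting free pages while the bitmap is synced */
    }
    return 0;               /* non-zero would abort the notification */
}

static NotifierWithReturn example_notifier = {
    .notify = example_precopy_notify,
};

static void example_register_precopy_notifier(void)
{
    precopy_add_notifier(&example_notifier);
}
#endif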
382
383 void precopy_enable_free_page_optimization(void)
384 {
385 if (!ram_state) {
386 return;
387 }
388
389 ram_state->fpo_enabled = true;
390 }
391
392 uint64_t ram_bytes_remaining(void)
393 {
394 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395 0;
396 }
397
398 MigrationStats ram_counters;
399
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402 /* Current block being searched */
403 RAMBlock *block;
404 /* Current page to search from */
405 unsigned long page;
406 /* Set once we wrap around */
407 bool complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410
411 CompressionStats compression_counters;
412
413 struct CompressParam {
414 bool done;
415 bool quit;
416 bool zero_page;
417 QEMUFile *file;
418 QemuMutex mutex;
419 QemuCond cond;
420 RAMBlock *block;
421 ram_addr_t offset;
422
423 /* internally used fields */
424 z_stream stream;
425 uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428
429 struct DecompressParam {
430 bool done;
431 bool quit;
432 QemuMutex mutex;
433 QemuCond cond;
434 void *des;
435 uint8_t *compbuf;
436 int len;
437 z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444 * one of the compression threads has finished the compression.
445 * comp_done_lock is used together with comp_done_cond.
446 */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459 ram_addr_t offset, uint8_t *source_buf);
460
461 static void *do_data_compress(void *opaque)
462 {
463 CompressParam *param = opaque;
464 RAMBlock *block;
465 ram_addr_t offset;
466 bool zero_page;
467
468 qemu_mutex_lock(&param->mutex);
469 while (!param->quit) {
470 if (param->block) {
471 block = param->block;
472 offset = param->offset;
473 param->block = NULL;
474 qemu_mutex_unlock(&param->mutex);
475
476 zero_page = do_compress_ram_page(param->file, &param->stream,
477 block, offset, param->originbuf);
478
479 qemu_mutex_lock(&comp_done_lock);
480 param->done = true;
481 param->zero_page = zero_page;
482 qemu_cond_signal(&comp_done_cond);
483 qemu_mutex_unlock(&comp_done_lock);
484
485 qemu_mutex_lock(&param->mutex);
486 } else {
487 qemu_cond_wait(&param->cond, &param->mutex);
488 }
489 }
490 qemu_mutex_unlock(&param->mutex);
491
492 return NULL;
493 }
494
495 static void compress_threads_save_cleanup(void)
496 {
497 int i, thread_count;
498
499 if (!migrate_use_compression() || !comp_param) {
500 return;
501 }
502
503 thread_count = migrate_compress_threads();
504 for (i = 0; i < thread_count; i++) {
505 /*
506 * we use it as an indicator of whether the thread is
507 * properly initialized or not
508 */
509 if (!comp_param[i].file) {
510 break;
511 }
512
513 qemu_mutex_lock(&comp_param[i].mutex);
514 comp_param[i].quit = true;
515 qemu_cond_signal(&comp_param[i].cond);
516 qemu_mutex_unlock(&comp_param[i].mutex);
517
518 qemu_thread_join(compress_threads + i);
519 qemu_mutex_destroy(&comp_param[i].mutex);
520 qemu_cond_destroy(&comp_param[i].cond);
521 deflateEnd(&comp_param[i].stream);
522 g_free(comp_param[i].originbuf);
523 qemu_fclose(comp_param[i].file);
524 comp_param[i].file = NULL;
525 }
526 qemu_mutex_destroy(&comp_done_lock);
527 qemu_cond_destroy(&comp_done_cond);
528 g_free(compress_threads);
529 g_free(comp_param);
530 compress_threads = NULL;
531 comp_param = NULL;
532 }
533
534 static int compress_threads_save_setup(void)
535 {
536 int i, thread_count;
537
538 if (!migrate_use_compression()) {
539 return 0;
540 }
541 thread_count = migrate_compress_threads();
542 compress_threads = g_new0(QemuThread, thread_count);
543 comp_param = g_new0(CompressParam, thread_count);
544 qemu_cond_init(&comp_done_cond);
545 qemu_mutex_init(&comp_done_lock);
546 for (i = 0; i < thread_count; i++) {
547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548 if (!comp_param[i].originbuf) {
549 goto exit;
550 }
551
552 if (deflateInit(&comp_param[i].stream,
553 migrate_compress_level()) != Z_OK) {
554 g_free(comp_param[i].originbuf);
555 goto exit;
556 }
557
558 /* comp_param[i].file is just used as a dummy buffer to save data,
559 * set its ops to empty.
560 */
561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562 comp_param[i].done = true;
563 comp_param[i].quit = false;
564 qemu_mutex_init(&comp_param[i].mutex);
565 qemu_cond_init(&comp_param[i].cond);
566 qemu_thread_create(compress_threads + i, "compress",
567 do_data_compress, comp_param + i,
568 QEMU_THREAD_JOINABLE);
569 }
570 return 0;
571
572 exit:
573 compress_threads_save_cleanup();
574 return -1;
575 }
576
577 /**
578 * save_page_header: write page header to wire
579 *
580 * If this is the 1st block, it also writes the block identification
581 *
582 * Returns the number of bytes written
583 *
584 * @f: QEMUFile where to send the data
585 * @block: block that contains the page we want to send
586 * @offset: offset inside the block for the page
587 * in the lower bits, it contains flags
588 */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
590 ram_addr_t offset)
591 {
592 size_t size, len;
593
594 if (block == rs->last_sent_block) {
595 offset |= RAM_SAVE_FLAG_CONTINUE;
596 }
597 qemu_put_be64(f, offset);
598 size = 8;
599
600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601 len = strlen(block->idstr);
602 qemu_put_byte(f, len);
603 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604 size += 1 + len;
605 rs->last_sent_block = block;
606 }
607 return size;
608 }
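/*
 * Illustrative sketch (not part of the original file): the wire layout
 * written by save_page_header() and how a hypothetical reader would
 * consume it.  The 256-byte buffer mirrors the limit implied by the
 * one-byte length field.
 *
 *   be64  offset | flags            (flags live in the low bits)
 *   [ u8 len, u8 idstr[len] ]       only when RAM_SAVE_FLAG_CONTINUE is clear
 */
#if 0   /* example only, not compiled */
static void example_read_page_header(QEMUFile *f)
{
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK;

    if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
        char idstr[256];
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = 0;
    }
}
#endif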
609
610 /**
611 * mig_throttle_guest_down: throttle down the guest
612 *
613 * Reduce the amount of guest CPU execution to hopefully slow down memory
614 * writes. If guest dirty memory rate is reduced below the rate at
615 * which we can transfer pages to the destination then we should be
616 * able to complete migration. Some workloads dirty memory way too
617 * fast and will not effectively converge, even with auto-converge.
618 */
619 static void mig_throttle_guest_down(void)
620 {
621 MigrationState *s = migrate_get_current();
622 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
624 int pct_max = s->parameters.max_cpu_throttle;
625
626 /* We have not started throttling yet. Let's start it. */
627 if (!cpu_throttle_active()) {
628 cpu_throttle_set(pct_initial);
629 } else {
630 /* Throttling already on, just increase the rate */
631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
632 pct_max));
633 }
634 }
635
636 /**
637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638 *
639 * @rs: current RAM state
640 * @current_addr: address for the zero page
641 *
642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
643 * The important thing is that a stale (not-yet-0'd) page be replaced
644 * by the new data.
645 * As a bonus, if the page wasn't in the cache it gets added so that
646 * when a small write is made into the 0'd page it gets XBZRLE sent.
647 */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651 return;
652 }
653
654 /* We don't care if this fails to allocate a new cache page
655 * as long as it updated an old one */
656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657 ram_counters.dirty_sync_count);
658 }
659
660 #define ENCODING_FLAG_XBZRLE 0x1
661
662 /**
663 * save_xbzrle_page: compress and send current page
664 *
665 * Returns: 1 means that we wrote the page
666 * 0 means that page is identical to the one already sent
667 * -1 means that xbzrle would be longer than normal
668 *
669 * @rs: current RAM state
670 * @current_data: pointer to the address of the page contents
671 * @current_addr: addr of the page
672 * @block: block that contains the page we want to send
673 * @offset: offset inside the block for the page
674 * @last_stage: if we are at the completion stage
675 */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677 ram_addr_t current_addr, RAMBlock *block,
678 ram_addr_t offset, bool last_stage)
679 {
680 int encoded_len = 0, bytes_xbzrle;
681 uint8_t *prev_cached_page;
682
683 if (!cache_is_cached(XBZRLE.cache, current_addr,
684 ram_counters.dirty_sync_count)) {
685 xbzrle_counters.cache_miss++;
686 if (!last_stage) {
687 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688 ram_counters.dirty_sync_count) == -1) {
689 return -1;
690 } else {
691 /* update *current_data when the page has been
692 inserted into cache */
693 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694 }
695 }
696 return -1;
697 }
698
699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700
701 /* save current buffer into memory */
702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703
704 /* XBZRLE encoding (if there is no overflow) */
705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707 TARGET_PAGE_SIZE);
708
709 /*
710 * Update the cache contents, so that it corresponds to the data
711 * sent, in all cases except where we skip the page.
712 */
713 if (!last_stage && encoded_len != 0) {
714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715 /*
716 * In the case where we couldn't compress, ensure that the caller
717 * sends the data from the cache, since the guest might have
718 * changed the RAM since we copied it.
719 */
720 *current_data = prev_cached_page;
721 }
722
723 if (encoded_len == 0) {
724 trace_save_xbzrle_page_skipping();
725 return 0;
726 } else if (encoded_len == -1) {
727 trace_save_xbzrle_page_overflow();
728 xbzrle_counters.overflow++;
729 return -1;
730 }
731
732 /* Send XBZRLE based compressed page */
733 bytes_xbzrle = save_page_header(rs, rs->f, block,
734 offset | RAM_SAVE_FLAG_XBZRLE);
735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736 qemu_put_be16(rs->f, encoded_len);
737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738 bytes_xbzrle += encoded_len + 1 + 2;
739 xbzrle_counters.pages++;
740 xbzrle_counters.bytes += bytes_xbzrle;
741 ram_counters.transferred += bytes_xbzrle;
742
743 return 1;
744 }
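/*
 * Illustrative sketch (not part of the original file): the matching
 * decode step for the XBZRLE record written above (one flag byte, a be16
 * length, then the encoded delta).  The real load-side counterpart lives
 * further down in this file; this hypothetical helper only shows the shape.
 */
#if 0   /* example only, not compiled */
static int example_load_xbzrle(QEMUFile *f, void *host)
{
    unsigned int xh_len;
    int xh_flags = qemu_get_byte(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        return -1;
    }
    xh_len = qemu_get_be16(f);
    if (xh_len > TARGET_PAGE_SIZE) {
        return -1;
    }
    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);
    /* apply the delta on top of the previously received copy of the page */
    return xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host,
                                TARGET_PAGE_SIZE) == -1 ? -1 : 0;
}
#endif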
745
746 /**
747 * migration_bitmap_find_dirty: find the next dirty page from start
748 *
749 * Returns the page offset within memory region of the start of a dirty page
750 *
751 * @rs: current RAM state
752 * @rb: RAMBlock where to search for dirty pages
753 * @start: page where we start the search
754 */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757 unsigned long start)
758 {
759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760 unsigned long *bitmap = rb->bmap;
761 unsigned long next;
762
763 if (ramblock_is_ignored(rb)) {
764 return size;
765 }
766
767 /*
768 * When the free page optimization is enabled, we need to check the bitmap
769 * to send the non-free pages rather than all the pages in the bulk stage.
770 */
771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772 next = start + 1;
773 } else {
774 next = find_next_bit(bitmap, size, start);
775 }
776
777 return next;
778 }
779
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781 RAMBlock *rb,
782 unsigned long page)
783 {
784 bool ret;
785
786 qemu_mutex_lock(&rs->bitmap_mutex);
787
788 /*
789 * Clear dirty bitmap if needed. This _must_ be called before we
790 * send any of the page in the chunk because we need to make sure
791 * we can capture further page content changes when we sync dirty
792 * log the next time. So as long as we are going to send any of
793 * the page in the chunk we clear the remote dirty bitmap for all.
794 * Clearing it earlier won't be a problem, but too late will.
795 */
796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797 uint8_t shift = rb->clear_bmap_shift;
798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800
801 /*
802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803 * can make things easier sometimes since then start address
804 * of the small chunk will always be 64 pages aligned so the
805 * bitmap will always be aligned to unsigned long. We should
806 * even be able to remove this restriction but I'm simply
807 * keeping it.
808 */
809 assert(shift >= 6);
810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811 memory_region_clear_dirty_bitmap(rb->mr, start, size);
812 }
813
814 ret = test_and_clear_bit(page, rb->bmap);
815
816 if (ret) {
817 rs->migration_dirty_pages--;
818 }
819 qemu_mutex_unlock(&rs->bitmap_mutex);
820
821 return ret;
822 }
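/*
 * Worked example (not part of the original file) of the chunk arithmetic
 * above, using made-up values: a clear_bmap_shift of 18 with 4 KiB target
 * pages gives 1 GiB chunks, so one clear_bmap bit covers 2^18 target pages.
 */
#if 0   /* example only, not compiled */
static void example_clear_bmap_chunk_math(void)
{
    uint8_t shift = 18;                                /* hypothetical value */
    hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);  /* 1 GiB with 4K pages */
    hwaddr start = (((ram_addr_t)0x12345) << TARGET_PAGE_BITS) & (-size);

    /* the start of the chunk is always aligned to the chunk size */
    assert(start % size == 0);
}
#endif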
823
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827 rs->migration_dirty_pages +=
828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829 &rs->num_dirty_pages_period);
830 }
831
832 /**
833 * ram_pagesize_summary: calculate all the pagesizes of a VM
834 *
835 * Returns a summary bitmap of the page sizes of all RAMBlocks
836 *
837 * For VMs with just normal pages this is equivalent to the host page
838 * size. If it's got some huge pages then it's the OR of all the
839 * different page sizes.
840 */
841 uint64_t ram_pagesize_summary(void)
842 {
843 RAMBlock *block;
844 uint64_t summary = 0;
845
846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847 summary |= block->page_size;
848 }
849
850 return summary;
851 }
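/*
 * Illustrative sketch (not part of the original file): one hypothetical
 * way to use the summary bitmap, assuming the common case where normal
 * pages equal TARGET_PAGE_SIZE on the host.
 */
#if 0   /* example only, not compiled */
static bool example_vm_has_larger_pages(void)
{
    /* any bit above TARGET_PAGE_SIZE means some block uses bigger pages */
    return ram_pagesize_summary() > TARGET_PAGE_SIZE;
}
#endif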
852
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855 return ram_counters.normal + ram_counters.duplicate +
856 compression_counters.pages + xbzrle_counters.pages;
857 }
858
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862 double compressed_size;
863
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867
868 if (!page_count) {
869 return;
870 }
871
872 if (migrate_use_xbzrle()) {
873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874 rs->xbzrle_cache_miss_prev) / page_count;
875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876 }
877
878 if (migrate_use_compression()) {
879 compression_counters.busy_rate = (double)(compression_counters.busy -
880 rs->compress_thread_busy_prev) / page_count;
881 rs->compress_thread_busy_prev = compression_counters.busy;
882
883 compressed_size = compression_counters.compressed_size -
884 rs->compressed_size_prev;
885 if (compressed_size) {
886 double uncompressed_size = (compression_counters.pages -
887 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888
889 /* Compression-Ratio = Uncompressed-size / Compressed-size */
890 compression_counters.compression_rate =
891 uncompressed_size / compressed_size;
892
893 rs->compress_pages_prev = compression_counters.pages;
894 rs->compressed_size_prev = compression_counters.compressed_size;
895 }
896 }
897 }
898
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901 MigrationState *s = migrate_get_current();
902 uint64_t threshold = s->parameters.throttle_trigger_threshold;
903
904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907
908 /* During block migration the auto-converge logic incorrectly detects
909 * that ram migration makes no progress. Avoid this by disabling the
910 * throttling logic during the bulk phase of block migration. */
911 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912 /* The following detection logic can be refined later. For now:
913 Check to see if the ratio between dirtied bytes and the approx.
914 amount of bytes that just got transferred since the last time
915 we were in this routine reaches the threshold. If that happens
916 twice, start or increase throttling. */
917
918 if ((bytes_dirty_period > bytes_dirty_threshold) &&
919 (++rs->dirty_rate_high_cnt >= 2)) {
920 trace_migration_throttle();
921 rs->dirty_rate_high_cnt = 0;
922 mig_throttle_guest_down();
923 }
924 }
925 }
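/*
 * Worked example (not part of the original file) of the trigger condition
 * above, with made-up numbers: threshold 50%, 100 MiB sent and 60 MiB
 * dirtied in the same period means the guest is dirtying faster than we
 * can send, so two such periods in a row start (or increase) throttling.
 */
#if 0   /* example only, not compiled */
static void example_throttle_trigger_math(void)
{
    uint64_t threshold = 50;                         /* percent, hypothetical */
    uint64_t bytes_xfer_period = 100 * 1024 * 1024;  /* sent this period */
    uint64_t bytes_dirty_period = 60 * 1024 * 1024;  /* dirtied this period */
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    assert(bytes_dirty_period > bytes_dirty_threshold);
}
#endif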
926
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929 RAMBlock *block;
930 int64_t end_time;
931
932 ram_counters.dirty_sync_count++;
933
934 if (!rs->time_last_bitmap_sync) {
935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936 }
937
938 trace_migration_bitmap_sync_start();
939 memory_global_dirty_log_sync();
940
941 qemu_mutex_lock(&rs->bitmap_mutex);
942 WITH_RCU_READ_LOCK_GUARD() {
943 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944 ramblock_sync_dirty_bitmap(rs, block);
945 }
946 ram_counters.remaining = ram_bytes_remaining();
947 }
948 qemu_mutex_unlock(&rs->bitmap_mutex);
949
950 memory_global_after_dirty_log_sync();
951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952
953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954
955 /* more than 1 second = 1000 milliseconds */
956 if (end_time > rs->time_last_bitmap_sync + 1000) {
957 migration_trigger_throttle(rs);
958
959 migration_update_rates(rs, end_time);
960
961 rs->target_page_count_prev = rs->target_page_count;
962
963 /* reset period counters */
964 rs->time_last_bitmap_sync = end_time;
965 rs->num_dirty_pages_period = 0;
966 rs->bytes_xfer_prev = ram_counters.transferred;
967 }
968 if (migrate_use_events()) {
969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970 }
971 }
972
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975 Error *local_err = NULL;
976
977 /*
978 * The current notifier usage is just an optimization to migration, so we
979 * don't stop the normal migration process in the error case.
980 */
981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982 error_report_err(local_err);
983 }
984
985 migration_bitmap_sync(rs);
986
987 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
988 error_report_err(local_err);
989 }
990 }
991
992 /**
993 * save_zero_page_to_file: send the zero page to the file
994 *
995 * Returns the size of data written to the file, 0 means the page is not
996 * a zero page
997 *
998 * @rs: current RAM state
999 * @file: the file where the data is saved
1000 * @block: block that contains the page we want to send
1001 * @offset: offset inside the block for the page
1002 */
1003 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1004 RAMBlock *block, ram_addr_t offset)
1005 {
1006 uint8_t *p = block->host + offset;
1007 int len = 0;
1008
1009 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1010 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1011 qemu_put_byte(file, 0);
1012 len += 1;
1013 }
1014 return len;
1015 }
1016
1017 /**
1018 * save_zero_page: send the zero page to the stream
1019 *
1020 * Returns the number of pages written.
1021 *
1022 * @rs: current RAM state
1023 * @block: block that contains the page we want to send
1024 * @offset: offset inside the block for the page
1025 */
1026 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1027 {
1028 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1029
1030 if (len) {
1031 ram_counters.duplicate++;
1032 ram_counters.transferred += len;
1033 return 1;
1034 }
1035 return -1;
1036 }
1037
1038 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1039 {
1040 if (!migrate_release_ram() || !migration_in_postcopy()) {
1041 return;
1042 }
1043
1044 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1045 }
1046
1047 /*
1048 * @pages: the number of pages written by the control path,
1049 * < 0 - error
1050 * > 0 - number of pages written
1051 *
1052 * Return true if the page has been saved, otherwise false is returned.
1053 */
1054 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1055 int *pages)
1056 {
1057 uint64_t bytes_xmit = 0;
1058 int ret;
1059
1060 *pages = -1;
1061 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1062 &bytes_xmit);
1063 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1064 return false;
1065 }
1066
1067 if (bytes_xmit) {
1068 ram_counters.transferred += bytes_xmit;
1069 *pages = 1;
1070 }
1071
1072 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1073 return true;
1074 }
1075
1076 if (bytes_xmit > 0) {
1077 ram_counters.normal++;
1078 } else if (bytes_xmit == 0) {
1079 ram_counters.duplicate++;
1080 }
1081
1082 return true;
1083 }
1084
1085 /*
1086 * directly send the page to the stream
1087 *
1088 * Returns the number of pages written.
1089 *
1090 * @rs: current RAM state
1091 * @block: block that contains the page we want to send
1092 * @offset: offset inside the block for the page
1093 * @buf: the page to be sent
1094 * @async: send the page asynchronously
1095 */
1096 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1097 uint8_t *buf, bool async)
1098 {
1099 ram_counters.transferred += save_page_header(rs, rs->f, block,
1100 offset | RAM_SAVE_FLAG_PAGE);
1101 if (async) {
1102 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1103 migrate_release_ram() &&
1104 migration_in_postcopy());
1105 } else {
1106 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1107 }
1108 ram_counters.transferred += TARGET_PAGE_SIZE;
1109 ram_counters.normal++;
1110 return 1;
1111 }
1112
1113 /**
1114 * ram_save_page: send the given page to the stream
1115 *
1116 * Returns the number of pages written.
1117 * < 0 - error
1118 * >=0 - Number of pages written - this might legally be 0
1119 * if xbzrle noticed the page was the same.
1120 *
1121 * @rs: current RAM state
1122 * @block: block that contains the page we want to send
1123 * @offset: offset inside the block for the page
1124 * @last_stage: if we are at the completion stage
1125 */
1126 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1127 {
1128 int pages = -1;
1129 uint8_t *p;
1130 bool send_async = true;
1131 RAMBlock *block = pss->block;
1132 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1133 ram_addr_t current_addr = block->offset + offset;
1134
1135 p = block->host + offset;
1136 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1137
1138 XBZRLE_cache_lock();
1139 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1140 migrate_use_xbzrle()) {
1141 pages = save_xbzrle_page(rs, &p, current_addr, block,
1142 offset, last_stage);
1143 if (!last_stage) {
1144 /* Can't send this cached data async, since the cache page
1145 * might get updated before it gets to the wire
1146 */
1147 send_async = false;
1148 }
1149 }
1150
1151 /* XBZRLE overflow or normal page */
1152 if (pages == -1) {
1153 pages = save_normal_page(rs, block, offset, p, send_async);
1154 }
1155
1156 XBZRLE_cache_unlock();
1157
1158 return pages;
1159 }
1160
1161 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1162 ram_addr_t offset)
1163 {
1164 if (multifd_queue_page(rs->f, block, offset) < 0) {
1165 return -1;
1166 }
1167 ram_counters.normal++;
1168
1169 return 1;
1170 }
1171
1172 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1173 ram_addr_t offset, uint8_t *source_buf)
1174 {
1175 RAMState *rs = ram_state;
1176 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1177 bool zero_page = false;
1178 int ret;
1179
1180 if (save_zero_page_to_file(rs, f, block, offset)) {
1181 zero_page = true;
1182 goto exit;
1183 }
1184
1185 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1186
1187 /*
1188 * copy it to an internal buffer to avoid it being modified by the VM
1189 * so that we can catch any error during compression and
1190 * decompression
1191 */
1192 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1193 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1194 if (ret < 0) {
1195 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1196 error_report("compressed data failed!");
1197 return false;
1198 }
1199
1200 exit:
1201 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1202 return zero_page;
1203 }
1204
1205 static void
1206 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1207 {
1208 ram_counters.transferred += bytes_xmit;
1209
1210 if (param->zero_page) {
1211 ram_counters.duplicate++;
1212 return;
1213 }
1214
1215 /* 8 is the size of the header when RAM_SAVE_FLAG_CONTINUE is set. */
1216 compression_counters.compressed_size += bytes_xmit - 8;
1217 compression_counters.pages++;
1218 }
1219
1220 static bool save_page_use_compression(RAMState *rs);
1221
1222 static void flush_compressed_data(RAMState *rs)
1223 {
1224 int idx, len, thread_count;
1225
1226 if (!save_page_use_compression(rs)) {
1227 return;
1228 }
1229 thread_count = migrate_compress_threads();
1230
1231 qemu_mutex_lock(&comp_done_lock);
1232 for (idx = 0; idx < thread_count; idx++) {
1233 while (!comp_param[idx].done) {
1234 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1235 }
1236 }
1237 qemu_mutex_unlock(&comp_done_lock);
1238
1239 for (idx = 0; idx < thread_count; idx++) {
1240 qemu_mutex_lock(&comp_param[idx].mutex);
1241 if (!comp_param[idx].quit) {
1242 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1243 /*
1244 * it's safe to fetch zero_page without holding comp_done_lock
1245 * as there is no further request submitted to the thread,
1246 * i.e., the thread should be waiting for a request at this point.
1247 */
1248 update_compress_thread_counts(&comp_param[idx], len);
1249 }
1250 qemu_mutex_unlock(&comp_param[idx].mutex);
1251 }
1252 }
1253
1254 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1255 ram_addr_t offset)
1256 {
1257 param->block = block;
1258 param->offset = offset;
1259 }
1260
1261 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1262 ram_addr_t offset)
1263 {
1264 int idx, thread_count, bytes_xmit = -1, pages = -1;
1265 bool wait = migrate_compress_wait_thread();
1266
1267 thread_count = migrate_compress_threads();
1268 qemu_mutex_lock(&comp_done_lock);
1269 retry:
1270 for (idx = 0; idx < thread_count; idx++) {
1271 if (comp_param[idx].done) {
1272 comp_param[idx].done = false;
1273 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1274 qemu_mutex_lock(&comp_param[idx].mutex);
1275 set_compress_params(&comp_param[idx], block, offset);
1276 qemu_cond_signal(&comp_param[idx].cond);
1277 qemu_mutex_unlock(&comp_param[idx].mutex);
1278 pages = 1;
1279 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1280 break;
1281 }
1282 }
1283
1284 /*
1285 * wait for the free thread if the user specifies 'compress-wait-thread',
1286 * otherwise we will post the page out in the main thread as a normal page.
1287 */
1288 if (pages < 0 && wait) {
1289 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1290 goto retry;
1291 }
1292 qemu_mutex_unlock(&comp_done_lock);
1293
1294 return pages;
1295 }
1296
1297 /**
1298 * find_dirty_block: find the next dirty page and update any state
1299 * associated with the search process.
1300 *
1301 * Returns true if a page is found
1302 *
1303 * @rs: current RAM state
1304 * @pss: data about the state of the current dirty page scan
1305 * @again: set to false if the search has scanned the whole of RAM
1306 */
1307 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1308 {
1309 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1310 if (pss->complete_round && pss->block == rs->last_seen_block &&
1311 pss->page >= rs->last_page) {
1312 /*
1313 * We've been once around the RAM and haven't found anything.
1314 * Give up.
1315 */
1316 *again = false;
1317 return false;
1318 }
1319 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1320 >= pss->block->used_length) {
1321 /* Didn't find anything in this RAM Block */
1322 pss->page = 0;
1323 pss->block = QLIST_NEXT_RCU(pss->block, next);
1324 if (!pss->block) {
1325 /*
1326 * If memory migration starts over, we will meet a dirtied page
1327 * which may still exist in a compression thread's ring, so we
1328 * should flush the compressed data to make sure the new page
1329 * is not overwritten by the old one in the destination.
1330 *
1331 * Also, if xbzrle is on, stop using the data compression at this
1332 * point. In theory, xbzrle can do better than compression.
1333 */
1334 flush_compressed_data(rs);
1335
1336 /* Hit the end of the list */
1337 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1338 /* Flag that we've looped */
1339 pss->complete_round = true;
1340 rs->ram_bulk_stage = false;
1341 }
1342 /* Didn't find anything this time, but try again on the new block */
1343 *again = true;
1344 return false;
1345 } else {
1346 /* Can go around again, but... */
1347 *again = true;
1348 /* We've found something so probably don't need to */
1349 return true;
1350 }
1351 }
1352
1353 /**
1354 * unqueue_page: gets a page of the queue
1355 *
1356 * Helper for 'get_queued_page' - gets a page off the queue
1357 *
1358 * Returns the block of the page (or NULL if none available)
1359 *
1360 * @rs: current RAM state
1361 * @offset: used to return the offset within the RAMBlock
1362 */
1363 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1364 {
1365 RAMBlock *block = NULL;
1366
1367 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1368 return NULL;
1369 }
1370
1371 qemu_mutex_lock(&rs->src_page_req_mutex);
1372 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1373 struct RAMSrcPageRequest *entry =
1374 QSIMPLEQ_FIRST(&rs->src_page_requests);
1375 block = entry->rb;
1376 *offset = entry->offset;
1377
1378 if (entry->len > TARGET_PAGE_SIZE) {
1379 entry->len -= TARGET_PAGE_SIZE;
1380 entry->offset += TARGET_PAGE_SIZE;
1381 } else {
1382 memory_region_unref(block->mr);
1383 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1384 g_free(entry);
1385 migration_consume_urgent_request();
1386 }
1387 }
1388 qemu_mutex_unlock(&rs->src_page_req_mutex);
1389
1390 return block;
1391 }
1392
1393 /**
1394 * get_queued_page: unqueue a page from the postcopy requests
1395 *
1396 * Skips pages that are already sent (!dirty)
1397 *
1398 * Returns true if a queued page is found
1399 *
1400 * @rs: current RAM state
1401 * @pss: data about the state of the current dirty page scan
1402 */
1403 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1404 {
1405 RAMBlock *block;
1406 ram_addr_t offset;
1407 bool dirty;
1408
1409 do {
1410 block = unqueue_page(rs, &offset);
1411 /*
1412 * We're sending this page, and since it's postcopy nothing else
1413 * will dirty it, and we must make sure it doesn't get sent again
1414 * even if this queue request was received after the background
1415 * search already sent it.
1416 */
1417 if (block) {
1418 unsigned long page;
1419
1420 page = offset >> TARGET_PAGE_BITS;
1421 dirty = test_bit(page, block->bmap);
1422 if (!dirty) {
1423 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1424 page);
1425 } else {
1426 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1427 }
1428 }
1429
1430 } while (block && !dirty);
1431
1432 if (block) {
1433 /*
1434 * As soon as we start servicing pages out of order, we have
1435 * to kill the bulk stage, since the bulk stage assumes
1436 * in (migration_bitmap_find_and_reset_dirty) that every page is
1437 * dirty; that's no longer true.
1438 */
1439 rs->ram_bulk_stage = false;
1440
1441 /*
1442 * We want the background search to continue from the queued page
1443 * since the guest is likely to want other pages near to the page
1444 * it just requested.
1445 */
1446 pss->block = block;
1447 pss->page = offset >> TARGET_PAGE_BITS;
1448
1449 /*
1450 * This unqueued page would break the "one round" check, even if
1451 * it is really rare.
1452 */
1453 pss->complete_round = false;
1454 }
1455
1456 return !!block;
1457 }
1458
1459 /**
1460 * migration_page_queue_free: drop any remaining pages in the ram
1461 * request queue
1462 *
1463 * It should be empty at the end anyway, but in error cases there may
1464 * be some left. In case there are any pages left, we drop them.
1465 *
1466 */
1467 static void migration_page_queue_free(RAMState *rs)
1468 {
1469 struct RAMSrcPageRequest *mspr, *next_mspr;
1470 /* This queue generally should be empty - but in the case of a failed
1471 * migration it might have some leftover entries.
1472 */
1473 RCU_READ_LOCK_GUARD();
1474 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1475 memory_region_unref(mspr->rb->mr);
1476 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1477 g_free(mspr);
1478 }
1479 }
1480
1481 /**
1482 * ram_save_queue_pages: queue the page for transmission
1483 *
1484 * A request from postcopy destination for example.
1485 *
1486 * Returns zero on success or negative on error
1487 *
1488 * @rbname: Name of the RAMBlock of the request. NULL means the
1489 * same as the last one.
1490 * @start: starting address from the start of the RAMBlock
1491 * @len: length (in bytes) to send
1492 */
1493 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1494 {
1495 RAMBlock *ramblock;
1496 RAMState *rs = ram_state;
1497
1498 ram_counters.postcopy_requests++;
1499 RCU_READ_LOCK_GUARD();
1500
1501 if (!rbname) {
1502 /* Reuse last RAMBlock */
1503 ramblock = rs->last_req_rb;
1504
1505 if (!ramblock) {
1506 /*
1507 * Shouldn't happen, we can't reuse the last RAMBlock if
1508 * it's the 1st request.
1509 */
1510 error_report("ram_save_queue_pages no previous block");
1511 return -1;
1512 }
1513 } else {
1514 ramblock = qemu_ram_block_by_name(rbname);
1515
1516 if (!ramblock) {
1517 /* We shouldn't be asked for a non-existent RAMBlock */
1518 error_report("ram_save_queue_pages no block '%s'", rbname);
1519 return -1;
1520 }
1521 rs->last_req_rb = ramblock;
1522 }
1523 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1524 if (start + len > ramblock->used_length) {
1525 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1526 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1527 __func__, start, len, ramblock->used_length);
1528 return -1;
1529 }
1530
1531 struct RAMSrcPageRequest *new_entry =
1532 g_malloc0(sizeof(struct RAMSrcPageRequest));
1533 new_entry->rb = ramblock;
1534 new_entry->offset = start;
1535 new_entry->len = len;
1536
1537 memory_region_ref(ramblock->mr);
1538 qemu_mutex_lock(&rs->src_page_req_mutex);
1539 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1540 migration_make_urgent_request();
1541 qemu_mutex_unlock(&rs->src_page_req_mutex);
1542
1543 return 0;
1544 }
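/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller queueing one urgent page, the way the postcopy fault path does.
 * The block name and offset are made-up example values.
 */
#if 0   /* example only, not compiled */
static void example_request_urgent_page(void)
{
    if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
        error_report("failed to queue postcopy page request");
    }
}
#endif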
1545
1546 static bool save_page_use_compression(RAMState *rs)
1547 {
1548 if (!migrate_use_compression()) {
1549 return false;
1550 }
1551
1552 /*
1553 * If xbzrle is on, stop using the data compression after the first
1554 * round of migration even if compression is enabled. In theory,
1555 * xbzrle can do better than compression.
1556 */
1557 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1558 return true;
1559 }
1560
1561 return false;
1562 }
1563
1564 /*
1565 * try to compress the page before posting it out, return true if the page
1566 * has been properly handled by compression, otherwise it needs other
1567 * paths to handle it
1568 */
1569 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1570 {
1571 if (!save_page_use_compression(rs)) {
1572 return false;
1573 }
1574
1575 /*
1576 * When starting the process of a new block, the first page of
1577 * the block should be sent out before other pages in the same
1578 * block, and all the pages in the last block should have been sent
1579 * out. Keeping this order is important, because the 'cont' flag
1580 * is used to avoid resending the block name.
1581 *
1582 * We post the first page as a normal page as compression will take
1583 * much CPU resource.
1584 */
1585 if (block != rs->last_sent_block) {
1586 flush_compressed_data(rs);
1587 return false;
1588 }
1589
1590 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1591 return true;
1592 }
1593
1594 compression_counters.busy++;
1595 return false;
1596 }
1597
1598 /**
1599 * ram_save_target_page: save one target page
1600 *
1601 * Returns the number of pages written
1602 *
1603 * @rs: current RAM state
1604 * @pss: data about the page we want to send
1605 * @last_stage: if we are at the completion stage
1606 */
1607 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1608 bool last_stage)
1609 {
1610 RAMBlock *block = pss->block;
1611 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1612 int res;
1613
1614 if (control_save_page(rs, block, offset, &res)) {
1615 return res;
1616 }
1617
1618 if (save_compress_page(rs, block, offset)) {
1619 return 1;
1620 }
1621
1622 res = save_zero_page(rs, block, offset);
1623 if (res > 0) {
1624 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1625 * page would be stale
1626 */
1627 if (!save_page_use_compression(rs)) {
1628 XBZRLE_cache_lock();
1629 xbzrle_cache_zero_page(rs, block->offset + offset);
1630 XBZRLE_cache_unlock();
1631 }
1632 ram_release_pages(block->idstr, offset, res);
1633 return res;
1634 }
1635
1636 /*
1637 * Do not use multifd for:
1638 * 1. Compression as the first page in the new block should be posted out
1639 * before sending the compressed page
1640 * 2. In postcopy as one whole host page should be placed
1641 */
1642 if (!save_page_use_compression(rs) && migrate_use_multifd()
1643 && !migration_in_postcopy()) {
1644 return ram_save_multifd_page(rs, block, offset);
1645 }
1646
1647 return ram_save_page(rs, pss, last_stage);
1648 }
1649
1650 /**
1651 * ram_save_host_page: save a whole host page
1652 *
1653 * Starting at *offset send pages up to the end of the current host
1654 * page. It's valid for the initial offset to point into the middle of
1655 * a host page in which case the remainder of the hostpage is sent.
1656 * Only dirty target pages are sent. Note that the host page size may
1657 * be a huge page for this block.
1658 * The saving stops at the boundary of the used_length of the block
1659 * if the RAMBlock isn't a multiple of the host page size.
1660 *
1661 * Returns the number of pages written or negative on error
1662 *
1663 * @rs: current RAM state
1664 * @ms: current migration state
1665 * @pss: data about the page we want to send
1666 * @last_stage: if we are at the completion stage
1667 */
1668 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1669 bool last_stage)
1670 {
1671 int tmppages, pages = 0;
1672 size_t pagesize_bits =
1673 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1674
1675 if (ramblock_is_ignored(pss->block)) {
1676 error_report("block %s should not be migrated !", pss->block->idstr);
1677 return 0;
1678 }
1679
1680 do {
1681 /* Check if the page is dirty and if so, send it */
1682 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1683 pss->page++;
1684 continue;
1685 }
1686
1687 tmppages = ram_save_target_page(rs, pss, last_stage);
1688 if (tmppages < 0) {
1689 return tmppages;
1690 }
1691
1692 pages += tmppages;
1693 pss->page++;
1694 /* Allow rate limiting to happen in the middle of huge pages */
1695 migration_rate_limit();
1696 } while ((pss->page & (pagesize_bits - 1)) &&
1697 offset_in_ramblock(pss->block,
1698 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1699
1700 /* The offset we leave with is the last one we looked at */
1701 pss->page--;
1702 return pages;
1703 }
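/*
 * Worked example (not part of the original file) of the host-page loop
 * above, with made-up sizes: a 2 MiB hugepage-backed block and 4 KiB
 * target pages give pagesize_bits = 512, so up to 512 consecutive target
 * pages are considered before control returns to the caller.
 */
#if 0   /* example only, not compiled */
static void example_hostpage_math(void)
{
    size_t block_page_size = 2 * 1024 * 1024;   /* hypothetical 2 MiB hugepage */
    size_t pagesize_bits = block_page_size >> TARGET_PAGE_BITS;

    assert(pagesize_bits == 512);               /* assumes 4 KiB target pages */
}
#endif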
1704
1705 /**
1706 * ram_find_and_save_block: finds a dirty page and sends it to f
1707 *
1708 * Called within an RCU critical section.
1709 *
1710 * Returns the number of pages written where zero means no dirty pages,
1711 * or negative on error
1712 *
1713 * @rs: current RAM state
1714 * @last_stage: if we are at the completion stage
1715 *
1716 * On systems where host-page-size > target-page-size it will send all the
1717 * pages in a host page that are dirty.
1718 */
1719
1720 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1721 {
1722 PageSearchStatus pss;
1723 int pages = 0;
1724 bool again, found;
1725
1726 /* No dirty page as there is zero RAM */
1727 if (!ram_bytes_total()) {
1728 return pages;
1729 }
1730
1731 pss.block = rs->last_seen_block;
1732 pss.page = rs->last_page;
1733 pss.complete_round = false;
1734
1735 if (!pss.block) {
1736 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1737 }
1738
1739 do {
1740 again = true;
1741 found = get_queued_page(rs, &pss);
1742
1743 if (!found) {
1744 /* priority queue empty, so just search for something dirty */
1745 found = find_dirty_block(rs, &pss, &again);
1746 }
1747
1748 if (found) {
1749 pages = ram_save_host_page(rs, &pss, last_stage);
1750 }
1751 } while (!pages && again);
1752
1753 rs->last_seen_block = pss.block;
1754 rs->last_page = pss.page;
1755
1756 return pages;
1757 }
1758
1759 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1760 {
1761 uint64_t pages = size / TARGET_PAGE_SIZE;
1762
1763 if (zero) {
1764 ram_counters.duplicate += pages;
1765 } else {
1766 ram_counters.normal += pages;
1767 ram_counters.transferred += size;
1768 qemu_update_position(f, size);
1769 }
1770 }
1771
1772 static uint64_t ram_bytes_total_common(bool count_ignored)
1773 {
1774 RAMBlock *block;
1775 uint64_t total = 0;
1776
1777 RCU_READ_LOCK_GUARD();
1778
1779 if (count_ignored) {
1780 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1781 total += block->used_length;
1782 }
1783 } else {
1784 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785 total += block->used_length;
1786 }
1787 }
1788 return total;
1789 }
1790
1791 uint64_t ram_bytes_total(void)
1792 {
1793 return ram_bytes_total_common(false);
1794 }
1795
1796 static void xbzrle_load_setup(void)
1797 {
1798 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1799 }
1800
1801 static void xbzrle_load_cleanup(void)
1802 {
1803 g_free(XBZRLE.decoded_buf);
1804 XBZRLE.decoded_buf = NULL;
1805 }
1806
1807 static void ram_state_cleanup(RAMState **rsp)
1808 {
1809 if (*rsp) {
1810 migration_page_queue_free(*rsp);
1811 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1812 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1813 g_free(*rsp);
1814 *rsp = NULL;
1815 }
1816 }
1817
1818 static void xbzrle_cleanup(void)
1819 {
1820 XBZRLE_cache_lock();
1821 if (XBZRLE.cache) {
1822 cache_fini(XBZRLE.cache);
1823 g_free(XBZRLE.encoded_buf);
1824 g_free(XBZRLE.current_buf);
1825 g_free(XBZRLE.zero_target_page);
1826 XBZRLE.cache = NULL;
1827 XBZRLE.encoded_buf = NULL;
1828 XBZRLE.current_buf = NULL;
1829 XBZRLE.zero_target_page = NULL;
1830 }
1831 XBZRLE_cache_unlock();
1832 }
1833
1834 static void ram_save_cleanup(void *opaque)
1835 {
1836 RAMState **rsp = opaque;
1837 RAMBlock *block;
1838
1839 /* The caller must hold the iothread lock or be in a bh, so there is
1840 * no writing race against the migration bitmap
1841 */
1842 memory_global_dirty_log_stop();
1843
1844 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1845 g_free(block->clear_bmap);
1846 block->clear_bmap = NULL;
1847 g_free(block->bmap);
1848 block->bmap = NULL;
1849 }
1850
1851 xbzrle_cleanup();
1852 compress_threads_save_cleanup();
1853 ram_state_cleanup(rsp);
1854 }
1855
1856 static void ram_state_reset(RAMState *rs)
1857 {
1858 rs->last_seen_block = NULL;
1859 rs->last_sent_block = NULL;
1860 rs->last_page = 0;
1861 rs->last_version = ram_list.version;
1862 rs->ram_bulk_stage = true;
1863 rs->fpo_enabled = false;
1864 }
1865
1866 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1867
1868 /*
1869 * 'expected' is the value you expect the bitmap mostly to be full
1870 * of; it won't bother printing lines that are all this value.
1871 * If 'todump' is null the migration bitmap is dumped.
1872 */
1873 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1874 unsigned long pages)
1875 {
1876 int64_t cur;
1877 int64_t linelen = 128;
1878 char linebuf[129];
1879
1880 for (cur = 0; cur < pages; cur += linelen) {
1881 int64_t curb;
1882 bool found = false;
1883 /*
1884 * Last line; catch the case where the line length
1885 * is longer than remaining ram
1886 */
1887 if (cur + linelen > pages) {
1888 linelen = pages - cur;
1889 }
1890 for (curb = 0; curb < linelen; curb++) {
1891 bool thisbit = test_bit(cur + curb, todump);
1892 linebuf[curb] = thisbit ? '1' : '.';
1893 found = found || (thisbit != expected);
1894 }
1895 if (found) {
1896 linebuf[curb] = '\0';
1897 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1898 }
1899 }
1900 }
1901
1902 /* **** functions for postcopy ***** */
1903
1904 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1905 {
1906 struct RAMBlock *block;
1907
1908 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1909 unsigned long *bitmap = block->bmap;
1910 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1911 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1912
1913 while (run_start < range) {
1914 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1915 ram_discard_range(block->idstr,
1916 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1917 ((ram_addr_t)(run_end - run_start))
1918 << TARGET_PAGE_BITS);
1919 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1920 }
1921 }
1922 }
1923
1924 /**
1925 * postcopy_send_discard_bm_ram: discard a RAMBlock
1926 *
1927 * Returns zero on success
1928 *
1929 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1930 *
1931 * @ms: current migration state
1932 * @block: RAMBlock to discard
1933 */
1934 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1935 {
1936 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1937 unsigned long current;
1938 unsigned long *bitmap = block->bmap;
1939
1940 for (current = 0; current < end; ) {
1941 unsigned long one = find_next_bit(bitmap, end, current);
1942 unsigned long zero, discard_length;
1943
1944 if (one >= end) {
1945 break;
1946 }
1947
1948 zero = find_next_zero_bit(bitmap, end, one + 1);
1949
1950 if (zero >= end) {
1951 discard_length = end - one;
1952 } else {
1953 discard_length = zero - one;
1954 }
1955 postcopy_discard_send_range(ms, one, discard_length);
1956 current = one + discard_length;
1957 }
1958
1959 return 0;
1960 }
1961
1962 /**
1963 * postcopy_each_ram_send_discard: discard all RAMBlocks
1964 *
1965 * Returns 0 for success or negative for error
1966 *
1967 * Utility for the outgoing postcopy code.
1968 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1969 * passing it bitmap indexes and name.
1970 * (qemu_ram_foreach_block ends up passing unscaled lengths
1971 * which would mean postcopy code would have to deal with target page)
1972 *
1973 * @ms: current migration state
1974 */
1975 static int postcopy_each_ram_send_discard(MigrationState *ms)
1976 {
1977 struct RAMBlock *block;
1978 int ret;
1979
1980 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981 postcopy_discard_send_init(ms, block->idstr);
1982
1983 /*
1984 * Postcopy sends chunks of bitmap over the wire, but it
1985 * just needs indexes at this point, avoids it having
1986 * target page specific code.
1987 */
1988 ret = postcopy_send_discard_bm_ram(ms, block);
1989 postcopy_discard_send_finish(ms);
1990 if (ret) {
1991 return ret;
1992 }
1993 }
1994
1995 return 0;
1996 }
1997
1998 /**
1999 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2000 *
2001 * Helper for postcopy_chunk_hostpages; it canonicalizes the block's
2002 * dirty bitmap.
2003 *
2004 * Postcopy requires that all target pages in a hostpage are dirty or
2005 * clean, not a mix. This function canonicalizes the bitmap by marking
2006 * any partially dirty host page as fully dirty.
2007 *
2008 * @ms: current migration state
2009 * @block: block that contains the page we want to canonicalize
2010 */
2011 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2012 {
2013 RAMState *rs = ram_state;
2014 unsigned long *bitmap = block->bmap;
2015 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2016 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2017 unsigned long run_start;
2018
2019 if (block->page_size == TARGET_PAGE_SIZE) {
2020 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2021 return;
2022 }
2023
2024 /* Find a dirty page */
2025 run_start = find_next_bit(bitmap, pages, 0);
2026
2027 while (run_start < pages) {
2028
2029 /*
2030 * If the start of this run of pages is in the middle of a host
2031 * page, then we need to fixup this host page.
2032 */
2033 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2034 /* Find the end of this run */
2035 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2036 /*
2037 * If the end isn't at the start of a host page, then the
2038 * run doesn't finish at the end of a host page
2039 * and we need to discard.
2040 */
2041 }
2042
2043 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2044 unsigned long page;
2045 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2046 host_ratio);
2047 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2048
2049 /* Clean up the bitmap */
2050 for (page = fixup_start_addr;
2051 page < fixup_start_addr + host_ratio; page++) {
2052 /*
2053 * Remark them as dirty, updating the count for any pages
2054 * that weren't previously dirty.
2055 */
2056 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2057 }
2058 }
2059
2060 /* Find the next dirty page for the next iteration */
2061 run_start = find_next_bit(bitmap, pages, run_start);
2062 }
2063 }
2064
2065 /**
2066 * postcopy_chunk_hostpages: discard any partially sent host page
2067 *
2068 * Utility for the outgoing postcopy code.
2069 *
2070 * Discard any partially sent host-page size chunks, mark any partially
2071 * dirty host-page size chunks as all dirty. In this case the host-page
2072 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2073 *
2074 * Returns zero on success
2075 *
2076 * @ms: current migration state
2077 * @block: block we want to work with
2078 */
2079 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2080 {
2081 postcopy_discard_send_init(ms, block->idstr);
2082
2083 /*
2084 * Ensure that all partially dirty host pages are made fully dirty.
2085 */
2086 postcopy_chunk_hostpages_pass(ms, block);
2087
2088 postcopy_discard_send_finish(ms);
2089 return 0;
2090 }
2091
2092 /**
2093 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2094 *
2095 * Returns zero on success
2096 *
2097 * Transmit the set of pages to be discarded after precopy to the target;
2098 * these are pages that:
2099 * a) Have been previously transmitted but are now dirty again
2100 * b) Have never been transmitted; this ensures that
2101 * any pages on the destination that have been mapped by background
2102 * tasks get discarded (transparent huge pages are the specific concern)
2103 * Hopefully this is pretty sparse
2104 *
2105 * @ms: current migration state
2106 */
2107 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2108 {
2109 RAMState *rs = ram_state;
2110 RAMBlock *block;
2111 int ret;
2112
2113 RCU_READ_LOCK_GUARD();
2114
2115 /* This should be our last sync, the src is now paused */
2116 migration_bitmap_sync(rs);
2117
2118 /* Easiest way to make sure we don't resume in the middle of a host-page */
2119 rs->last_seen_block = NULL;
2120 rs->last_sent_block = NULL;
2121 rs->last_page = 0;
2122
2123 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2124 /* Deal with TPS != HPS and huge pages */
2125 ret = postcopy_chunk_hostpages(ms, block);
2126 if (ret) {
2127 return ret;
2128 }
2129
2130 #ifdef DEBUG_POSTCOPY
2131 ram_debug_dump_bitmap(block->bmap, true,
2132 block->used_length >> TARGET_PAGE_BITS);
2133 #endif
2134 }
2135 trace_ram_postcopy_send_discard_bitmap();
2136
2137 ret = postcopy_each_ram_send_discard(ms);
2138
2139 return ret;
2140 }
2141
2142 /**
2143 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2144 *
2145 * Returns zero on success
2146 *
2147 * @rbname: name of the RAMBlock of the request. NULL means the
2148 * same as the last one.
2149 * @start: starting offset within the RAMBlock, in bytes
2150 * @length: length of the range to discard, in bytes
2151 */
2152 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2153 {
2154 trace_ram_discard_range(rbname, start, length);
2155
2156 RCU_READ_LOCK_GUARD();
2157 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2158
2159 if (!rb) {
2160 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2161 return -1;
2162 }
2163
2164 /*
2165 * On source VM, we don't need to update the received bitmap since
2166 * we don't even have one.
2167 */
2168 if (rb->receivedmap) {
2169 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2170 length >> qemu_target_page_bits());
2171 }
2172
2173 return ram_block_discard_range(rb, start, length);
2174 }
2175
2176 /*
2177 * For every allocation, we try not to crash the VM if the
2178 * allocation fails.
2179 */
2180 static int xbzrle_init(void)
2181 {
2182 Error *local_err = NULL;
2183
2184 if (!migrate_use_xbzrle()) {
2185 return 0;
2186 }
2187
2188 XBZRLE_cache_lock();
2189
2190 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2191 if (!XBZRLE.zero_target_page) {
2192 error_report("%s: Error allocating zero page", __func__);
2193 goto err_out;
2194 }
2195
2196 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2197 TARGET_PAGE_SIZE, &local_err);
2198 if (!XBZRLE.cache) {
2199 error_report_err(local_err);
2200 goto free_zero_page;
2201 }
2202
2203 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2204 if (!XBZRLE.encoded_buf) {
2205 error_report("%s: Error allocating encoded_buf", __func__);
2206 goto free_cache;
2207 }
2208
2209 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2210 if (!XBZRLE.current_buf) {
2211 error_report("%s: Error allocating current_buf", __func__);
2212 goto free_encoded_buf;
2213 }
2214
2215 /* We are all good */
2216 XBZRLE_cache_unlock();
2217 return 0;
2218
2219 free_encoded_buf:
2220 g_free(XBZRLE.encoded_buf);
2221 XBZRLE.encoded_buf = NULL;
2222 free_cache:
2223 cache_fini(XBZRLE.cache);
2224 XBZRLE.cache = NULL;
2225 free_zero_page:
2226 g_free(XBZRLE.zero_target_page);
2227 XBZRLE.zero_target_page = NULL;
2228 err_out:
2229 XBZRLE_cache_unlock();
2230 return -ENOMEM;
2231 }
2232
2233 static int ram_state_init(RAMState **rsp)
2234 {
2235 *rsp = g_try_new0(RAMState, 1);
2236
2237 if (!*rsp) {
2238 error_report("%s: Init ramstate fail", __func__);
2239 return -1;
2240 }
2241
2242 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2243 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2244 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2245
2246 /*
2247 * Count the total number of pages used by ram blocks not including any
2248 * gaps due to alignment or unplugs.
2249 * This must match the initial value of the dirty bitmap.
2250 */
2251 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2252 ram_state_reset(*rsp);
2253
2254 return 0;
2255 }
2256
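/*
 * Allocate and fill the per-RAMBlock dirty bitmaps (bmap and clear_bmap)
 * for every block that takes part in migration. Each bmap starts with
 * all bits set so that every guest RAM page is migrated at least once.
 */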
2257 static void ram_list_init_bitmaps(void)
2258 {
2259 MigrationState *ms = migrate_get_current();
2260 RAMBlock *block;
2261 unsigned long pages;
2262 uint8_t shift;
2263
2264 /* Skip setting bitmap if there is no RAM */
2265 if (ram_bytes_total()) {
2266 shift = ms->clear_bitmap_shift;
2267 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2268 error_report("clear_bitmap_shift (%u) too big, using "
2269 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2270 shift = CLEAR_BITMAP_SHIFT_MAX;
2271 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2272 error_report("clear_bitmap_shift (%u) too small, using "
2273 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2274 shift = CLEAR_BITMAP_SHIFT_MIN;
2275 }
2276
2277 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2278 pages = block->max_length >> TARGET_PAGE_BITS;
2279 /*
2280 * The initial dirty bitmap for migration must be set with all
2281 * ones to make sure we'll migrate every guest RAM page to the
2282 * destination.
2283 * Here we set RAMBlock.bmap all to 1 because when restarting a
2284 * new migration after a failed one, ram_list.
2285 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2286 * guest memory.
2287 */
2288 block->bmap = bitmap_new(pages);
2289 bitmap_set(block->bmap, 0, pages);
2290 block->clear_bmap_shift = shift;
2291 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2292 }
2293 }
2294 }
2295
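/*
 * Initialize the dirty bitmaps, start dirty page logging and do a first
 * bitmap sync. Takes the iothread lock, the ramlist lock and the RCU
 * read lock for the duration of the setup.
 */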
2296 static void ram_init_bitmaps(RAMState *rs)
2297 {
2298 /* For memory_global_dirty_log_start below. */
2299 qemu_mutex_lock_iothread();
2300 qemu_mutex_lock_ramlist();
2301
2302 WITH_RCU_READ_LOCK_GUARD() {
2303 ram_list_init_bitmaps();
2304 memory_global_dirty_log_start();
2305 migration_bitmap_sync_precopy(rs);
2306 }
2307 qemu_mutex_unlock_ramlist();
2308 qemu_mutex_unlock_iothread();
2309 }
2310
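/*
 * Set up everything RAM migration needs on the source side: the
 * RAMState, the XBZRLE cache (when enabled) and the dirty bitmaps.
 * Returns 0 on success, -1 on error.
 */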
2311 static int ram_init_all(RAMState **rsp)
2312 {
2313 if (ram_state_init(rsp)) {
2314 return -1;
2315 }
2316
2317 if (xbzrle_init()) {
2318 ram_state_cleanup(rsp);
2319 return -1;
2320 }
2321
2322 ram_init_bitmaps(*rsp);
2323
2324 return 0;
2325 }
2326
2327 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2328 {
2329 RAMBlock *block;
2330 uint64_t pages = 0;
2331
2332 /*
2333 * Postcopy is not using xbzrle/compression, so no need for that.
2334 * Also, since the source is already halted, we don't need to care
2335 * about dirty page logging either.
2336 */
2337
2338 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2339 pages += bitmap_count_one(block->bmap,
2340 block->used_length >> TARGET_PAGE_BITS);
2341 }
2342
2343 /* This may not be aligned with current bitmaps. Recalculate. */
2344 rs->migration_dirty_pages = pages;
2345
2346 rs->last_seen_block = NULL;
2347 rs->last_sent_block = NULL;
2348 rs->last_page = 0;
2349 rs->last_version = ram_list.version;
2350 /*
2351 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2352 * matter what we have sent.
2353 */
2354 rs->ram_bulk_stage = false;
2355
2356 /* Update RAMState cache of output QEMUFile */
2357 rs->f = out;
2358
2359 trace_ram_state_resume_prepare(pages);
2360 }
2361
2362 /*
2363 * This function clears bits of the free pages reported by the caller from the
2364 * migration dirty bitmap. @addr is the host address corresponding to the
2365 * start of the continuous guest free pages, and @len is the total bytes of
2366 * those pages.
2367 */
2368 void qemu_guest_free_page_hint(void *addr, size_t len)
2369 {
2370 RAMBlock *block;
2371 ram_addr_t offset;
2372 size_t used_len, start, npages;
2373 MigrationState *s = migrate_get_current();
2374
2375 /* This function is currently expected to be used during live migration */
2376 if (!migration_is_setup_or_active(s->state)) {
2377 return;
2378 }
2379
2380 for (; len > 0; len -= used_len, addr += used_len) {
2381 block = qemu_ram_block_from_host(addr, false, &offset);
2382 if (unlikely(!block || offset >= block->used_length)) {
2383 /*
2384 * The implementation might not support RAMBlock resize during
2385 * live migration, but it could happen in theory with future
2386 * updates. So we add a check here to capture that case.
2387 */
2388 error_report_once("%s unexpected error", __func__);
2389 return;
2390 }
2391
2392 if (len <= block->used_length - offset) {
2393 used_len = len;
2394 } else {
2395 used_len = block->used_length - offset;
2396 }
2397
2398 start = offset >> TARGET_PAGE_BITS;
2399 npages = used_len >> TARGET_PAGE_BITS;
2400
2401 qemu_mutex_lock(&ram_state->bitmap_mutex);
2402 ram_state->migration_dirty_pages -=
2403 bitmap_count_one_with_offset(block->bmap, start, npages);
2404 bitmap_clear(block->bmap, start, npages);
2405 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2406 }
2407 }
2408
2409 /*
2410 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2411 * a long-running RCU critical section. When RCU reclaims in the code
2412 * start to become numerous it will be necessary to reduce the
2413 * granularity of these critical sections.
2414 */
2415
2416 /**
2417 * ram_save_setup: Setup RAM for migration
2418 *
2419 * Returns zero to indicate success and negative for error
2420 *
2421 * @f: QEMUFile where to send the data
2422 * @opaque: RAMState pointer
2423 */
2424 static int ram_save_setup(QEMUFile *f, void *opaque)
2425 {
2426 RAMState **rsp = opaque;
2427 RAMBlock *block;
2428
2429 if (compress_threads_save_setup()) {
2430 return -1;
2431 }
2432
2433 /* migration has already setup the bitmap, reuse it. */
2434 if (!migration_in_colo_state()) {
2435 if (ram_init_all(rsp) != 0) {
2436 compress_threads_save_cleanup();
2437 return -1;
2438 }
2439 }
2440 (*rsp)->f = f;
2441
2442 WITH_RCU_READ_LOCK_GUARD() {
2443 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2444
2445 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2446 qemu_put_byte(f, strlen(block->idstr));
2447 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2448 qemu_put_be64(f, block->used_length);
2449 if (migrate_postcopy_ram() && block->page_size !=
2450 qemu_host_page_size) {
2451 qemu_put_be64(f, block->page_size);
2452 }
2453 if (migrate_ignore_shared()) {
2454 qemu_put_be64(f, block->mr->addr);
2455 }
2456 }
2457 }
2458
2459 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2460 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2461
2462 multifd_send_sync_main(f);
2463 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2464 qemu_fflush(f);
2465
2466 return 0;
2467 }
2468
2469 /**
2470 * ram_save_iterate: iterative stage for migration
2471 *
2472 * Returns zero to indicate success and negative for error
2473 *
2474 * @f: QEMUFile where to send the data
2475 * @opaque: RAMState pointer
2476 */
2477 static int ram_save_iterate(QEMUFile *f, void *opaque)
2478 {
2479 RAMState **temp = opaque;
2480 RAMState *rs = *temp;
2481 int ret = 0;
2482 int i;
2483 int64_t t0;
2484 int done = 0;
2485
2486 if (blk_mig_bulk_active()) {
2487 /* Avoid transferring ram during bulk phase of block migration as
2488 * the bulk phase will usually take a long time and transferring
2489 * ram updates during that time is pointless. */
2490 goto out;
2491 }
2492
2493 WITH_RCU_READ_LOCK_GUARD() {
2494 if (ram_list.version != rs->last_version) {
2495 ram_state_reset(rs);
2496 }
2497
2498 /* Read version before ram_list.blocks */
2499 smp_rmb();
2500
2501 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2502
2503 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2504 i = 0;
2505 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2506 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2507 int pages;
2508
2509 if (qemu_file_get_error(f)) {
2510 break;
2511 }
2512
2513 pages = ram_find_and_save_block(rs, false);
2514 /* no more pages to send */
2515 if (pages == 0) {
2516 done = 1;
2517 break;
2518 }
2519
2520 if (pages < 0) {
2521 qemu_file_set_error(f, pages);
2522 break;
2523 }
2524
2525 rs->target_page_count += pages;
2526
2527 /*
2528 * During postcopy, it is necessary to make sure one whole host
2529 * page is sent in one chunk.
2530 */
2531 if (migrate_postcopy_ram()) {
2532 flush_compressed_data(rs);
2533 }
2534
2535 /*
2536 * We want to check on the first iteration, just in case it was the
2537 * first time and we had to sync the dirty bitmap.
2538 * qemu_clock_get_ns() is a bit expensive, so we only check every
2539 * few iterations.
2540 */
2541 if ((i & 63) == 0) {
2542 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2543 1000000;
2544 if (t1 > MAX_WAIT) {
2545 trace_ram_save_iterate_big_wait(t1, i);
2546 break;
2547 }
2548 }
2549 i++;
2550 }
2551 }
2552
2553 /*
2554 * Must occur before EOS (or any QEMUFile operation)
2555 * because of RDMA protocol.
2556 */
2557 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2558
2559 out:
2560 if (ret >= 0
2561 && migration_is_setup_or_active(migrate_get_current()->state)) {
2562 multifd_send_sync_main(rs->f);
2563 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2564 qemu_fflush(f);
2565 ram_counters.transferred += 8;
2566
2567 ret = qemu_file_get_error(f);
2568 }
2569 if (ret < 0) {
2570 return ret;
2571 }
2572
2573 return done;
2574 }
2575
2576 /**
2577 * ram_save_complete: function called to send the remaining amount of ram
2578 *
2579 * Returns zero to indicate success or negative on error
2580 *
2581 * Called with iothread lock
2582 *
2583 * @f: QEMUFile where to send the data
2584 * @opaque: RAMState pointer
2585 */
2586 static int ram_save_complete(QEMUFile *f, void *opaque)
2587 {
2588 RAMState **temp = opaque;
2589 RAMState *rs = *temp;
2590 int ret = 0;
2591
2592 WITH_RCU_READ_LOCK_GUARD() {
2593 if (!migration_in_postcopy()) {
2594 migration_bitmap_sync_precopy(rs);
2595 }
2596
2597 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2598
2599 /* try transferring iterative blocks of memory */
2600
2601 /* flush all remaining blocks regardless of rate limiting */
2602 while (true) {
2603 int pages;
2604
2605 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2606 /* no more blocks to send */
2607 if (pages == 0) {
2608 break;
2609 }
2610 if (pages < 0) {
2611 ret = pages;
2612 break;
2613 }
2614 }
2615
2616 flush_compressed_data(rs);
2617 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2618 }
2619
2620 if (ret >= 0) {
2621 multifd_send_sync_main(rs->f);
2622 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2623 qemu_fflush(f);
2624 }
2625
2626 return ret;
2627 }
2628
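/*
 * Report how much data is still pending. When the remaining size drops
 * below @max_size and we are not in postcopy, the dirty bitmap is
 * resynced first so the estimate is up to date. Postcopiable RAM is
 * reported as res_compatible, otherwise as res_precopy_only.
 */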
2629 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2630 uint64_t *res_precopy_only,
2631 uint64_t *res_compatible,
2632 uint64_t *res_postcopy_only)
2633 {
2634 RAMState **temp = opaque;
2635 RAMState *rs = *temp;
2636 uint64_t remaining_size;
2637
2638 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2639
2640 if (!migration_in_postcopy() &&
2641 remaining_size < max_size) {
2642 qemu_mutex_lock_iothread();
2643 WITH_RCU_READ_LOCK_GUARD() {
2644 migration_bitmap_sync_precopy(rs);
2645 }
2646 qemu_mutex_unlock_iothread();
2647 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2648 }
2649
2650 if (migrate_postcopy_ram()) {
2651 /* We can do postcopy, and all the data is postcopiable */
2652 *res_compatible += remaining_size;
2653 } else {
2654 *res_precopy_only += remaining_size;
2655 }
2656 }
2657
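/*
 * Load one XBZRLE-encoded page from the stream and decode it on top of
 * the existing contents of @host. Returns 0 on success, -1 on error.
 */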
2658 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2659 {
2660 unsigned int xh_len;
2661 int xh_flags;
2662 uint8_t *loaded_data;
2663
2664 /* extract RLE header */
2665 xh_flags = qemu_get_byte(f);
2666 xh_len = qemu_get_be16(f);
2667
2668 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2669 error_report("Failed to load XBZRLE page - wrong compression!");
2670 return -1;
2671 }
2672
2673 if (xh_len > TARGET_PAGE_SIZE) {
2674 error_report("Failed to load XBZRLE page - len overflow!");
2675 return -1;
2676 }
2677 loaded_data = XBZRLE.decoded_buf;
2678 /* load data and decode */
2679 /* it can change loaded_data to point to an internal buffer */
2680 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2681
2682 /* decode RLE */
2683 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2684 TARGET_PAGE_SIZE) == -1) {
2685 error_report("Failed to load XBZRLE page - decode error!");
2686 return -1;
2687 }
2688
2689 return 0;
2690 }
2691
2692 /**
2693 * ram_block_from_stream: read a RAMBlock id from the migration stream
2694 *
2695 * Must be called from within an RCU critical section.
2696 *
2697 * Returns a pointer from within the RCU-protected ram_list.
2698 *
2699 * @f: QEMUFile where to read the data from
2700 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2701 */
2702 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2703 {
2704 static RAMBlock *block = NULL;
2705 char id[256];
2706 uint8_t len;
2707
2708 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2709 if (!block) {
2710 error_report("Ack, bad migration stream!");
2711 return NULL;
2712 }
2713 return block;
2714 }
2715
2716 len = qemu_get_byte(f);
2717 qemu_get_buffer(f, (uint8_t *)id, len);
2718 id[len] = 0;
2719
2720 block = qemu_ram_block_by_name(id);
2721 if (!block) {
2722 error_report("Can't find block %s", id);
2723 return NULL;
2724 }
2725
2726 if (ramblock_is_ignored(block)) {
2727 error_report("block %s should not be migrated !", id);
2728 return NULL;
2729 }
2730
2731 return block;
2732 }
2733
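/*
 * Return the host address for @offset within @block, or NULL when the
 * offset is outside the block's used length.
 */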
2734 static inline void *host_from_ram_block_offset(RAMBlock *block,
2735 ram_addr_t offset)
2736 {
2737 if (!offset_in_ramblock(block, offset)) {
2738 return NULL;
2739 }
2740
2741 return block->host + offset;
2742 }
2743
2744 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2745 ram_addr_t offset, bool record_bitmap)
2746 {
2747 if (!offset_in_ramblock(block, offset)) {
2748 return NULL;
2749 }
2750 if (!block->colo_cache) {
2751 error_report("%s: colo_cache is NULL in block :%s",
2752 __func__, block->idstr);
2753 return NULL;
2754 }
2755
2756 /*
2757 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2758 * It helps us decide which pages in the RAM cache should be flushed
2759 * into the VM's RAM later.
2760 */
2761 if (record_bitmap &&
2762 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2763 ram_state->migration_dirty_pages++;
2764 }
2765 return block->colo_cache + offset;
2766 }
2767
2768 /**
2769 * ram_handle_compressed: handle the zero page case
2770 *
2771 * If a page (or a whole RDMA chunk) has been
2772 * determined to be zero, then zap it.
2773 *
2774 * @host: host address for the zero page
2775 * @ch: what the page is filled from. We only support zero
2776 * @size: size of the zero page
2777 */
2778 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2779 {
2780 if (ch != 0 || !is_zero_range(host, size)) {
2781 memset(host, ch, size);
2782 }
2783 }
2784
2785 /* return the size after decompression, or negative value on error */
2786 static int
2787 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2788 const uint8_t *source, size_t source_len)
2789 {
2790 int err;
2791
2792 err = inflateReset(stream);
2793 if (err != Z_OK) {
2794 return -1;
2795 }
2796
2797 stream->avail_in = source_len;
2798 stream->next_in = (uint8_t *)source;
2799 stream->avail_out = dest_len;
2800 stream->next_out = dest;
2801
2802 err = inflate(stream, Z_NO_FLUSH);
2803 if (err != Z_STREAM_END) {
2804 return -1;
2805 }
2806
2807 return stream->total_out;
2808 }
2809
2810 static void *do_data_decompress(void *opaque)
2811 {
2812 DecompressParam *param = opaque;
2813 unsigned long pagesize;
2814 uint8_t *des;
2815 int len, ret;
2816
2817 qemu_mutex_lock(&param->mutex);
2818 while (!param->quit) {
2819 if (param->des) {
2820 des = param->des;
2821 len = param->len;
2822 param->des = 0;
2823 qemu_mutex_unlock(&param->mutex);
2824
2825 pagesize = TARGET_PAGE_SIZE;
2826
2827 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2828 param->compbuf, len);
2829 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2830 error_report("decompress data failed");
2831 qemu_file_set_error(decomp_file, ret);
2832 }
2833
2834 qemu_mutex_lock(&decomp_done_lock);
2835 param->done = true;
2836 qemu_cond_signal(&decomp_done_cond);
2837 qemu_mutex_unlock(&decomp_done_lock);
2838
2839 qemu_mutex_lock(&param->mutex);
2840 } else {
2841 qemu_cond_wait(&param->cond, &param->mutex);
2842 }
2843 }
2844 qemu_mutex_unlock(&param->mutex);
2845
2846 return NULL;
2847 }
2848
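/*
 * Wait until every decompression thread has finished its current page.
 * Returns any error recorded on the decompression QEMUFile.
 */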
2849 static int wait_for_decompress_done(void)
2850 {
2851 int idx, thread_count;
2852
2853 if (!migrate_use_compression()) {
2854 return 0;
2855 }
2856
2857 thread_count = migrate_decompress_threads();
2858 qemu_mutex_lock(&decomp_done_lock);
2859 for (idx = 0; idx < thread_count; idx++) {
2860 while (!decomp_param[idx].done) {
2861 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2862 }
2863 }
2864 qemu_mutex_unlock(&decomp_done_lock);
2865 return qemu_file_get_error(decomp_file);
2866 }
2867
2868 static void compress_threads_load_cleanup(void)
2869 {
2870 int i, thread_count;
2871
2872 if (!migrate_use_compression()) {
2873 return;
2874 }
2875 thread_count = migrate_decompress_threads();
2876 for (i = 0; i < thread_count; i++) {
2877 /*
2878 * we use it as an indicator of whether the thread was
2879 * properly initialized or not
2880 */
2881 if (!decomp_param[i].compbuf) {
2882 break;
2883 }
2884
2885 qemu_mutex_lock(&decomp_param[i].mutex);
2886 decomp_param[i].quit = true;
2887 qemu_cond_signal(&decomp_param[i].cond);
2888 qemu_mutex_unlock(&decomp_param[i].mutex);
2889 }
2890 for (i = 0; i < thread_count; i++) {
2891 if (!decomp_param[i].compbuf) {
2892 break;
2893 }
2894
2895 qemu_thread_join(decompress_threads + i);
2896 qemu_mutex_destroy(&decomp_param[i].mutex);
2897 qemu_cond_destroy(&decomp_param[i].cond);
2898 inflateEnd(&decomp_param[i].stream);
2899 g_free(decomp_param[i].compbuf);
2900 decomp_param[i].compbuf = NULL;
2901 }
2902 g_free(decompress_threads);
2903 g_free(decomp_param);
2904 decompress_threads = NULL;
2905 decomp_param = NULL;
2906 decomp_file = NULL;
2907 }
2908
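/*
 * Create the decompression threads and their per-thread state (zlib
 * stream, buffer, mutex and condvar). Returns 0 on success, or -1 after
 * cleaning up any partially initialized threads.
 */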
2909 static int compress_threads_load_setup(QEMUFile *f)
2910 {
2911 int i, thread_count;
2912
2913 if (!migrate_use_compression()) {
2914 return 0;
2915 }
2916
2917 thread_count = migrate_decompress_threads();
2918 decompress_threads = g_new0(QemuThread, thread_count);
2919 decomp_param = g_new0(DecompressParam, thread_count);
2920 qemu_mutex_init(&decomp_done_lock);
2921 qemu_cond_init(&decomp_done_cond);
2922 decomp_file = f;
2923 for (i = 0; i < thread_count; i++) {
2924 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2925 goto exit;
2926 }
2927
2928 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2929 qemu_mutex_init(&decomp_param[i].mutex);
2930 qemu_cond_init(&decomp_param[i].cond);
2931 decomp_param[i].done = true;
2932 decomp_param[i].quit = false;
2933 qemu_thread_create(decompress_threads + i, "decompress",
2934 do_data_decompress, decomp_param + i,
2935 QEMU_THREAD_JOINABLE);
2936 }
2937 return 0;
2938 exit:
2939 compress_threads_load_cleanup();
2940 return -1;
2941 }
2942
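/*
 * Hand one compressed page to an idle decompression thread, waiting on
 * decomp_done_cond until a thread becomes available.
 */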
2943 static void decompress_data_with_multi_threads(QEMUFile *f,
2944 void *host, int len)
2945 {
2946 int idx, thread_count;
2947
2948 thread_count = migrate_decompress_threads();
2949 qemu_mutex_lock(&decomp_done_lock);
2950 while (true) {
2951 for (idx = 0; idx < thread_count; idx++) {
2952 if (decomp_param[idx].done) {
2953 decomp_param[idx].done = false;
2954 qemu_mutex_lock(&decomp_param[idx].mutex);
2955 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2956 decomp_param[idx].des = host;
2957 decomp_param[idx].len = len;
2958 qemu_cond_signal(&decomp_param[idx].cond);
2959 qemu_mutex_unlock(&decomp_param[idx].mutex);
2960 break;
2961 }
2962 }
2963 if (idx < thread_count) {
2964 break;
2965 } else {
2966 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2967 }
2968 }
2969 qemu_mutex_unlock(&decomp_done_lock);
2970 }
2971
2972 /*
2973 * colo cache: this is for the secondary VM. We cache the whole
2974 * memory of the secondary VM; the global lock must be held
2975 * to call this helper.
2976 */
2977 int colo_init_ram_cache(void)
2978 {
2979 RAMBlock *block;
2980
2981 WITH_RCU_READ_LOCK_GUARD() {
2982 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2983 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2984 NULL,
2985 false);
2986 if (!block->colo_cache) {
2987 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2988 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2989 block->used_length);
2990 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2991 if (block->colo_cache) {
2992 qemu_anon_ram_free(block->colo_cache, block->used_length);
2993 block->colo_cache = NULL;
2994 }
2995 }
2996 return -errno;
2997 }
2998 }
2999 }
3000
3001 /*
3002 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3003 * to decide which pages in the cache should be flushed into the SVM's RAM.
3004 * Here we use the same name 'ram_bitmap' as for migration.
3005 */
3006 if (ram_bytes_total()) {
3007 RAMBlock *block;
3008
3009 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3010 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3011 block->bmap = bitmap_new(pages);
3012 }
3013 }
3014
3015 ram_state_init(&ram_state);
3016 return 0;
3017 }
3018
3019 /* TODO: duplicated with ram_init_bitmaps */
3020 void colo_incoming_start_dirty_log(void)
3021 {
3022 RAMBlock *block = NULL;
3023 /* For memory_global_dirty_log_start below. */
3024 qemu_mutex_lock_iothread();
3025 qemu_mutex_lock_ramlist();
3026
3027 memory_global_dirty_log_sync();
3028 WITH_RCU_READ_LOCK_GUARD() {
3029 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3030 ramblock_sync_dirty_bitmap(ram_state, block);
3031 /* Discard this dirty bitmap record */
3032 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3033 }
3034 memory_global_dirty_log_start();
3035 }
3036 ram_state->migration_dirty_pages = 0;
3037 qemu_mutex_unlock_ramlist();
3038 qemu_mutex_unlock_iothread();
3039 }
3040
3041 /* The global lock must be held to call this helper */
3042 void colo_release_ram_cache(void)
3043 {
3044 RAMBlock *block;
3045
3046 memory_global_dirty_log_stop();
3047 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3048 g_free(block->bmap);
3049 block->bmap = NULL;
3050 }
3051
3052 WITH_RCU_READ_LOCK_GUARD() {
3053 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3054 if (block->colo_cache) {
3055 qemu_anon_ram_free(block->colo_cache, block->used_length);
3056 block->colo_cache = NULL;
3057 }
3058 }
3059 }
3060 ram_state_cleanup(&ram_state);
3061 }
3062
3063 /**
3064 * ram_load_setup: Setup RAM for migration incoming side
3065 *
3066 * Returns zero to indicate success and negative for error
3067 *
3068 * @f: QEMUFile where to receive the data
3069 * @opaque: RAMState pointer
3070 */
3071 static int ram_load_setup(QEMUFile *f, void *opaque)
3072 {
3073 if (compress_threads_load_setup(f)) {
3074 return -1;
3075 }
3076
3077 xbzrle_load_setup();
3078 ramblock_recv_map_init();
3079
3080 return 0;
3081 }
3082
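/*
 * Incoming-side cleanup: write RAM contents back to any backing storage,
 * free the XBZRLE decode buffer, the decompression threads and the
 * per-block received-page bitmaps.
 */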
3083 static int ram_load_cleanup(void *opaque)
3084 {
3085 RAMBlock *rb;
3086
3087 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3088 qemu_ram_block_writeback(rb);
3089 }
3090
3091 xbzrle_load_cleanup();
3092 compress_threads_load_cleanup();
3093
3094 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3095 g_free(rb->receivedmap);
3096 rb->receivedmap = NULL;
3097 }
3098
3099 return 0;
3100 }
3101
3102 /**
3103 * ram_postcopy_incoming_init: allocate postcopy data structures
3104 *
3105 * Returns 0 for success and negative if there was one error
3106 *
3107 * @mis: current migration incoming state
3108 *
3109 * Allocate the data structures etc. needed by incoming migration with
3110 * postcopy-ram. postcopy-ram's similarly named
3111 * postcopy_ram_incoming_init does the work.
3112 */
3113 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3114 {
3115 return postcopy_ram_incoming_init(mis);
3116 }
3117
3118 /**
3119 * ram_load_postcopy: load a page in postcopy case
3120 *
3121 * Returns 0 for success or -errno in case of error
3122 *
3123 * Called in postcopy mode by ram_load().
3124 * rcu_read_lock is taken prior to this being called.
3125 *
3126 * @f: QEMUFile where to send the data
3127 */
3128 static int ram_load_postcopy(QEMUFile *f)
3129 {
3130 int flags = 0, ret = 0;
3131 bool place_needed = false;
3132 bool matches_target_page_size = false;
3133 MigrationIncomingState *mis = migration_incoming_get_current();
3134 /* Temporary page that is later 'placed' */
3135 void *postcopy_host_page = mis->postcopy_tmp_page;
3136 void *this_host = NULL;
3137 bool all_zero = false;
3138 int target_pages = 0;
3139
3140 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3141 ram_addr_t addr;
3142 void *host = NULL;
3143 void *page_buffer = NULL;
3144 void *place_source = NULL;
3145 RAMBlock *block = NULL;
3146 uint8_t ch;
3147 int len;
3148
3149 addr = qemu_get_be64(f);
3150
3151 /*
3152 * If there is a QEMUFile error, we should stop here; "addr"
3153 * may be invalid.
3154 */
3155 ret = qemu_file_get_error(f);
3156 if (ret) {
3157 break;
3158 }
3159
3160 flags = addr & ~TARGET_PAGE_MASK;
3161 addr &= TARGET_PAGE_MASK;
3162
3163 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3164 place_needed = false;
3165 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3166 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3167 block = ram_block_from_stream(f, flags);
3168
3169 host = host_from_ram_block_offset(block, addr);
3170 if (!host) {
3171 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3172 ret = -EINVAL;
3173 break;
3174 }
3175 target_pages++;
3176 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3177 /*
3178 * Postcopy requires that we place whole host pages atomically;
3179 * these may be huge pages for RAMBlocks that are backed by
3180 * hugetlbfs.
3181 * To make it atomic, the data is read into a temporary page
3182 * that's moved into place later.
3183 * The migration protocol uses, possibly smaller, target pages;
3184 * however, the source ensures it always sends all the components
3185 * of a host page in one chunk.
3186 */
3187 page_buffer = postcopy_host_page +
3188 ((uintptr_t)host & (block->page_size - 1));
3189 /* If all TP are zero then we can optimise the place */
3190 if (target_pages == 1) {
3191 all_zero = true;
3192 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3193 block->page_size);
3194 } else {
3195 /* not the 1st TP within the HP */
3196 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3197 (uintptr_t)this_host) {
3198 error_report("Non-same host page %p/%p",
3199 host, this_host);
3200 ret = -EINVAL;
3201 break;
3202 }
3203 }
3204
3205 /*
3206 * If it's the last part of a host page then we place the host
3207 * page
3208 */
3209 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3210 place_needed = true;
3211 target_pages = 0;
3212 }
3213 place_source = postcopy_host_page;
3214 }
3215
3216 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3217 case RAM_SAVE_FLAG_ZERO:
3218 ch = qemu_get_byte(f);
3219 /*
3220 * We can skip setting page_buffer when
3221 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3222 */
3223 if (ch || !matches_target_page_size) {
3224 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3225 }
3226 if (ch) {
3227 all_zero = false;
3228 }
3229 break;
3230
3231 case RAM_SAVE_FLAG_PAGE:
3232 all_zero = false;
3233 if (!matches_target_page_size) {
3234 /* For huge pages, we always use temporary buffer */
3235 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3236 } else {
3237 /*
3238 * For small pages that matches target page size, we
3239 * avoid the qemu_file copy. Instead we directly use
3240 * the buffer of QEMUFile to place the page. Note: we
3241 * cannot do any QEMUFile operation before using that
3242 * buffer to make sure the buffer is valid when
3243 * placing the page.
3244 */
3245 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3246 TARGET_PAGE_SIZE);
3247 }
3248 break;
3249 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3250 all_zero = false;
3251 len = qemu_get_be32(f);
3252 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3253 error_report("Invalid compressed data length: %d", len);
3254 ret = -EINVAL;
3255 break;
3256 }
3257 decompress_data_with_multi_threads(f, page_buffer, len);
3258 break;
3259
3260 case RAM_SAVE_FLAG_EOS:
3261 /* normal exit */
3262 multifd_recv_sync_main();
3263 break;
3264 default:
3265 error_report("Unknown combination of migration flags: %#x"
3266 " (postcopy mode)", flags);
3267 ret = -EINVAL;
3268 break;
3269 }
3270
3271 /* Got the whole host page, wait for decompress before placing. */
3272 if (place_needed) {
3273 ret |= wait_for_decompress_done();
3274 }
3275
3276 /* Detect for any possible file errors */
3277 if (!ret && qemu_file_get_error(f)) {
3278 ret = qemu_file_get_error(f);
3279 }
3280
3281 if (!ret && place_needed) {
3282 /* This gets called at the last target page in the host page */
3283 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3284 block->page_size);
3285
3286 if (all_zero) {
3287 ret = postcopy_place_page_zero(mis, place_dest,
3288 block);
3289 } else {
3290 ret = postcopy_place_page(mis, place_dest,
3291 place_source, block);
3292 }
3293 }
3294 }
3295
3296 return ret;
3297 }
3298
3299 static bool postcopy_is_advised(void)
3300 {
3301 PostcopyState ps = postcopy_state_get();
3302 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3303 }
3304
3305 static bool postcopy_is_running(void)
3306 {
3307 PostcopyState ps = postcopy_state_get();
3308 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3309 }
3310
3311 /*
3312 * Flush content of RAM cache into SVM's memory.
3313 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3314 */
3315 static void colo_flush_ram_cache(void)
3316 {
3317 RAMBlock *block = NULL;
3318 void *dst_host;
3319 void *src_host;
3320 unsigned long offset = 0;
3321
3322 memory_global_dirty_log_sync();
3323 WITH_RCU_READ_LOCK_GUARD() {
3324 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3325 ramblock_sync_dirty_bitmap(ram_state, block);
3326 }
3327 }
3328
3329 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3330 WITH_RCU_READ_LOCK_GUARD() {
3331 block = QLIST_FIRST_RCU(&ram_list.blocks);
3332
3333 while (block) {
3334 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3335
3336 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3337 >= block->used_length) {
3338 offset = 0;
3339 block = QLIST_NEXT_RCU(block, next);
3340 } else {
3341 migration_bitmap_clear_dirty(ram_state, block, offset);
3342 dst_host = block->host
3343 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3344 src_host = block->colo_cache
3345 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3346 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3347 }
3348 }
3349 }
3350 trace_colo_flush_ram_cache_end();
3351 }
3352
3353 /**
3354 * ram_load_precopy: load pages in precopy case
3355 *
3356 * Returns 0 for success or -errno in case of error
3357 *
3358 * Called in precopy mode by ram_load().
3359 * rcu_read_lock is taken prior to this being called.
3360 *
3361 * @f: QEMUFile where to send the data
3362 */
3363 static int ram_load_precopy(QEMUFile *f)
3364 {
3365 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3366 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3367 bool postcopy_advised = postcopy_is_advised();
3368 if (!migrate_use_compression()) {
3369 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3370 }
3371
3372 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3373 ram_addr_t addr, total_ram_bytes;
3374 void *host = NULL, *host_bak = NULL;
3375 uint8_t ch;
3376
3377 /*
3378 * Yield periodically to let the main loop run, but an iteration of
3379 * the main loop is expensive, so only do it every few iterations.
3380 */
3381 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3382 aio_co_schedule(qemu_get_current_aio_context(),
3383 qemu_coroutine_self());
3384 qemu_coroutine_yield();
3385 }
3386 i++;
3387
3388 addr = qemu_get_be64(f);
3389 flags = addr & ~TARGET_PAGE_MASK;
3390 addr &= TARGET_PAGE_MASK;
3391
3392 if (flags & invalid_flags) {
3393 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3394 error_report("Received an unexpected compressed page");
3395 }
3396
3397 ret = -EINVAL;
3398 break;
3399 }
3400
3401 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3402 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3403 RAMBlock *block = ram_block_from_stream(f, flags);
3404
3405 host = host_from_ram_block_offset(block, addr);
3406 /*
3407 * After entering the COLO stage, we should not load pages
3408 * into the SVM's memory directly; we put them into colo_cache first.
3409 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3410 * Previously, we copied all of this memory in the COLO preparing
3411 * stage while the VM was stopped, which is a time-consuming process.
3412 * Here we optimize it by backing up every page during the migration
3413 * process while COLO is enabled. Although this affects the speed of
3414 * the migration, it clearly reduces the downtime of backing up all
3415 * of the SVM's memory in the COLO preparing stage.
3416 */
3417 if (migration_incoming_colo_enabled()) {
3418 if (migration_incoming_in_colo_state()) {
3419 /* In COLO stage, put all pages into cache temporarily */
3420 host = colo_cache_from_block_offset(block, addr, true);
3421 } else {
3422 /*
3423 * In migration stage but before COLO stage,
3424 * Put all pages into both cache and SVM's memory.
3425 */
3426 host_bak = colo_cache_from_block_offset(block, addr, false);
3427 }
3428 }
3429 if (!host) {
3430 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3431 ret = -EINVAL;
3432 break;
3433 }
3434 if (!migration_incoming_in_colo_state()) {
3435 ramblock_recv_bitmap_set(block, host);
3436 }
3437
3438 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3439 }
3440
3441 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3442 case RAM_SAVE_FLAG_MEM_SIZE:
3443 /* Synchronize RAM block list */
3444 total_ram_bytes = addr;
3445 while (!ret && total_ram_bytes) {
3446 RAMBlock *block;
3447 char id[256];
3448 ram_addr_t length;
3449
3450 len = qemu_get_byte(f);
3451 qemu_get_buffer(f, (uint8_t *)id, len);
3452 id[len] = 0;
3453 length = qemu_get_be64(f);
3454
3455 block = qemu_ram_block_by_name(id);
3456 if (block && !qemu_ram_is_migratable(block)) {
3457 error_report("block %s should not be migrated !", id);
3458 ret = -EINVAL;
3459 } else if (block) {
3460 if (length != block->used_length) {
3461 Error *local_err = NULL;
3462
3463 ret = qemu_ram_resize(block, length,
3464 &local_err);
3465 if (local_err) {
3466 error_report_err(local_err);
3467 }
3468 }
3469 /* For postcopy we need to check hugepage sizes match */
3470 if (postcopy_advised &&
3471 block->page_size != qemu_host_page_size) {
3472 uint64_t remote_page_size = qemu_get_be64(f);
3473 if (remote_page_size != block->page_size) {
3474 error_report("Mismatched RAM page size %s "
3475 "(local) %zd != %" PRId64,
3476 id, block->page_size,
3477 remote_page_size);
3478 ret = -EINVAL;
3479 }
3480 }
3481 if (migrate_ignore_shared()) {
3482 hwaddr addr = qemu_get_be64(f);
3483 if (ramblock_is_ignored(block) &&
3484 block->mr->addr != addr) {
3485 error_report("Mismatched GPAs for block %s "
3486 "%" PRId64 "!= %" PRId64,
3487 id, (uint64_t)addr,
3488 (uint64_t)block->mr->addr);
3489 ret = -EINVAL;
3490 }
3491 }
3492 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3493 block->idstr);
3494 } else {
3495 error_report("Unknown ramblock \"%s\", cannot "
3496 "accept migration", id);
3497 ret = -EINVAL;
3498 }
3499
3500 total_ram_bytes -= length;
3501 }
3502 break;
3503
3504 case RAM_SAVE_FLAG_ZERO:
3505 ch = qemu_get_byte(f);
3506 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3507 break;
3508
3509 case RAM_SAVE_FLAG_PAGE:
3510 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3511 break;
3512
3513 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3514 len = qemu_get_be32(f);
3515 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3516 error_report("Invalid compressed data length: %d", len);
3517 ret = -EINVAL;
3518 break;
3519 }
3520 decompress_data_with_multi_threads(f, host, len);
3521 break;
3522
3523 case RAM_SAVE_FLAG_XBZRLE:
3524 if (load_xbzrle(f, addr, host) < 0) {
3525 error_report("Failed to decompress XBZRLE page at "
3526 RAM_ADDR_FMT, addr);
3527 ret = -EINVAL;
3528 break;
3529 }
3530 break;
3531 case RAM_SAVE_FLAG_EOS:
3532 /* normal exit */
3533 multifd_recv_sync_main();
3534 break;
3535 default:
3536 if (flags & RAM_SAVE_FLAG_HOOK) {
3537 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3538 } else {
3539 error_report("Unknown combination of migration flags: %#x",
3540 flags);
3541 ret = -EINVAL;
3542 }
3543 }
3544 if (!ret) {
3545 ret = qemu_file_get_error(f);
3546 }
3547 if (!ret && host_bak) {
3548 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3549 }
3550 }
3551
3552 ret |= wait_for_decompress_done();
3553 return ret;
3554 }
3555
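/*
 * Entry point for loading RAM state (stream version 4 only). Dispatches
 * to ram_load_postcopy() or ram_load_precopy() depending on whether the
 * destination is already running in postcopy mode, and flushes the COLO
 * RAM cache when in COLO state.
 */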
3556 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3557 {
3558 int ret = 0;
3559 static uint64_t seq_iter;
3560 /*
3561 * If the system is running in postcopy mode, page inserts into host
3562 * memory must be atomic.
3563 */
3564 bool postcopy_running = postcopy_is_running();
3565
3566 seq_iter++;
3567
3568 if (version_id != 4) {
3569 return -EINVAL;
3570 }
3571
3572 /*
3573 * This RCU critical section can be very long running.
3574 * When RCU reclaims in the code start to become numerous,
3575 * it will be necessary to reduce the granularity of this
3576 * critical section.
3577 */
3578 WITH_RCU_READ_LOCK_GUARD() {
3579 if (postcopy_running) {
3580 ret = ram_load_postcopy(f);
3581 } else {
3582 ret = ram_load_precopy(f);
3583 }
3584 }
3585 trace_ram_load_complete(ret, seq_iter);
3586
3587 if (!ret && migration_incoming_in_colo_state()) {
3588 colo_flush_ram_cache();
3589 }
3590 return ret;
3591 }
3592
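/*
 * Postcopy is refused when any migratable RAMBlock is backed by
 * persistent memory; otherwise follow the postcopy-ram capability.
 */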
3593 static bool ram_has_postcopy(void *opaque)
3594 {
3595 RAMBlock *rb;
3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597 if (ramblock_is_pmem(rb)) {
3598 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3599 "is not supported now!", rb->idstr, rb->host);
3600 return false;
3601 }
3602 }
3603
3604 return migrate_postcopy_ram();
3605 }
3606
3607 /* Sync all the dirty bitmap with destination VM. */
3608 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3609 {
3610 RAMBlock *block;
3611 QEMUFile *file = s->to_dst_file;
3612 int ramblock_count = 0;
3613
3614 trace_ram_dirty_bitmap_sync_start();
3615
3616 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3617 qemu_savevm_send_recv_bitmap(file, block->idstr);
3618 trace_ram_dirty_bitmap_request(block->idstr);
3619 ramblock_count++;
3620 }
3621
3622 trace_ram_dirty_bitmap_sync_wait();
3623
3624 /* Wait until all the ramblocks' dirty bitmap synced */
3625 while (ramblock_count--) {
3626 qemu_sem_wait(&s->rp_state.rp_sem);
3627 }
3628
3629 trace_ram_dirty_bitmap_sync_complete();
3630
3631 return 0;
3632 }
3633
3634 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3635 {
3636 qemu_sem_post(&s->rp_state.rp_sem);
3637 }
3638
3639 /*
3640 * Read the received bitmap, revert it as the initial dirty bitmap.
3641 * This is only used when the postcopy migration is paused but wants
3642 * to resume from a middle point.
3643 */
3644 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3645 {
3646 int ret = -EINVAL;
3647 QEMUFile *file = s->rp_state.from_dst_file;
3648 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3649 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3650 uint64_t size, end_mark;
3651
3652 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3653
3654 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3655 error_report("%s: incorrect state %s", __func__,
3656 MigrationStatus_str(s->state));
3657 return -EINVAL;
3658 }
3659
3660 /*
3661 * Note: see comments in ramblock_recv_bitmap_send() on why we
3662 * need the endianness conversion, and the padding.
3663 */
3664 local_size = ROUND_UP(local_size, 8);
3665
3666 /* Add padding */
3667 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3668
3669 size = qemu_get_be64(file);
3670
3671 /* The size of the bitmap should match with our ramblock */
3672 if (size != local_size) {
3673 error_report("%s: ramblock '%s' bitmap size mismatch "
3674 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3675 block->idstr, size, local_size);
3676 ret = -EINVAL;
3677 goto out;
3678 }
3679
3680 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3681 end_mark = qemu_get_be64(file);
3682
3683 ret = qemu_file_get_error(file);
3684 if (ret || size != local_size) {
3685 error_report("%s: read bitmap failed for ramblock '%s': %d"
3686 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3687 __func__, block->idstr, ret, local_size, size);
3688 ret = -EIO;
3689 goto out;
3690 }
3691
3692 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3693 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3694 __func__, block->idstr, end_mark);
3695 ret = -EINVAL;
3696 goto out;
3697 }
3698
3699 /*
3700 * Endianness conversion. We are in postcopy (though paused).
3701 * The dirty bitmap won't change, so we can modify it directly.
3702 */
3703 bitmap_from_le(block->bmap, le_bitmap, nbits);
3704
3705 /*
3706 * What we received is "received bitmap". Revert it as the initial
3707 * dirty bitmap for this ramblock.
3708 */
3709 bitmap_complement(block->bmap, block->bmap, nbits);
3710
3711 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3712
3713 /*
3714 * We succeeded in syncing the bitmap for the current ramblock. If this is
3715 * the last one to sync, we need to notify the main send thread.
3716 */
3717 ram_dirty_bitmap_reload_notify(s);
3718
3719 ret = 0;
3720 out:
3721 g_free(le_bitmap);
3722 return ret;
3723 }
3724
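/*
 * Resume preparation: re-sync the dirty bitmaps with the destination and
 * rewind the RAMState so a recovered postcopy migration can continue.
 */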
3725 static int ram_resume_prepare(MigrationState *s, void *opaque)
3726 {
3727 RAMState *rs = *(RAMState **)opaque;
3728 int ret;
3729
3730 ret = ram_dirty_bitmap_sync_all(s, rs);
3731 if (ret) {
3732 return ret;
3733 }
3734
3735 ram_state_resume_prepare(rs, s->to_dst_file);
3736
3737 return 0;
3738 }
3739
3740 static SaveVMHandlers savevm_ram_handlers = {
3741 .save_setup = ram_save_setup,
3742 .save_live_iterate = ram_save_iterate,
3743 .save_live_complete_postcopy = ram_save_complete,
3744 .save_live_complete_precopy = ram_save_complete,
3745 .has_postcopy = ram_has_postcopy,
3746 .save_live_pending = ram_save_pending,
3747 .load_state = ram_load,
3748 .save_cleanup = ram_save_cleanup,
3749 .load_setup = ram_load_setup,
3750 .load_cleanup = ram_load_cleanup,
3751 .resume_prepare = ram_resume_prepare,
3752 };
3753
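/*
 * Register the "ram" savevm handlers (stream version 4) and initialize
 * the XBZRLE lock.
 */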
3754 void ram_mig_init(void)
3755 {
3756 qemu_mutex_init(&XBZRLE.lock);
3757 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3758 }