migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58
59 /***********************************************************/
60 /* ram save/restore */
61
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66 */
67
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
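/*
 * A minimal sketch of how these flags travel on the wire: the offsets
 * written by save_page_header() below are TARGET_PAGE_SIZE aligned, so
 * the low bits are free to carry the RAM_SAVE_FLAG_* values.  A reader
 * can split them back apart roughly like this (illustration only):
 *
 *     uint64_t header = qemu_get_be64(f);
 *     uint64_t flags  = header & ~TARGET_PAGE_MASK;
 *     ram_addr_t addr = header & TARGET_PAGE_MASK;
 *
 *     if (flags & RAM_SAVE_FLAG_ZERO) {
 *         ... the payload is a single fill byte ...
 *     }
 */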
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80 return buffer_is_zero(p, size);
81 }
82
83 XBZRLECacheStats xbzrle_counters;
84
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
100
101 static void XBZRLE_cache_lock(void)
102 {
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105 }
106
107 static void XBZRLE_cache_unlock(void)
108 {
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111 }
112
113 /**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
121 * Returns 0 for success or -1 for error
122 *
123 * @new_size: new cache size
124 * @errp: set *errp with the reason if the check failed
125 */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128 PageCache *new_cache;
129 int64_t ret = 0;
130
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
141 }
142
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
158 }
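/*
 * A minimal usage sketch for xbzrle_cache_resize() (the size below is an
 * arbitrary example value):
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(512 * 1024 * 1024, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */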
159
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162 return !qemu_ram_is_migratable(block) ||
163 (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
168 INTERNAL_RAMBLOCK_FOREACH(block) \
169 if (ramblock_is_ignored(block)) {} else
170
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
172 INTERNAL_RAMBLOCK_FOREACH(block) \
173 if (!qemu_ram_is_migratable(block)) {} else
174
175 #undef RAMBLOCK_FOREACH
176
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179 RAMBlock *block;
180 int ret = 0;
181
182 RCU_READ_LOCK_GUARD();
183
184 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185 ret = func(block, opaque);
186 if (ret) {
187 break;
188 }
189 }
190 return ret;
191 }
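/*
 * A minimal sketch of a callback for foreach_not_ignored_block(); the
 * helper name and the use of the opaque pointer are made up for the
 * example (returning non-zero from the callback stops the iteration):
 *
 *     static int sum_block_size(RAMBlock *rb, void *opaque)
 *     {
 *         uint64_t *total = opaque;
 *
 *         *total += rb->used_length;
 *         return 0;
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(sum_block_size, &total);
 */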
192
193 static void ramblock_recv_map_init(void)
194 {
195 RAMBlock *rb;
196
197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198 assert(!rb->receivedmap);
199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 }
201 }
202
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206 rb->receivedmap);
207 }
208
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220 size_t nr)
221 {
222 bitmap_set_atomic(rb->receivedmap,
223 ramblock_recv_bitmap_offset(host_addr, rb),
224 nr);
225 }
226
227 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
228
229 /*
230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 *
232 * Returns >0 if success with sent bytes, or <0 if error.
233 */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235 const char *block_name)
236 {
237 RAMBlock *block = qemu_ram_block_by_name(block_name);
238 unsigned long *le_bitmap, nbits;
239 uint64_t size;
240
241 if (!block) {
242 error_report("%s: invalid block name: %s", __func__, block_name);
243 return -1;
244 }
245
246 nbits = block->used_length >> TARGET_PAGE_BITS;
247
248 /*
249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250 * machines we may need 4 more bytes for padding (see below
251 * comment). So extend it a bit beforehand.
252 */
253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254
255 /*
256 * Always use little endian when sending the bitmap, so that it is
257 * interpreted correctly even when the source and destination VMs do
258 * not use the same endianness (big endian on the wire would not work).
259 */
260 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261
262 /* Size of the bitmap, in bytes */
263 size = DIV_ROUND_UP(nbits, 8);
264
265 /*
266 * size is always aligned to 8 bytes for 64bit machines, but it
267 * may not be true for 32bit machines. We need this padding to
268 * make sure the migration can survive even between 32bit and
269 * 64bit machines.
270 */
271 size = ROUND_UP(size, 8);
272
273 qemu_put_be64(file, size);
274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 /*
276 * Mark as an end, in case the middle part is screwed up due to
277 * some "misterious" reason.
278 */
279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280 qemu_fflush(file);
281
282 g_free(le_bitmap);
283
284 if (qemu_file_get_error(file)) {
285 return qemu_file_get_error(file);
286 }
287
288 return size + sizeof(size);
289 }
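/*
 * For reference, what the code above puts on the wire for one block
 * (all sizes in bytes; this is a description of the code, not a formal
 * format definition):
 *
 *     be64 size | le_bitmap (size bytes, rounded up to 8) | be64 ENDING
 *
 * The returned byte count covers the leading size field plus the bitmap,
 * but not the trailing RAMBLOCK_RECV_BITMAP_ENDING marker.
 */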
290
291 /*
292 * An outstanding page request, on the source, having been received
293 * and queued
294 */
295 struct RAMSrcPageRequest {
296 RAMBlock *rb;
297 hwaddr offset;
298 hwaddr len;
299
300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302
303 /* State of RAM for migration */
304 struct RAMState {
305 /* QEMUFile used for this migration */
306 QEMUFile *f;
307 /* Last block that we have visited searching for dirty pages */
308 RAMBlock *last_seen_block;
309 /* Last block from where we have sent data */
310 RAMBlock *last_sent_block;
311 /* Last dirty target page we have sent */
312 ram_addr_t last_page;
313 /* last ram version we have seen */
314 uint32_t last_version;
315 /* We are in the first round */
316 bool ram_bulk_stage;
317 /* The free page optimization is enabled */
318 bool fpo_enabled;
319 /* How many times we have dirtied too many pages */
320 int dirty_rate_high_cnt;
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
324 /* bytes transferred at start_time */
325 uint64_t bytes_xfer_prev;
326 /* number of dirty pages since start_time */
327 uint64_t num_dirty_pages_period;
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
330
331 /* compression statistics since the beginning of the period */
332 /* number of times there was no free thread to compress data */
333 uint64_t compress_thread_busy_prev;
334 /* number of bytes after compression */
335 uint64_t compressed_size_prev;
336 /* number of compressed pages */
337 uint64_t compress_pages_prev;
338
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
343 /* number of dirty bits in the bitmap */
344 uint64_t migration_dirty_pages;
345 /* Protects modification of the bitmap and migration dirty pages */
346 QemuMutex bitmap_mutex;
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354
355 static RAMState *ram_state;
356
357 static NotifierWithReturnList precopy_notifier_list;
358
359 void precopy_infrastructure_init(void)
360 {
361 notifier_with_return_list_init(&precopy_notifier_list);
362 }
363
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366 notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371 notifier_with_return_remove(n);
372 }
373
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376 PrecopyNotifyData pnd;
377 pnd.reason = reason;
378 pnd.errp = errp;
379
380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
382
383 void precopy_enable_free_page_optimization(void)
384 {
385 if (!ram_state) {
386 return;
387 }
388
389 ram_state->fpo_enabled = true;
390 }
391
392 uint64_t ram_bytes_remaining(void)
393 {
394 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395 0;
396 }
397
398 MigrationStats ram_counters;
399
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402 /* Current block being searched */
403 RAMBlock *block;
404 /* Current page to search from */
405 unsigned long page;
406 /* Set once we wrap around */
407 bool complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410
411 CompressionStats compression_counters;
412
413 struct CompressParam {
414 bool done;
415 bool quit;
416 bool zero_page;
417 QEMUFile *file;
418 QemuMutex mutex;
419 QemuCond cond;
420 RAMBlock *block;
421 ram_addr_t offset;
422
423 /* internally used fields */
424 z_stream stream;
425 uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428
429 struct DecompressParam {
430 bool done;
431 bool quit;
432 QemuMutex mutex;
433 QemuCond cond;
434 void *des;
435 uint8_t *compbuf;
436 int len;
437 z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444 * one of the compression threads has finished the compression.
445 * comp_done_lock is used together with comp_done_cond.
446 */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by file in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459 ram_addr_t offset, uint8_t *source_buf);
460
461 static void *do_data_compress(void *opaque)
462 {
463 CompressParam *param = opaque;
464 RAMBlock *block;
465 ram_addr_t offset;
466 bool zero_page;
467
468 qemu_mutex_lock(&param->mutex);
469 while (!param->quit) {
470 if (param->block) {
471 block = param->block;
472 offset = param->offset;
473 param->block = NULL;
474 qemu_mutex_unlock(&param->mutex);
475
476 zero_page = do_compress_ram_page(param->file, &param->stream,
477 block, offset, param->originbuf);
478
479 qemu_mutex_lock(&comp_done_lock);
480 param->done = true;
481 param->zero_page = zero_page;
482 qemu_cond_signal(&comp_done_cond);
483 qemu_mutex_unlock(&comp_done_lock);
484
485 qemu_mutex_lock(&param->mutex);
486 } else {
487 qemu_cond_wait(&param->cond, &param->mutex);
488 }
489 }
490 qemu_mutex_unlock(&param->mutex);
491
492 return NULL;
493 }
494
495 static void compress_threads_save_cleanup(void)
496 {
497 int i, thread_count;
498
499 if (!migrate_use_compression() || !comp_param) {
500 return;
501 }
502
503 thread_count = migrate_compress_threads();
504 for (i = 0; i < thread_count; i++) {
505 /*
506 * we use it as an indicator of whether the thread has been
507 * properly initialized
508 */
509 if (!comp_param[i].file) {
510 break;
511 }
512
513 qemu_mutex_lock(&comp_param[i].mutex);
514 comp_param[i].quit = true;
515 qemu_cond_signal(&comp_param[i].cond);
516 qemu_mutex_unlock(&comp_param[i].mutex);
517
518 qemu_thread_join(compress_threads + i);
519 qemu_mutex_destroy(&comp_param[i].mutex);
520 qemu_cond_destroy(&comp_param[i].cond);
521 deflateEnd(&comp_param[i].stream);
522 g_free(comp_param[i].originbuf);
523 qemu_fclose(comp_param[i].file);
524 comp_param[i].file = NULL;
525 }
526 qemu_mutex_destroy(&comp_done_lock);
527 qemu_cond_destroy(&comp_done_cond);
528 g_free(compress_threads);
529 g_free(comp_param);
530 compress_threads = NULL;
531 comp_param = NULL;
532 }
533
534 static int compress_threads_save_setup(void)
535 {
536 int i, thread_count;
537
538 if (!migrate_use_compression()) {
539 return 0;
540 }
541 thread_count = migrate_compress_threads();
542 compress_threads = g_new0(QemuThread, thread_count);
543 comp_param = g_new0(CompressParam, thread_count);
544 qemu_cond_init(&comp_done_cond);
545 qemu_mutex_init(&comp_done_lock);
546 for (i = 0; i < thread_count; i++) {
547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548 if (!comp_param[i].originbuf) {
549 goto exit;
550 }
551
552 if (deflateInit(&comp_param[i].stream,
553 migrate_compress_level()) != Z_OK) {
554 g_free(comp_param[i].originbuf);
555 goto exit;
556 }
557
558 /* comp_param[i].file is just used as a dummy buffer to save data,
559 * set its ops to empty.
560 */
561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562 comp_param[i].done = true;
563 comp_param[i].quit = false;
564 qemu_mutex_init(&comp_param[i].mutex);
565 qemu_cond_init(&comp_param[i].cond);
566 qemu_thread_create(compress_threads + i, "compress",
567 do_data_compress, comp_param + i,
568 QEMU_THREAD_JOINABLE);
569 }
570 return 0;
571
572 exit:
573 compress_threads_save_cleanup();
574 return -1;
575 }
576
577 /**
578 * save_page_header: write page header to wire
579 *
580 * If this is the 1st block, it also writes the block identification
581 *
582 * Returns the number of bytes written
583 *
584 * @f: QEMUFile where to send the data
585 * @block: block that contains the page we want to send
586 * @offset: offset inside the block for the page
587 * in the lower bits, it contains flags
588 */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
590 ram_addr_t offset)
591 {
592 size_t size, len;
593
594 if (block == rs->last_sent_block) {
595 offset |= RAM_SAVE_FLAG_CONTINUE;
596 }
597 qemu_put_be64(f, offset);
598 size = 8;
599
600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601 len = strlen(block->idstr);
602 qemu_put_byte(f, len);
603 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604 size += 1 + len;
605 rs->last_sent_block = block;
606 }
607 return size;
608 }
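/*
 * A rough picture of the header written above, for orientation:
 *
 *     be64: page offset within the block, ORed with RAM_SAVE_FLAG_* bits
 *     u8 + idstr: block name length and bytes, only present when
 *                 RAM_SAVE_FLAG_CONTINUE is clear (i.e. the block changed
 *                 since the last page sent)
 */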
609
610 /**
611 * mig_throttle_guest_down: throttle down the guest
612 *
613 * Reduce amount of guest cpu execution to hopefully slow down memory
614 * writes. If guest dirty memory rate is reduced below the rate at
615 * which we can transfer pages to the destination then we should be
616 * able to complete migration. Some workloads dirty memory way too
617 * fast and will not effectively converge, even with auto-converge.
618 */
619 static void mig_throttle_guest_down(void)
620 {
621 MigrationState *s = migrate_get_current();
622 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
624 int pct_max = s->parameters.max_cpu_throttle;
625
626 /* We have not started throttling yet. Let's start it. */
627 if (!cpu_throttle_active()) {
628 cpu_throttle_set(pct_initial);
629 } else {
630 /* Throttling already on, just increase the rate */
631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
632 pct_max));
633 }
634 }
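/*
 * Worked example (assuming the usual defaults of cpu_throttle_initial=20,
 * cpu_throttle_increment=10 and max_cpu_throttle=99): successive calls
 * set the CPU throttle to 20%, then 30%, 40%, ... and clamp at 99%.
 */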
635
636 /**
637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638 *
639 * @rs: current RAM state
640 * @current_addr: address for the zero page
641 *
642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
643 * The important thing is that a stale (not-yet-0'd) page be replaced
644 * by the new data.
645 * As a bonus, if the page wasn't in the cache it gets added so that
646 * when a small write is made into the 0'd page it gets XBZRLE sent.
647 */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651 return;
652 }
653
654 /* We don't care if this fails to allocate a new cache page
655 * as long as it updated an old one */
656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657 ram_counters.dirty_sync_count);
658 }
659
660 #define ENCODING_FLAG_XBZRLE 0x1
661
662 /**
663 * save_xbzrle_page: compress and send current page
664 *
665 * Returns: 1 means that we wrote the page
666 * 0 means that page is identical to the one already sent
667 * -1 means that xbzrle would be longer than normal
668 *
669 * @rs: current RAM state
670 * @current_data: pointer to the address of the page contents
671 * @current_addr: addr of the page
672 * @block: block that contains the page we want to send
673 * @offset: offset inside the block for the page
674 * @last_stage: if we are at the completion stage
675 */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677 ram_addr_t current_addr, RAMBlock *block,
678 ram_addr_t offset, bool last_stage)
679 {
680 int encoded_len = 0, bytes_xbzrle;
681 uint8_t *prev_cached_page;
682
683 if (!cache_is_cached(XBZRLE.cache, current_addr,
684 ram_counters.dirty_sync_count)) {
685 xbzrle_counters.cache_miss++;
686 if (!last_stage) {
687 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688 ram_counters.dirty_sync_count) == -1) {
689 return -1;
690 } else {
691 /* update *current_data when the page has been
692 inserted into cache */
693 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694 }
695 }
696 return -1;
697 }
698
699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700
701 /* save current buffer into memory */
702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703
704 /* XBZRLE encoding (if there is no overflow) */
705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707 TARGET_PAGE_SIZE);
708
709 /*
710 * Update the cache contents, so that it corresponds to the data
711 * sent, in all cases except where we skip the page.
712 */
713 if (!last_stage && encoded_len != 0) {
714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715 /*
716 * In the case where we couldn't compress, ensure that the caller
717 * sends the data from the cache, since the guest might have
718 * changed the RAM since we copied it.
719 */
720 *current_data = prev_cached_page;
721 }
722
723 if (encoded_len == 0) {
724 trace_save_xbzrle_page_skipping();
725 return 0;
726 } else if (encoded_len == -1) {
727 trace_save_xbzrle_page_overflow();
728 xbzrle_counters.overflow++;
729 return -1;
730 }
731
732 /* Send XBZRLE based compressed page */
733 bytes_xbzrle = save_page_header(rs, rs->f, block,
734 offset | RAM_SAVE_FLAG_XBZRLE);
735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736 qemu_put_be16(rs->f, encoded_len);
737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738 bytes_xbzrle += encoded_len + 1 + 2;
739 xbzrle_counters.pages++;
740 xbzrle_counters.bytes += bytes_xbzrle;
741 ram_counters.transferred += bytes_xbzrle;
742
743 return 1;
744 }
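/*
 * For orientation, the record produced above is:
 *
 *     page header with RAM_SAVE_FLAG_XBZRLE set
 *     u8:   ENCODING_FLAG_XBZRLE
 *     be16: encoded_len
 *     encoded_len bytes of XBZRLE delta against the cached copy
 *
 * which is where the "encoded_len + 1 + 2" accounting above comes from.
 */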
745
746 /**
747 * migration_bitmap_find_dirty: find the next dirty page from start
748 *
749 * Returns the page offset within memory region of the start of a dirty page
750 *
751 * @rs: current RAM state
752 * @rb: RAMBlock where to search for dirty pages
753 * @start: page where we start the search
754 */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757 unsigned long start)
758 {
759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760 unsigned long *bitmap = rb->bmap;
761 unsigned long next;
762
763 if (ramblock_is_ignored(rb)) {
764 return size;
765 }
766
767 /*
768 * When the free page optimization is enabled, we need to check the bitmap
769 * to send the non-free pages rather than all the pages in the bulk stage.
770 */
771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772 next = start + 1;
773 } else {
774 next = find_next_bit(bitmap, size, start);
775 }
776
777 return next;
778 }
779
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781 RAMBlock *rb,
782 unsigned long page)
783 {
784 bool ret;
785
786 qemu_mutex_lock(&rs->bitmap_mutex);
787
788 /*
789 * Clear dirty bitmap if needed. This _must_ be called before we
790 * send any of the pages in the chunk, because we need to make sure
791 * we can capture further page content changes when we sync the dirty
792 * log the next time. So as long as we are going to send any of
793 * the pages in the chunk, we clear the remote dirty bitmap for all.
794 * Clearing it earlier won't be a problem, but clearing it too late will.
795 */
796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797 uint8_t shift = rb->clear_bmap_shift;
798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800
801 /*
802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803 * can make things easier sometimes since then the start address
804 * of the small chunk will always be aligned to 64 pages, so the
805 * bitmap will always be aligned to unsigned long. We should
806 * even be able to remove this restriction but I'm simply
807 * keeping it.
808 */
809 assert(shift >= 6);
810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811 memory_region_clear_dirty_bitmap(rb->mr, start, size);
812 }
813
814 ret = test_and_clear_bit(page, rb->bmap);
815
816 if (ret) {
817 rs->migration_dirty_pages--;
818 }
819 qemu_mutex_unlock(&rs->bitmap_mutex);
820
821 return ret;
822 }
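/*
 * Worked example (assuming a 4 KiB target page and a clear_bmap_shift of
 * 18, the usual default): one clear_bmap bit covers
 * 1 << (12 + 18) = 1 GiB of guest RAM, so the remote (e.g. KVM) dirty
 * bitmap for that whole 1 GiB chunk is cleared the first time any page
 * in it is about to be sent.
 */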
823
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827 rs->migration_dirty_pages +=
828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829 &rs->num_dirty_pages_period);
830 }
831
832 /**
833 * ram_pagesize_summary: calculate all the pagesizes of a VM
834 *
835 * Returns a summary bitmap of the page sizes of all RAMBlocks
836 *
837 * For VMs with just normal pages this is equivalent to the host page
838 * size. If it's got some huge pages then it's the OR of all the
839 * different page sizes.
840 */
841 uint64_t ram_pagesize_summary(void)
842 {
843 RAMBlock *block;
844 uint64_t summary = 0;
845
846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847 summary |= block->page_size;
848 }
849
850 return summary;
851 }
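/*
 * Worked example (assumed page sizes): a guest with ordinary 4 KiB blocks
 * plus one 2 MiB hugepage-backed block yields
 *
 *     summary = 0x1000 | 0x200000 = 0x201000
 *
 * i.e. one bit per distinct page size in use.
 */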
852
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855 return ram_counters.normal + ram_counters.duplicate +
856 compression_counters.pages + xbzrle_counters.pages;
857 }
858
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862 double compressed_size;
863
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867
868 if (!page_count) {
869 return;
870 }
871
872 if (migrate_use_xbzrle()) {
873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874 rs->xbzrle_cache_miss_prev) / page_count;
875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876 }
877
878 if (migrate_use_compression()) {
879 compression_counters.busy_rate = (double)(compression_counters.busy -
880 rs->compress_thread_busy_prev) / page_count;
881 rs->compress_thread_busy_prev = compression_counters.busy;
882
883 compressed_size = compression_counters.compressed_size -
884 rs->compressed_size_prev;
885 if (compressed_size) {
886 double uncompressed_size = (compression_counters.pages -
887 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888
889 /* Compression-Ratio = Uncompressed-size / Compressed-size */
890 compression_counters.compression_rate =
891 uncompressed_size / compressed_size;
892
893 rs->compress_pages_prev = compression_counters.pages;
894 rs->compressed_size_prev = compression_counters.compressed_size;
895 }
896 }
897 }
898
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901 MigrationState *s = migrate_get_current();
902 uint64_t threshold = s->parameters.throttle_trigger_threshold;
903
904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907
908 /* During block migration the auto-converge logic incorrectly detects
909 * that ram migration makes no progress. Avoid this by disabling the
910 * throttling logic during the bulk phase of block migration. */
911 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912 /* The following detection logic can be refined later. For now:
913 Check to see if the ratio between dirtied bytes and the approx.
914 amount of bytes that just got transferred since the last time
915 we were in this routine reaches the threshold. If that happens
916 twice, start or increase throttling. */
917
918 if ((bytes_dirty_period > bytes_dirty_threshold) &&
919 (++rs->dirty_rate_high_cnt >= 2)) {
920 trace_migration_throttle();
921 rs->dirty_rate_high_cnt = 0;
922 mig_throttle_guest_down();
923 }
924 }
925 }
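/*
 * Worked example (made-up numbers): with throttle_trigger_threshold=50,
 * if 300 MB were transferred during the last sync period then
 * bytes_dirty_threshold is 150 MB; dirtying more than that in a period
 * twice (the counter is only reset once throttling kicks in) calls
 * mig_throttle_guest_down().
 */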
926
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929 RAMBlock *block;
930 int64_t end_time;
931
932 ram_counters.dirty_sync_count++;
933
934 if (!rs->time_last_bitmap_sync) {
935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936 }
937
938 trace_migration_bitmap_sync_start();
939 memory_global_dirty_log_sync();
940
941 qemu_mutex_lock(&rs->bitmap_mutex);
942 WITH_RCU_READ_LOCK_GUARD() {
943 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944 ramblock_sync_dirty_bitmap(rs, block);
945 }
946 ram_counters.remaining = ram_bytes_remaining();
947 }
948 qemu_mutex_unlock(&rs->bitmap_mutex);
949
950 memory_global_after_dirty_log_sync();
951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952
953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954
955 /* more than 1 second = 1000 milliseconds */
956 if (end_time > rs->time_last_bitmap_sync + 1000) {
957 migration_trigger_throttle(rs);
958
959 migration_update_rates(rs, end_time);
960
961 rs->target_page_count_prev = rs->target_page_count;
962
963 /* reset period counters */
964 rs->time_last_bitmap_sync = end_time;
965 rs->num_dirty_pages_period = 0;
966 rs->bytes_xfer_prev = ram_counters.transferred;
967 }
968 if (migrate_use_events()) {
969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970 }
971 }
972
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975 Error *local_err = NULL;
976
977 /*
978 * The current notifier usage is just an optimization for migration, so we
979 * don't stop the normal migration process in the error case.
980 */
981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982 error_report_err(local_err);
983 local_err = NULL;
984 }
985
986 migration_bitmap_sync(rs);
987
988 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
989 error_report_err(local_err);
990 }
991 }
992
993 /**
994 * save_zero_page_to_file: send the zero page to the file
995 *
996 * Returns the size of data written to the file, 0 means the page is not
997 * a zero page
998 *
999 * @rs: current RAM state
1000 * @file: the file where the data is saved
1001 * @block: block that contains the page we want to send
1002 * @offset: offset inside the block for the page
1003 */
1004 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005 RAMBlock *block, ram_addr_t offset)
1006 {
1007 uint8_t *p = block->host + offset;
1008 int len = 0;
1009
1010 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012 qemu_put_byte(file, 0);
1013 len += 1;
1014 }
1015 return len;
1016 }
1017
1018 /**
1019 * save_zero_page: send the zero page to the stream
1020 *
1021 * Returns the number of pages written.
1022 *
1023 * @rs: current RAM state
1024 * @block: block that contains the page we want to send
1025 * @offset: offset inside the block for the page
1026 */
1027 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1028 {
1029 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1030
1031 if (len) {
1032 ram_counters.duplicate++;
1033 ram_counters.transferred += len;
1034 return 1;
1035 }
1036 return -1;
1037 }
1038
1039 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1040 {
1041 if (!migrate_release_ram() || !migration_in_postcopy()) {
1042 return;
1043 }
1044
1045 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1046 }
1047
1048 /*
1049 * @pages: the number of pages written by the control path,
1050 * < 0 - error
1051 * > 0 - number of pages written
1052 *
1053 * Return true if the page has been saved, otherwise false is returned.
1054 */
1055 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056 int *pages)
1057 {
1058 uint64_t bytes_xmit = 0;
1059 int ret;
1060
1061 *pages = -1;
1062 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063 &bytes_xmit);
1064 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065 return false;
1066 }
1067
1068 if (bytes_xmit) {
1069 ram_counters.transferred += bytes_xmit;
1070 *pages = 1;
1071 }
1072
1073 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074 return true;
1075 }
1076
1077 if (bytes_xmit > 0) {
1078 ram_counters.normal++;
1079 } else if (bytes_xmit == 0) {
1080 ram_counters.duplicate++;
1081 }
1082
1083 return true;
1084 }
1085
1086 /*
1087 * directly send the page to the stream
1088 *
1089 * Returns the number of pages written.
1090 *
1091 * @rs: current RAM state
1092 * @block: block that contains the page we want to send
1093 * @offset: offset inside the block for the page
1094 * @buf: the page to be sent
1095 * @async: send the page asynchronously
1096 */
1097 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098 uint8_t *buf, bool async)
1099 {
1100 ram_counters.transferred += save_page_header(rs, rs->f, block,
1101 offset | RAM_SAVE_FLAG_PAGE);
1102 if (async) {
1103 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1104 migrate_release_ram() &&
1105 migration_in_postcopy());
1106 } else {
1107 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108 }
1109 ram_counters.transferred += TARGET_PAGE_SIZE;
1110 ram_counters.normal++;
1111 return 1;
1112 }
1113
1114 /**
1115 * ram_save_page: send the given page to the stream
1116 *
1117 * Returns the number of pages written.
1118 * < 0 - error
1119 * >=0 - Number of pages written - this might legally be 0
1120 * if xbzrle noticed the page was the same.
1121 *
1122 * @rs: current RAM state
1123 * @block: block that contains the page we want to send
1124 * @offset: offset inside the block for the page
1125 * @last_stage: if we are at the completion stage
1126 */
1127 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1128 {
1129 int pages = -1;
1130 uint8_t *p;
1131 bool send_async = true;
1132 RAMBlock *block = pss->block;
1133 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1134 ram_addr_t current_addr = block->offset + offset;
1135
1136 p = block->host + offset;
1137 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1138
1139 XBZRLE_cache_lock();
1140 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141 migrate_use_xbzrle()) {
1142 pages = save_xbzrle_page(rs, &p, current_addr, block,
1143 offset, last_stage);
1144 if (!last_stage) {
1145 /* Can't send this cached data async, since the cache page
1146 * might get updated before it gets to the wire
1147 */
1148 send_async = false;
1149 }
1150 }
1151
1152 /* XBZRLE overflow or normal page */
1153 if (pages == -1) {
1154 pages = save_normal_page(rs, block, offset, p, send_async);
1155 }
1156
1157 XBZRLE_cache_unlock();
1158
1159 return pages;
1160 }
1161
1162 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163 ram_addr_t offset)
1164 {
1165 if (multifd_queue_page(rs->f, block, offset) < 0) {
1166 return -1;
1167 }
1168 ram_counters.normal++;
1169
1170 return 1;
1171 }
1172
1173 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1174 ram_addr_t offset, uint8_t *source_buf)
1175 {
1176 RAMState *rs = ram_state;
1177 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1178 bool zero_page = false;
1179 int ret;
1180
1181 if (save_zero_page_to_file(rs, f, block, offset)) {
1182 zero_page = true;
1183 goto exit;
1184 }
1185
1186 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1187
1188 /*
1189 * copy it to an internal buffer to avoid it being modified by the VM,
1190 * so that we can catch any error during compression and
1191 * decompression
1192 */
1193 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1194 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195 if (ret < 0) {
1196 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1197 error_report("compressed data failed!");
1198 return false;
1199 }
1200
1201 exit:
1202 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1203 return zero_page;
1204 }
1205
1206 static void
1207 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208 {
1209 ram_counters.transferred += bytes_xmit;
1210
1211 if (param->zero_page) {
1212 ram_counters.duplicate++;
1213 return;
1214 }
1215
1216 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1217 compression_counters.compressed_size += bytes_xmit - 8;
1218 compression_counters.pages++;
1219 }
1220
1221 static bool save_page_use_compression(RAMState *rs);
1222
1223 static void flush_compressed_data(RAMState *rs)
1224 {
1225 int idx, len, thread_count;
1226
1227 if (!save_page_use_compression(rs)) {
1228 return;
1229 }
1230 thread_count = migrate_compress_threads();
1231
1232 qemu_mutex_lock(&comp_done_lock);
1233 for (idx = 0; idx < thread_count; idx++) {
1234 while (!comp_param[idx].done) {
1235 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1236 }
1237 }
1238 qemu_mutex_unlock(&comp_done_lock);
1239
1240 for (idx = 0; idx < thread_count; idx++) {
1241 qemu_mutex_lock(&comp_param[idx].mutex);
1242 if (!comp_param[idx].quit) {
1243 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1244 /*
1245 * it's safe to fetch zero_page without holding comp_done_lock
1246 * as there is no further request submitted to the thread,
1247 * i.e., the thread should be waiting for a request at this point.
1248 */
1249 update_compress_thread_counts(&comp_param[idx], len);
1250 }
1251 qemu_mutex_unlock(&comp_param[idx].mutex);
1252 }
1253 }
1254
1255 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256 ram_addr_t offset)
1257 {
1258 param->block = block;
1259 param->offset = offset;
1260 }
1261
1262 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263 ram_addr_t offset)
1264 {
1265 int idx, thread_count, bytes_xmit = -1, pages = -1;
1266 bool wait = migrate_compress_wait_thread();
1267
1268 thread_count = migrate_compress_threads();
1269 qemu_mutex_lock(&comp_done_lock);
1270 retry:
1271 for (idx = 0; idx < thread_count; idx++) {
1272 if (comp_param[idx].done) {
1273 comp_param[idx].done = false;
1274 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275 qemu_mutex_lock(&comp_param[idx].mutex);
1276 set_compress_params(&comp_param[idx], block, offset);
1277 qemu_cond_signal(&comp_param[idx].cond);
1278 qemu_mutex_unlock(&comp_param[idx].mutex);
1279 pages = 1;
1280 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1281 break;
1282 }
1283 }
1284
1285 /*
1286 * wait for the free thread if the user specifies 'compress-wait-thread',
1287 * otherwise we will post the page out in the main thread as a normal page.
1288 */
1289 if (pages < 0 && wait) {
1290 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291 goto retry;
1292 }
1293 qemu_mutex_unlock(&comp_done_lock);
1294
1295 return pages;
1296 }
1297
1298 /**
1299 * find_dirty_block: find the next dirty page and update any state
1300 * associated with the search process.
1301 *
1302 * Returns true if a page is found
1303 *
1304 * @rs: current RAM state
1305 * @pss: data about the state of the current dirty page scan
1306 * @again: set to false if the search has scanned the whole of RAM
1307 */
1308 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1309 {
1310 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1311 if (pss->complete_round && pss->block == rs->last_seen_block &&
1312 pss->page >= rs->last_page) {
1313 /*
1314 * We've been once around the RAM and haven't found anything.
1315 * Give up.
1316 */
1317 *again = false;
1318 return false;
1319 }
1320 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321 >= pss->block->used_length) {
1322 /* Didn't find anything in this RAM Block */
1323 pss->page = 0;
1324 pss->block = QLIST_NEXT_RCU(pss->block, next);
1325 if (!pss->block) {
1326 /*
1327 * If memory migration starts over, we will meet a dirtied page
1328 * which may still exists in compression threads's ring, so we
1329 * should flush the compressed data to make sure the new page
1330 * is not overwritten by the old one in the destination.
1331 *
1332 * Also If xbzrle is on, stop using the data compression at this
1333 * point. In theory, xbzrle can do better than compression.
1334 */
1335 flush_compressed_data(rs);
1336
1337 /* Hit the end of the list */
1338 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339 /* Flag that we've looped */
1340 pss->complete_round = true;
1341 rs->ram_bulk_stage = false;
1342 }
1343 /* Didn't find anything this time, but try again on the new block */
1344 *again = true;
1345 return false;
1346 } else {
1347 /* Can go around again, but... */
1348 *again = true;
1349 /* We've found something so probably don't need to */
1350 return true;
1351 }
1352 }
1353
1354 /**
1355 * unqueue_page: gets a page off the queue
1356 *
1357 * Helper for 'get_queued_page' - gets a page off the queue
1358 *
1359 * Returns the block of the page (or NULL if none available)
1360 *
1361 * @rs: current RAM state
1362 * @offset: used to return the offset within the RAMBlock
1363 */
1364 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1365 {
1366 RAMBlock *block = NULL;
1367
1368 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369 return NULL;
1370 }
1371
1372 qemu_mutex_lock(&rs->src_page_req_mutex);
1373 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374 struct RAMSrcPageRequest *entry =
1375 QSIMPLEQ_FIRST(&rs->src_page_requests);
1376 block = entry->rb;
1377 *offset = entry->offset;
1378
1379 if (entry->len > TARGET_PAGE_SIZE) {
1380 entry->len -= TARGET_PAGE_SIZE;
1381 entry->offset += TARGET_PAGE_SIZE;
1382 } else {
1383 memory_region_unref(block->mr);
1384 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1385 g_free(entry);
1386 migration_consume_urgent_request();
1387 }
1388 }
1389 qemu_mutex_unlock(&rs->src_page_req_mutex);
1390
1391 return block;
1392 }
1393
1394 /**
1395 * get_queued_page: unqueue a page from the postcopy requests
1396 *
1397 * Skips pages that are already sent (!dirty)
1398 *
1399 * Returns true if a queued page is found
1400 *
1401 * @rs: current RAM state
1402 * @pss: data about the state of the current dirty page scan
1403 */
1404 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1405 {
1406 RAMBlock *block;
1407 ram_addr_t offset;
1408 bool dirty;
1409
1410 do {
1411 block = unqueue_page(rs, &offset);
1412 /*
1413 * We're sending this page, and since it's postcopy nothing else
1414 * will dirty it, and we must make sure it doesn't get sent again
1415 * even if this queue request was received after the background
1416 * search already sent it.
1417 */
1418 if (block) {
1419 unsigned long page;
1420
1421 page = offset >> TARGET_PAGE_BITS;
1422 dirty = test_bit(page, block->bmap);
1423 if (!dirty) {
1424 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1425 page);
1426 } else {
1427 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1428 }
1429 }
1430
1431 } while (block && !dirty);
1432
1433 if (block) {
1434 /*
1435 * As soon as we start servicing pages out of order, we have
1436 * to kill the bulk stage, since the bulk stage assumes
1437 * (in migration_bitmap_find_and_reset_dirty) that every page is
1438 * dirty, and that's no longer true.
1439 */
1440 rs->ram_bulk_stage = false;
1441
1442 /*
1443 * We want the background search to continue from the queued page
1444 * since the guest is likely to want other pages near to the page
1445 * it just requested.
1446 */
1447 pss->block = block;
1448 pss->page = offset >> TARGET_PAGE_BITS;
1449
1450 /*
1451 * This unqueued page would break the "one round" check, even if
1452 * it is really rare.
1453 */
1454 pss->complete_round = false;
1455 }
1456
1457 return !!block;
1458 }
1459
1460 /**
1461 * migration_page_queue_free: drop any remaining pages in the ram
1462 * request queue
1463 *
1464 * It should be empty at the end anyway, but in error cases there may
1465 * be some left. In case any page is left, we drop it.
1466 *
1467 */
1468 static void migration_page_queue_free(RAMState *rs)
1469 {
1470 struct RAMSrcPageRequest *mspr, *next_mspr;
1471 /* This queue generally should be empty - but in the case of a failed
1472 * migration might have some droppings in.
1473 */
1474 RCU_READ_LOCK_GUARD();
1475 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1476 memory_region_unref(mspr->rb->mr);
1477 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1478 g_free(mspr);
1479 }
1480 }
1481
1482 /**
1483 * ram_save_queue_pages: queue the page for transmission
1484 *
1485 * A request from postcopy destination for example.
1486 *
1487 * Returns zero on success or negative on error
1488 *
1489 * @rbname: Name of the RAMBlock of the request. NULL means the
1490 * same as the last one.
1491 * @start: starting address from the start of the RAMBlock
1492 * @len: length (in bytes) to send
1493 */
1494 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1495 {
1496 RAMBlock *ramblock;
1497 RAMState *rs = ram_state;
1498
1499 ram_counters.postcopy_requests++;
1500 RCU_READ_LOCK_GUARD();
1501
1502 if (!rbname) {
1503 /* Reuse last RAMBlock */
1504 ramblock = rs->last_req_rb;
1505
1506 if (!ramblock) {
1507 /*
1508 * Shouldn't happen, we can't reuse the last RAMBlock if
1509 * it's the 1st request.
1510 */
1511 error_report("ram_save_queue_pages no previous block");
1512 return -1;
1513 }
1514 } else {
1515 ramblock = qemu_ram_block_by_name(rbname);
1516
1517 if (!ramblock) {
1518 /* We shouldn't be asked for a non-existent RAMBlock */
1519 error_report("ram_save_queue_pages no block '%s'", rbname);
1520 return -1;
1521 }
1522 rs->last_req_rb = ramblock;
1523 }
1524 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1525 if (start+len > ramblock->used_length) {
1526 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1527 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1528 __func__, start, len, ramblock->used_length);
1529 return -1;
1530 }
1531
1532 struct RAMSrcPageRequest *new_entry =
1533 g_malloc0(sizeof(struct RAMSrcPageRequest));
1534 new_entry->rb = ramblock;
1535 new_entry->offset = start;
1536 new_entry->len = len;
1537
1538 memory_region_ref(ramblock->mr);
1539 qemu_mutex_lock(&rs->src_page_req_mutex);
1540 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1541 migration_make_urgent_request();
1542 qemu_mutex_unlock(&rs->src_page_req_mutex);
1543
1544 return 0;
1545 }
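/*
 * A minimal usage sketch (the block name and offset are made up); this is
 * roughly how the postcopy path asks the source to prioritise a page the
 * destination just faulted on:
 *
 *     if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
 *         ... the request was rejected, the caller must handle it ...
 *     }
 */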
1546
1547 static bool save_page_use_compression(RAMState *rs)
1548 {
1549 if (!migrate_use_compression()) {
1550 return false;
1551 }
1552
1553 /*
1554 * If xbzrle is on, stop using the data compression after the first
1555 * round of migration even if compression is enabled. In theory,
1556 * xbzrle can do better than compression.
1557 */
1558 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1559 return true;
1560 }
1561
1562 return false;
1563 }
1564
1565 /*
1566 * try to compress the page before posting it out; return true if the page
1567 * has been properly handled by compression, otherwise it needs other
1568 * paths to handle it
1569 */
1570 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1571 {
1572 if (!save_page_use_compression(rs)) {
1573 return false;
1574 }
1575
1576 /*
1577 * When starting the process of a new block, the first page of
1578 * the block should be sent out before other pages in the same
1579 * block, and all the pages in the last block should have been sent
1580 * out. Keeping this order is important, because the 'cont' flag
1581 * is used to avoid resending the block name.
1582 *
1583 * We post the first page as a normal page, as compression will take
1584 * much CPU resource.
1585 */
1586 if (block != rs->last_sent_block) {
1587 flush_compressed_data(rs);
1588 return false;
1589 }
1590
1591 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1592 return true;
1593 }
1594
1595 compression_counters.busy++;
1596 return false;
1597 }
1598
1599 /**
1600 * ram_save_target_page: save one target page
1601 *
1602 * Returns the number of pages written
1603 *
1604 * @rs: current RAM state
1605 * @pss: data about the page we want to send
1606 * @last_stage: if we are at the completion stage
1607 */
1608 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1609 bool last_stage)
1610 {
1611 RAMBlock *block = pss->block;
1612 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1613 int res;
1614
1615 if (control_save_page(rs, block, offset, &res)) {
1616 return res;
1617 }
1618
1619 if (save_compress_page(rs, block, offset)) {
1620 return 1;
1621 }
1622
1623 res = save_zero_page(rs, block, offset);
1624 if (res > 0) {
1625 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1626 * page would be stale
1627 */
1628 if (!save_page_use_compression(rs)) {
1629 XBZRLE_cache_lock();
1630 xbzrle_cache_zero_page(rs, block->offset + offset);
1631 XBZRLE_cache_unlock();
1632 }
1633 ram_release_pages(block->idstr, offset, res);
1634 return res;
1635 }
1636
1637 /*
1638 * Do not use multifd for:
1639 * 1. Compression as the first page in the new block should be posted out
1640 * before sending the compressed page
1641 * 2. In postcopy as one whole host page should be placed
1642 */
1643 if (!save_page_use_compression(rs) && migrate_use_multifd()
1644 && !migration_in_postcopy()) {
1645 return ram_save_multifd_page(rs, block, offset);
1646 }
1647
1648 return ram_save_page(rs, pss, last_stage);
1649 }
1650
1651 /**
1652 * ram_save_host_page: save a whole host page
1653 *
1654 * Starting at *offset send pages up to the end of the current host
1655 * page. It's valid for the initial offset to point into the middle of
1656 * a host page in which case the remainder of the hostpage is sent.
1657 * Only dirty target pages are sent. Note that the host page size may
1658 * be a huge page for this block.
1659 * The saving stops at the boundary of the used_length of the block
1660 * if the RAMBlock isn't a multiple of the host page size.
1661 *
1662 * Returns the number of pages written or negative on error
1663 *
1664 * @rs: current RAM state
1665 * @ms: current migration state
1666 * @pss: data about the page we want to send
1667 * @last_stage: if we are at the completion stage
1668 */
1669 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1670 bool last_stage)
1671 {
1672 int tmppages, pages = 0;
1673 size_t pagesize_bits =
1674 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1675
1676 if (ramblock_is_ignored(pss->block)) {
1677 error_report("block %s should not be migrated !", pss->block->idstr);
1678 return 0;
1679 }
1680
1681 do {
1682 /* Check if the page is dirty and if it is, send it */
1683 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1684 pss->page++;
1685 continue;
1686 }
1687
1688 tmppages = ram_save_target_page(rs, pss, last_stage);
1689 if (tmppages < 0) {
1690 return tmppages;
1691 }
1692
1693 pages += tmppages;
1694 pss->page++;
1695 /* Allow rate limiting to happen in the middle of huge pages */
1696 migration_rate_limit();
1697 } while ((pss->page & (pagesize_bits - 1)) &&
1698 offset_in_ramblock(pss->block,
1699 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1700
1701 /* The offset we leave with is the last one we looked at */
1702 pss->page--;
1703 return pages;
1704 }
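/*
 * Worked example (assumed sizes): with 4 KiB target pages and a 2 MiB
 * hugepage-backed block, pagesize_bits is 512, so one call can send up to
 * 512 dirty target pages before "pss->page & (pagesize_bits - 1)" wraps
 * to zero at the next host-page boundary.
 */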
1705
1706 /**
1707 * ram_find_and_save_block: finds a dirty page and sends it to f
1708 *
1709 * Called within an RCU critical section.
1710 *
1711 * Returns the number of pages written where zero means no dirty pages,
1712 * or negative on error
1713 *
1714 * @rs: current RAM state
1715 * @last_stage: if we are at the completion stage
1716 *
1717 * On systems where host-page-size > target-page-size it will send all the
1718 * pages in a host page that are dirty.
1719 */
1720
1721 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1722 {
1723 PageSearchStatus pss;
1724 int pages = 0;
1725 bool again, found;
1726
1727 /* No dirty page as there is zero RAM */
1728 if (!ram_bytes_total()) {
1729 return pages;
1730 }
1731
1732 pss.block = rs->last_seen_block;
1733 pss.page = rs->last_page;
1734 pss.complete_round = false;
1735
1736 if (!pss.block) {
1737 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1738 }
1739
1740 do {
1741 again = true;
1742 found = get_queued_page(rs, &pss);
1743
1744 if (!found) {
1745 /* priority queue empty, so just search for something dirty */
1746 found = find_dirty_block(rs, &pss, &again);
1747 }
1748
1749 if (found) {
1750 pages = ram_save_host_page(rs, &pss, last_stage);
1751 }
1752 } while (!pages && again);
1753
1754 rs->last_seen_block = pss.block;
1755 rs->last_page = pss.page;
1756
1757 return pages;
1758 }
1759
1760 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1761 {
1762 uint64_t pages = size / TARGET_PAGE_SIZE;
1763
1764 if (zero) {
1765 ram_counters.duplicate += pages;
1766 } else {
1767 ram_counters.normal += pages;
1768 ram_counters.transferred += size;
1769 qemu_update_position(f, size);
1770 }
1771 }
1772
1773 static uint64_t ram_bytes_total_common(bool count_ignored)
1774 {
1775 RAMBlock *block;
1776 uint64_t total = 0;
1777
1778 RCU_READ_LOCK_GUARD();
1779
1780 if (count_ignored) {
1781 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1782 total += block->used_length;
1783 }
1784 } else {
1785 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1786 total += block->used_length;
1787 }
1788 }
1789 return total;
1790 }
1791
1792 uint64_t ram_bytes_total(void)
1793 {
1794 return ram_bytes_total_common(false);
1795 }
1796
1797 static void xbzrle_load_setup(void)
1798 {
1799 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1800 }
1801
1802 static void xbzrle_load_cleanup(void)
1803 {
1804 g_free(XBZRLE.decoded_buf);
1805 XBZRLE.decoded_buf = NULL;
1806 }
1807
1808 static void ram_state_cleanup(RAMState **rsp)
1809 {
1810 if (*rsp) {
1811 migration_page_queue_free(*rsp);
1812 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1813 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1814 g_free(*rsp);
1815 *rsp = NULL;
1816 }
1817 }
1818
1819 static void xbzrle_cleanup(void)
1820 {
1821 XBZRLE_cache_lock();
1822 if (XBZRLE.cache) {
1823 cache_fini(XBZRLE.cache);
1824 g_free(XBZRLE.encoded_buf);
1825 g_free(XBZRLE.current_buf);
1826 g_free(XBZRLE.zero_target_page);
1827 XBZRLE.cache = NULL;
1828 XBZRLE.encoded_buf = NULL;
1829 XBZRLE.current_buf = NULL;
1830 XBZRLE.zero_target_page = NULL;
1831 }
1832 XBZRLE_cache_unlock();
1833 }
1834
1835 static void ram_save_cleanup(void *opaque)
1836 {
1837 RAMState **rsp = opaque;
1838 RAMBlock *block;
1839
1840 /* the caller has the iothread lock held or is in a bh, so there is
1841 * no writing race against the migration bitmap
1842 */
1843 memory_global_dirty_log_stop();
1844
1845 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1846 g_free(block->clear_bmap);
1847 block->clear_bmap = NULL;
1848 g_free(block->bmap);
1849 block->bmap = NULL;
1850 }
1851
1852 xbzrle_cleanup();
1853 compress_threads_save_cleanup();
1854 ram_state_cleanup(rsp);
1855 }
1856
1857 static void ram_state_reset(RAMState *rs)
1858 {
1859 rs->last_seen_block = NULL;
1860 rs->last_sent_block = NULL;
1861 rs->last_page = 0;
1862 rs->last_version = ram_list.version;
1863 rs->ram_bulk_stage = true;
1864 rs->fpo_enabled = false;
1865 }
1866
1867 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1868
1869 /*
1870 * 'expected' is the value you expect the bitmap mostly to be full
1871 * of; it won't bother printing lines that are all this value.
1872 * If 'todump' is null the migration bitmap is dumped.
1873 */
1874 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1875 unsigned long pages)
1876 {
1877 int64_t cur;
1878 int64_t linelen = 128;
1879 char linebuf[129];
1880
1881 for (cur = 0; cur < pages; cur += linelen) {
1882 int64_t curb;
1883 bool found = false;
1884 /*
1885 * Last line; catch the case where the line length
1886 * is longer than remaining ram
1887 */
1888 if (cur + linelen > pages) {
1889 linelen = pages - cur;
1890 }
1891 for (curb = 0; curb < linelen; curb++) {
1892 bool thisbit = test_bit(cur + curb, todump);
1893 linebuf[curb] = thisbit ? '1' : '.';
1894 found = found || (thisbit != expected);
1895 }
1896 if (found) {
1897 linebuf[curb] = '\0';
1898 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1899 }
1900 }
1901 }
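/*
 * Example of the output format (made-up data, line shortened here): only
 * lines containing at least one bit different from 'expected' are
 * printed; '1' is a set bit, '.' a clear one, and the prefix is the page
 * index of the first bit on the line:
 *
 *     0x00000080 : 1.........1111..............................
 */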
1902
1903 /* **** functions for postcopy ***** */
1904
1905 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1906 {
1907 struct RAMBlock *block;
1908
1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910 unsigned long *bitmap = block->bmap;
1911 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1912 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1913
1914 while (run_start < range) {
1915 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1916 ram_discard_range(block->idstr,
1917 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1918 ((ram_addr_t)(run_end - run_start))
1919 << TARGET_PAGE_BITS);
1920 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1921 }
1922 }
1923 }
1924
1925 /**
1926 * postcopy_send_discard_bm_ram: discard a RAMBlock
1927 *
1928 * Returns zero on success
1929 *
1930 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1931 *
1932 * @ms: current migration state
1933 * @block: RAMBlock to discard
1934 */
1935 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1936 {
1937 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1938 unsigned long current;
1939 unsigned long *bitmap = block->bmap;
1940
1941 for (current = 0; current < end; ) {
1942 unsigned long one = find_next_bit(bitmap, end, current);
1943 unsigned long zero, discard_length;
1944
1945 if (one >= end) {
1946 break;
1947 }
1948
1949 zero = find_next_zero_bit(bitmap, end, one + 1);
1950
1951 if (zero >= end) {
1952 discard_length = end - one;
1953 } else {
1954 discard_length = zero - one;
1955 }
1956 postcopy_discard_send_range(ms, one, discard_length);
1957 current = one + discard_length;
1958 }
1959
1960 return 0;
1961 }
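
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the walk
 * above: extracting (start, length) runs of set bits from a dirty bitmap.
 * The naive bit helper below is an illustrative stand-in for QEMU's
 * find_next_bit()/find_next_zero_bit().
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool test_bit_simple(const unsigned char *bm, unsigned long nr)
{
    return bm[nr / 8] & (1u << (nr % 8));
}

static void dump_dirty_runs(const unsigned char *bm, unsigned long nbits)
{
    unsigned long cur = 0;

    while (cur < nbits) {
        unsigned long one, zero;

        /* find the start of the next dirty run */
        for (one = cur; one < nbits && !test_bit_simple(bm, one); one++) {
        }
        if (one >= nbits) {
            break;
        }
        /* find where the dirty run ends */
        for (zero = one + 1; zero < nbits && test_bit_simple(bm, zero); zero++) {
        }
        printf("discard run: start=%lu length=%lu\n", one, zero - one);
        cur = zero;
    }
}

int main(void)
{
    /* bits 1..3 and 6 set: expect runs (1,3) and (6,1) */
    unsigned char bitmap[1] = { 0x4e };

    dump_dirty_runs(bitmap, 8);
    return 0;
}
#endif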
1962
1963 /**
1964 * postcopy_each_ram_send_discard: discard all RAMBlocks
1965 *
1966 * Returns 0 for success or negative for error
1967 *
1968 * Utility for the outgoing postcopy code.
1969 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1970 * passing it bitmap indexes and name.
1971 * (qemu_ram_foreach_block ends up passing unscaled lengths
1972 * which would mean postcopy code would have to deal with target page)
1973 *
1974 * @ms: current migration state
1975 */
1976 static int postcopy_each_ram_send_discard(MigrationState *ms)
1977 {
1978 struct RAMBlock *block;
1979 int ret;
1980
1981 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1982 postcopy_discard_send_init(ms, block->idstr);
1983
1984 /*
1985 * Postcopy sends chunks of bitmap over the wire, but it
1986 * just needs indexes at this point; this avoids it having
1987 * target-page-specific code.
1988 */
1989 ret = postcopy_send_discard_bm_ram(ms, block);
1990 postcopy_discard_send_finish(ms);
1991 if (ret) {
1992 return ret;
1993 }
1994 }
1995
1996 return 0;
1997 }
1998
1999 /**
2000 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2001 *
2002 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2003 * bitmap of a RAMBlock so that dirtiness is tracked in whole host
2004 * pages.
2005 *
2006 * Postcopy requires that all target pages in a hostpage are dirty or
2007 * clean, not a mix. This function canonicalizes the bitmaps.
2008 *
2009 * @ms: current migration state
2010 * @block: block that contains the page we want to canonicalize
2011 */
2012 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2013 {
2014 RAMState *rs = ram_state;
2015 unsigned long *bitmap = block->bmap;
2016 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2017 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2018 unsigned long run_start;
2019
2020 if (block->page_size == TARGET_PAGE_SIZE) {
2021 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2022 return;
2023 }
2024
2025 /* Find a dirty page */
2026 run_start = find_next_bit(bitmap, pages, 0);
2027
2028 while (run_start < pages) {
2029
2030 /*
2031 * If the start of this run is host-page aligned, skip to the end of
2032 * the run; a run that starts or ends mid host page is fixed up below.
2033 */
2034 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2035 /* Find the end of this run */
2036 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2037 /*
2038 * If the end isn't at the start of a host page, then the
2039 * run doesn't finish at the end of a host page
2040 * and we need to discard.
2041 */
2042 }
2043
2044 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2045 unsigned long page;
2046 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2047 host_ratio);
2048 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2049
2050 /* Clean up the bitmap */
2051 for (page = fixup_start_addr;
2052 page < fixup_start_addr + host_ratio; page++) {
2053 /*
2054 * Remark them as dirty, updating the count for any pages
2055 * that weren't previously dirty.
2056 */
2057 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2058 }
2059 }
2060
2061 /* Find the next dirty page for the next iteration */
2062 run_start = find_next_bit(bitmap, pages, run_start);
2063 }
2064 }
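
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the host-page
 * alignment arithmetic used above. The ALIGN_DOWN/ALIGN_UP macros mirror
 * QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP(); the page sizes are only examples.
 */
#if 0
#include <stdio.h>

#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m)   ALIGN_DOWN((n) + (m) - 1, (m))

int main(void)
{
    unsigned long host_ratio = 512;   /* 2MiB hugepage / 4KiB target page */
    unsigned long run_start = 1000;   /* dirty run starts mid host page */

    unsigned long fixup_start = ALIGN_DOWN(run_start, host_ratio);
    unsigned long next_scan = ALIGN_UP(run_start, host_ratio);

    /* the whole host page [512, 1024) gets re-marked dirty */
    printf("re-dirty target pages [%lu, %lu), continue scan at %lu\n",
           fixup_start, fixup_start + host_ratio, next_scan);
    return 0;
}
#endif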
2065
2066 /**
2067 * postcopy_chunk_hostpages: discard any partially sent host page
2068 *
2069 * Utility for the outgoing postcopy code.
2070 *
2071 * Discard any partially sent host-page size chunks, mark any partially
2072 * dirty host-page size chunks as all dirty. In this case the host-page
2073 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2074 *
2075 * Returns zero on success
2076 *
2077 * @ms: current migration state
2078 * @block: block we want to work with
2079 */
2080 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2081 {
2082 postcopy_discard_send_init(ms, block->idstr);
2083
2084 /*
2085 * Ensure that all partially dirty host pages are made fully dirty.
2086 */
2087 postcopy_chunk_hostpages_pass(ms, block);
2088
2089 postcopy_discard_send_finish(ms);
2090 return 0;
2091 }
2092
2093 /**
2094 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2095 *
2096 * Returns zero on success
2097 *
2098 * Transmit the set of pages to be discarded after precopy to the target.
2099 * These are pages that:
2100 * a) have been previously transmitted but are now dirty again, or
2101 * b) have never been transmitted; this ensures that any pages on the
2102 * destination that have been mapped by background tasks get discarded
2103 * (transparent huge pages are the specific concern).
2104 * Hopefully this set is pretty sparse.
2105 *
2106 * @ms: current migration state
2107 */
2108 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2109 {
2110 RAMState *rs = ram_state;
2111 RAMBlock *block;
2112 int ret;
2113
2114 RCU_READ_LOCK_GUARD();
2115
2116 /* This should be our last sync, the src is now paused */
2117 migration_bitmap_sync(rs);
2118
2119 /* Easiest way to make sure we don't resume in the middle of a host-page */
2120 rs->last_seen_block = NULL;
2121 rs->last_sent_block = NULL;
2122 rs->last_page = 0;
2123
2124 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2125 /* Deal with TPS != HPS and huge pages */
2126 ret = postcopy_chunk_hostpages(ms, block);
2127 if (ret) {
2128 return ret;
2129 }
2130
2131 #ifdef DEBUG_POSTCOPY
2132 ram_debug_dump_bitmap(block->bmap, true,
2133 block->used_length >> TARGET_PAGE_BITS);
2134 #endif
2135 }
2136 trace_ram_postcopy_send_discard_bitmap();
2137
2138 ret = postcopy_each_ram_send_discard(ms);
2139
2140 return ret;
2141 }
2142
2143 /**
2144 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2145 *
2146 * Returns zero on success
2147 *
2148 * @rbname: name of the RAMBlock of the request. NULL means the
2149 * same as the last one.
2150 * @start: RAMBlock starting page
2151 * @length: RAMBlock size
2152 */
2153 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2154 {
2155 trace_ram_discard_range(rbname, start, length);
2156
2157 RCU_READ_LOCK_GUARD();
2158 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2159
2160 if (!rb) {
2161 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2162 return -1;
2163 }
2164
2165 /*
2166 * On source VM, we don't need to update the received bitmap since
2167 * we don't even have one.
2168 */
2169 if (rb->receivedmap) {
2170 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2171 length >> qemu_target_page_bits());
2172 }
2173
2174 return ram_block_discard_range(rb, start, length);
2175 }
2176
2177 /*
2178 * For every allocation, we try not to crash the VM if the
2179 * allocation fails.
2180 */
2181 static int xbzrle_init(void)
2182 {
2183 Error *local_err = NULL;
2184
2185 if (!migrate_use_xbzrle()) {
2186 return 0;
2187 }
2188
2189 XBZRLE_cache_lock();
2190
2191 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2192 if (!XBZRLE.zero_target_page) {
2193 error_report("%s: Error allocating zero page", __func__);
2194 goto err_out;
2195 }
2196
2197 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2198 TARGET_PAGE_SIZE, &local_err);
2199 if (!XBZRLE.cache) {
2200 error_report_err(local_err);
2201 goto free_zero_page;
2202 }
2203
2204 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2205 if (!XBZRLE.encoded_buf) {
2206 error_report("%s: Error allocating encoded_buf", __func__);
2207 goto free_cache;
2208 }
2209
2210 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2211 if (!XBZRLE.current_buf) {
2212 error_report("%s: Error allocating current_buf", __func__);
2213 goto free_encoded_buf;
2214 }
2215
2216 /* We are all good */
2217 XBZRLE_cache_unlock();
2218 return 0;
2219
2220 free_encoded_buf:
2221 g_free(XBZRLE.encoded_buf);
2222 XBZRLE.encoded_buf = NULL;
2223 free_cache:
2224 cache_fini(XBZRLE.cache);
2225 XBZRLE.cache = NULL;
2226 free_zero_page:
2227 g_free(XBZRLE.zero_target_page);
2228 XBZRLE.zero_target_page = NULL;
2229 err_out:
2230 XBZRLE_cache_unlock();
2231 return -ENOMEM;
2232 }
2233
2234 static int ram_state_init(RAMState **rsp)
2235 {
2236 *rsp = g_try_new0(RAMState, 1);
2237
2238 if (!*rsp) {
2239 error_report("%s: Init ramstate fail", __func__);
2240 return -1;
2241 }
2242
2243 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2244 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2245 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2246
2247 /*
2248 * Count the total number of pages used by ram blocks not including any
2249 * gaps due to alignment or unplugs.
2250 * This must match with the initial values of dirty bitmap.
2251 */
2252 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2253 ram_state_reset(*rsp);
2254
2255 return 0;
2256 }
2257
2258 static void ram_list_init_bitmaps(void)
2259 {
2260 MigrationState *ms = migrate_get_current();
2261 RAMBlock *block;
2262 unsigned long pages;
2263 uint8_t shift;
2264
2265 /* Skip setting bitmap if there is no RAM */
2266 if (ram_bytes_total()) {
2267 shift = ms->clear_bitmap_shift;
2268 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2269 error_report("clear_bitmap_shift (%u) too big, using "
2270 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2271 shift = CLEAR_BITMAP_SHIFT_MAX;
2272 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2273 error_report("clear_bitmap_shift (%u) too small, using "
2274 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2275 shift = CLEAR_BITMAP_SHIFT_MIN;
2276 }
2277
2278 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2279 pages = block->max_length >> TARGET_PAGE_BITS;
2280 /*
2281 * The initial dirty bitmap for migration must be set with all
2282 * ones to make sure we'll migrate every guest RAM page to
2283 * destination.
2284 * Here we set RAMBlock.bmap all to 1 because when restarting a
2285 * new migration after a failed one, ram_list.
2286 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2287 * guest memory.
2288 */
2289 block->bmap = bitmap_new(pages);
2290 bitmap_set(block->bmap, 0, pages);
2291 block->clear_bmap_shift = shift;
2292 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2293 }
2294 }
2295 }
2296
2297 static void ram_init_bitmaps(RAMState *rs)
2298 {
2299 /* For memory_global_dirty_log_start below. */
2300 qemu_mutex_lock_iothread();
2301 qemu_mutex_lock_ramlist();
2302
2303 WITH_RCU_READ_LOCK_GUARD() {
2304 ram_list_init_bitmaps();
2305 memory_global_dirty_log_start();
2306 migration_bitmap_sync_precopy(rs);
2307 }
2308 qemu_mutex_unlock_ramlist();
2309 qemu_mutex_unlock_iothread();
2310 }
2311
2312 static int ram_init_all(RAMState **rsp)
2313 {
2314 if (ram_state_init(rsp)) {
2315 return -1;
2316 }
2317
2318 if (xbzrle_init()) {
2319 ram_state_cleanup(rsp);
2320 return -1;
2321 }
2322
2323 ram_init_bitmaps(*rsp);
2324
2325 return 0;
2326 }
2327
2328 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2329 {
2330 RAMBlock *block;
2331 uint64_t pages = 0;
2332
2333 /*
2334 * Postcopy is not using xbzrle/compression, so no need for that.
2335 * Also, since the source is already halted, we don't need to care
2336 * about dirty page logging either.
2337 */
2338
2339 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2340 pages += bitmap_count_one(block->bmap,
2341 block->used_length >> TARGET_PAGE_BITS);
2342 }
2343
2344 /* This may not be aligned with current bitmaps. Recalculate. */
2345 rs->migration_dirty_pages = pages;
2346
2347 rs->last_seen_block = NULL;
2348 rs->last_sent_block = NULL;
2349 rs->last_page = 0;
2350 rs->last_version = ram_list.version;
2351 /*
2352 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2353 * matter what we have sent.
2354 */
2355 rs->ram_bulk_stage = false;
2356
2357 /* Update RAMState cache of output QEMUFile */
2358 rs->f = out;
2359
2360 trace_ram_state_resume_prepare(pages);
2361 }
2362
2363 /*
2364 * This function clears bits of the free pages reported by the caller from the
2365 * migration dirty bitmap. @addr is the host address corresponding to the
2366 * start of the continuous guest free pages, and @len is the total bytes of
2367 * those pages.
2368 */
2369 void qemu_guest_free_page_hint(void *addr, size_t len)
2370 {
2371 RAMBlock *block;
2372 ram_addr_t offset;
2373 size_t used_len, start, npages;
2374 MigrationState *s = migrate_get_current();
2375
2376 /* This function is currently expected to be used during live migration */
2377 if (!migration_is_setup_or_active(s->state)) {
2378 return;
2379 }
2380
2381 for (; len > 0; len -= used_len, addr += used_len) {
2382 block = qemu_ram_block_from_host(addr, false, &offset);
2383 if (unlikely(!block || offset >= block->used_length)) {
2384 /*
2385 * The implementation might not support RAMBlock resize during
2386 * live migration, but it could happen in theory with future
2387 * updates. So we add a check here to capture that case.
2388 */
2389 error_report_once("%s unexpected error", __func__);
2390 return;
2391 }
2392
2393 if (len <= block->used_length - offset) {
2394 used_len = len;
2395 } else {
2396 used_len = block->used_length - offset;
2397 }
2398
2399 start = offset >> TARGET_PAGE_BITS;
2400 npages = used_len >> TARGET_PAGE_BITS;
2401
2402 qemu_mutex_lock(&ram_state->bitmap_mutex);
2403 ram_state->migration_dirty_pages -=
2404 bitmap_count_one_with_offset(block->bmap, start, npages);
2405 bitmap_clear(block->bmap, start, npages);
2406 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2407 }
2408 }
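
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the
 * byte-to-page conversion done above: a free (offset, length) range inside
 * a block becomes a (start, npages) pair whose bits are cleared from the
 * dirty bitmap. The 12-bit page shift is only an example.
 */
#if 0
#include <stdio.h>

#define EXAMPLE_PAGE_BITS 12            /* 4KiB target pages */

int main(void)
{
    unsigned long offset = 0x5000;      /* free range offset in the block */
    unsigned long used_len = 0x3000;    /* bytes covered by the hint */

    unsigned long start = offset >> EXAMPLE_PAGE_BITS;
    unsigned long npages = used_len >> EXAMPLE_PAGE_BITS;

    printf("clear dirty bits [%lu, %lu)\n", start, start + npages);
    return 0;
}
#endif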
2409
2410 /*
2411 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2412 * long-running RCU critical section. When RCU reclaims in the code
2413 * start to become numerous it will be necessary to reduce the
2414 * granularity of these critical sections.
2415 */
2416
2417 /**
2418 * ram_save_setup: Setup RAM for migration
2419 *
2420 * Returns zero to indicate success and negative for error
2421 *
2422 * @f: QEMUFile where to send the data
2423 * @opaque: RAMState pointer
2424 */
2425 static int ram_save_setup(QEMUFile *f, void *opaque)
2426 {
2427 RAMState **rsp = opaque;
2428 RAMBlock *block;
2429
2430 if (compress_threads_save_setup()) {
2431 return -1;
2432 }
2433
2434 /* migration has already setup the bitmap, reuse it. */
2435 if (!migration_in_colo_state()) {
2436 if (ram_init_all(rsp) != 0) {
2437 compress_threads_save_cleanup();
2438 return -1;
2439 }
2440 }
2441 (*rsp)->f = f;
2442
2443 WITH_RCU_READ_LOCK_GUARD() {
2444 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2445
2446 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2447 qemu_put_byte(f, strlen(block->idstr));
2448 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2449 qemu_put_be64(f, block->used_length);
2450 if (migrate_postcopy_ram() && block->page_size !=
2451 qemu_host_page_size) {
2452 qemu_put_be64(f, block->page_size);
2453 }
2454 if (migrate_ignore_shared()) {
2455 qemu_put_be64(f, block->mr->addr);
2456 }
2457 }
2458 }
2459
2460 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2461 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2462
2463 multifd_send_sync_main(f);
2464 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2465 qemu_fflush(f);
2466
2467 return 0;
2468 }
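
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the
 * per-RAMBlock record that ram_save_setup() emits: a one-byte idstr length,
 * the idstr bytes, and a big-endian 64-bit used_length. It serializes into
 * a plain buffer instead of a QEMUFile and omits the optional page_size and
 * mr->addr fields.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t put_be64(uint8_t *buf, uint64_t v)
{
    for (int i = 0; i < 8; i++) {
        buf[i] = v >> (56 - 8 * i);
    }
    return 8;
}

static size_t encode_block_header(uint8_t *buf, const char *idstr,
                                  uint64_t used_length)
{
    size_t len = strlen(idstr);
    size_t pos = 0;

    buf[pos++] = (uint8_t)len;               /* idstr length */
    memcpy(buf + pos, idstr, len);           /* idstr, not NUL terminated */
    pos += len;
    pos += put_be64(buf + pos, used_length); /* used_length, big endian */
    return pos;
}

int main(void)
{
    uint8_t buf[64];
    size_t n = encode_block_header(buf, "pc.ram", 128ULL << 20);

    printf("encoded %zu bytes for block pc.ram\n", n);
    return 0;
}
#endif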
2469
2470 /**
2471 * ram_save_iterate: iterative stage for migration
2472 *
2473 * Returns zero to indicate success and negative for error
2474 *
2475 * @f: QEMUFile where to send the data
2476 * @opaque: RAMState pointer
2477 */
2478 static int ram_save_iterate(QEMUFile *f, void *opaque)
2479 {
2480 RAMState **temp = opaque;
2481 RAMState *rs = *temp;
2482 int ret = 0;
2483 int i;
2484 int64_t t0;
2485 int done = 0;
2486
2487 if (blk_mig_bulk_active()) {
2488 /* Avoid transferring ram during bulk phase of block migration as
2489 * the bulk phase will usually take a long time and transferring
2490 * ram updates during that time is pointless. */
2491 goto out;
2492 }
2493
2494 WITH_RCU_READ_LOCK_GUARD() {
2495 if (ram_list.version != rs->last_version) {
2496 ram_state_reset(rs);
2497 }
2498
2499 /* Read version before ram_list.blocks */
2500 smp_rmb();
2501
2502 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2503
2504 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2505 i = 0;
2506 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2507 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2508 int pages;
2509
2510 if (qemu_file_get_error(f)) {
2511 break;
2512 }
2513
2514 pages = ram_find_and_save_block(rs, false);
2515 /* no more pages to send */
2516 if (pages == 0) {
2517 done = 1;
2518 break;
2519 }
2520
2521 if (pages < 0) {
2522 qemu_file_set_error(f, pages);
2523 break;
2524 }
2525
2526 rs->target_page_count += pages;
2527
2528 /*
2529 * During postcopy, it is necessary to make sure one whole host
2530 * page is sent in one chunk.
2531 */
2532 if (migrate_postcopy_ram()) {
2533 flush_compressed_data(rs);
2534 }
2535
2536 /*
2537 * We want to check in the 1st loop, just in case it was the 1st
2538 * time and we had to sync the dirty bitmap.
2539 * qemu_clock_get_ns() is a bit expensive, so we only check every
2540 * few iterations.
2541 */
2542 if ((i & 63) == 0) {
2543 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2544 1000000;
2545 if (t1 > MAX_WAIT) {
2546 trace_ram_save_iterate_big_wait(t1, i);
2547 break;
2548 }
2549 }
2550 i++;
2551 }
2552 }
2553
2554 /*
2555 * Must occur before EOS (or any QEMUFile operation)
2556 * because of RDMA protocol.
2557 */
2558 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2559
2560 out:
2561 if (ret >= 0
2562 && migration_is_setup_or_active(migrate_get_current()->state)) {
2563 multifd_send_sync_main(rs->f);
2564 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2565 qemu_fflush(f);
2566 ram_counters.transferred += 8;
2567
2568 ret = qemu_file_get_error(f);
2569 }
2570 if (ret < 0) {
2571 return ret;
2572 }
2573
2574 return done;
2575 }
2576
2577 /**
2578 * ram_save_complete: function called to send the remaining amount of ram
2579 *
2580 * Returns zero to indicate success or negative on error
2581 *
2582 * Called with iothread lock
2583 *
2584 * @f: QEMUFile where to send the data
2585 * @opaque: RAMState pointer
2586 */
2587 static int ram_save_complete(QEMUFile *f, void *opaque)
2588 {
2589 RAMState **temp = opaque;
2590 RAMState *rs = *temp;
2591 int ret = 0;
2592
2593 WITH_RCU_READ_LOCK_GUARD() {
2594 if (!migration_in_postcopy()) {
2595 migration_bitmap_sync_precopy(rs);
2596 }
2597
2598 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2599
2600 /* try transferring iterative blocks of memory */
2601
2602 /* flush all remaining blocks regardless of rate limiting */
2603 while (true) {
2604 int pages;
2605
2606 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2607 /* no more blocks to send */
2608 if (pages == 0) {
2609 break;
2610 }
2611 if (pages < 0) {
2612 ret = pages;
2613 break;
2614 }
2615 }
2616
2617 flush_compressed_data(rs);
2618 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2619 }
2620
2621 if (ret >= 0) {
2622 multifd_send_sync_main(rs->f);
2623 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2624 qemu_fflush(f);
2625 }
2626
2627 return ret;
2628 }
2629
2630 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2631 uint64_t *res_precopy_only,
2632 uint64_t *res_compatible,
2633 uint64_t *res_postcopy_only)
2634 {
2635 RAMState **temp = opaque;
2636 RAMState *rs = *temp;
2637 uint64_t remaining_size;
2638
2639 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2640
2641 if (!migration_in_postcopy() &&
2642 remaining_size < max_size) {
2643 qemu_mutex_lock_iothread();
2644 WITH_RCU_READ_LOCK_GUARD() {
2645 migration_bitmap_sync_precopy(rs);
2646 }
2647 qemu_mutex_unlock_iothread();
2648 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2649 }
2650
2651 if (migrate_postcopy_ram()) {
2652 /* We can do postcopy, and all the data is postcopiable */
2653 *res_compatible += remaining_size;
2654 } else {
2655 *res_precopy_only += remaining_size;
2656 }
2657 }
2658
2659 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2660 {
2661 unsigned int xh_len;
2662 int xh_flags;
2663 uint8_t *loaded_data;
2664
2665 /* extract RLE header */
2666 xh_flags = qemu_get_byte(f);
2667 xh_len = qemu_get_be16(f);
2668
2669 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2670 error_report("Failed to load XBZRLE page - wrong compression!");
2671 return -1;
2672 }
2673
2674 if (xh_len > TARGET_PAGE_SIZE) {
2675 error_report("Failed to load XBZRLE page - len overflow!");
2676 return -1;
2677 }
2678 loaded_data = XBZRLE.decoded_buf;
2679 /* load data and decode */
2680 /* it can change loaded_data to point to an internal buffer */
2681 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2682
2683 /* decode RLE */
2684 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2685 TARGET_PAGE_SIZE) == -1) {
2686 error_report("Failed to load XBZRLE page - decode error!");
2687 return -1;
2688 }
2689
2690 return 0;
2691 }
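
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the XBZRLE
 * record header parsed above: a one-byte flags field followed by a
 * big-endian 16-bit encoded length, then the encoded data itself. The flag
 * value used here is only an illustrative assumption.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_FLAG_XBZRLE 0x1         /* assumed value, for illustration */

static int parse_xbzrle_header(const uint8_t *buf, size_t buflen,
                               unsigned int *xh_len)
{
    if (buflen < 3 || buf[0] != EXAMPLE_FLAG_XBZRLE) {
        return -1;                      /* short record or wrong flag */
    }
    *xh_len = (buf[1] << 8) | buf[2];   /* big-endian 16-bit length */
    return 0;
}

int main(void)
{
    const uint8_t record[] = { 0x1, 0x00, 0x20 };   /* flags, len = 32 */
    unsigned int len;

    if (!parse_xbzrle_header(record, sizeof(record), &len)) {
        printf("XBZRLE payload length: %u bytes\n", len);
    }
    return 0;
}
#endif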
2692
2693 /**
2694 * ram_block_from_stream: read a RAMBlock id from the migration stream
2695 *
2696 * Must be called from within a rcu critical section.
2697 *
2698 * Returns a pointer from within the RCU-protected ram_list.
2699 *
2700 * @f: QEMUFile where to read the data from
2701 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2702 */
2703 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2704 {
2705 static RAMBlock *block = NULL;
2706 char id[256];
2707 uint8_t len;
2708
2709 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2710 if (!block) {
2711 error_report("Ack, bad migration stream!");
2712 return NULL;
2713 }
2714 return block;
2715 }
2716
2717 len = qemu_get_byte(f);
2718 qemu_get_buffer(f, (uint8_t *)id, len);
2719 id[len] = 0;
2720
2721 block = qemu_ram_block_by_name(id);
2722 if (!block) {
2723 error_report("Can't find block %s", id);
2724 return NULL;
2725 }
2726
2727 if (ramblock_is_ignored(block)) {
2728 error_report("block %s should not be migrated !", id);
2729 return NULL;
2730 }
2731
2732 return block;
2733 }
2734
2735 static inline void *host_from_ram_block_offset(RAMBlock *block,
2736 ram_addr_t offset)
2737 {
2738 if (!offset_in_ramblock(block, offset)) {
2739 return NULL;
2740 }
2741
2742 return block->host + offset;
2743 }
2744
2745 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2746 ram_addr_t offset, bool record_bitmap)
2747 {
2748 if (!offset_in_ramblock(block, offset)) {
2749 return NULL;
2750 }
2751 if (!block->colo_cache) {
2752 error_report("%s: colo_cache is NULL in block :%s",
2753 __func__, block->idstr);
2754 return NULL;
2755 }
2756
2757 /*
2758 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2759 * It helps us decide which pages in the RAM cache should be flushed
2760 * into the VM's RAM later.
2761 */
2762 if (record_bitmap &&
2763 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2764 ram_state->migration_dirty_pages++;
2765 }
2766 return block->colo_cache + offset;
2767 }
2768
2769 /**
2770 * ram_handle_compressed: handle the zero page case
2771 *
2772 * If a page (or a whole RDMA chunk) has been
2773 * determined to be zero, then zap it.
2774 *
2775 * @host: host address for the zero page
2776 * @ch: what the page is filled from. We only support zero
2777 * @size: size of the zero page
2778 */
2779 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2780 {
2781 if (ch != 0 || !is_zero_range(host, size)) {
2782 memset(host, ch, size);
2783 }
2784 }
2785
2786 /* return the size after decompression, or negative value on error */
2787 static int
2788 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2789 const uint8_t *source, size_t source_len)
2790 {
2791 int err;
2792
2793 err = inflateReset(stream);
2794 if (err != Z_OK) {
2795 return -1;
2796 }
2797
2798 stream->avail_in = source_len;
2799 stream->next_in = (uint8_t *)source;
2800 stream->avail_out = dest_len;
2801 stream->next_out = dest;
2802
2803 err = inflate(stream, Z_NO_FLUSH);
2804 if (err != Z_STREAM_END) {
2805 return -1;
2806 }
2807
2808 return stream->total_out;
2809 }
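
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the zlib
 * pattern used above: one persistent z_stream is initialised once, then
 * reset and fed one whole compressed page per call, expecting Z_STREAM_END
 * for a complete page. Page size and compression level are only examples.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define EXAMPLE_PAGE_SIZE 4096

static int uncompress_page(z_stream *stream, uint8_t *dest, size_t dest_len,
                           const uint8_t *source, size_t source_len)
{
    if (inflateReset(stream) != Z_OK) {
        return -1;
    }
    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* a complete page must decode to exactly one deflate stream */
    if (inflate(stream, Z_NO_FLUSH) != Z_STREAM_END) {
        return -1;
    }
    return stream->total_out;
}

int main(void)
{
    static uint8_t page[EXAMPLE_PAGE_SIZE], out[EXAMPLE_PAGE_SIZE];
    static uint8_t comp[EXAMPLE_PAGE_SIZE + 128];
    uLongf comp_len = sizeof(comp);
    z_stream stream = { 0 };

    memset(page, 'x', sizeof(page));
    compress2(comp, &comp_len, page, EXAMPLE_PAGE_SIZE, Z_BEST_SPEED);

    if (inflateInit(&stream) != Z_OK) {
        return 1;
    }
    printf("decompressed %d bytes\n",
           uncompress_page(&stream, out, EXAMPLE_PAGE_SIZE, comp, comp_len));
    inflateEnd(&stream);
    return 0;
}
#endif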
2810
2811 static void *do_data_decompress(void *opaque)
2812 {
2813 DecompressParam *param = opaque;
2814 unsigned long pagesize;
2815 uint8_t *des;
2816 int len, ret;
2817
2818 qemu_mutex_lock(&param->mutex);
2819 while (!param->quit) {
2820 if (param->des) {
2821 des = param->des;
2822 len = param->len;
2823 param->des = 0;
2824 qemu_mutex_unlock(&param->mutex);
2825
2826 pagesize = TARGET_PAGE_SIZE;
2827
2828 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2829 param->compbuf, len);
2830 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2831 error_report("decompress data failed");
2832 qemu_file_set_error(decomp_file, ret);
2833 }
2834
2835 qemu_mutex_lock(&decomp_done_lock);
2836 param->done = true;
2837 qemu_cond_signal(&decomp_done_cond);
2838 qemu_mutex_unlock(&decomp_done_lock);
2839
2840 qemu_mutex_lock(&param->mutex);
2841 } else {
2842 qemu_cond_wait(&param->cond, &param->mutex);
2843 }
2844 }
2845 qemu_mutex_unlock(&param->mutex);
2846
2847 return NULL;
2848 }
2849
2850 static int wait_for_decompress_done(void)
2851 {
2852 int idx, thread_count;
2853
2854 if (!migrate_use_compression()) {
2855 return 0;
2856 }
2857
2858 thread_count = migrate_decompress_threads();
2859 qemu_mutex_lock(&decomp_done_lock);
2860 for (idx = 0; idx < thread_count; idx++) {
2861 while (!decomp_param[idx].done) {
2862 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2863 }
2864 }
2865 qemu_mutex_unlock(&decomp_done_lock);
2866 return qemu_file_get_error(decomp_file);
2867 }
2868
2869 static void compress_threads_load_cleanup(void)
2870 {
2871 int i, thread_count;
2872
2873 if (!migrate_use_compression()) {
2874 return;
2875 }
2876 thread_count = migrate_decompress_threads();
2877 for (i = 0; i < thread_count; i++) {
2878 /*
2879 * we use it as an indicator of whether the thread was
2880 * properly initialized or not
2881 */
2882 if (!decomp_param[i].compbuf) {
2883 break;
2884 }
2885
2886 qemu_mutex_lock(&decomp_param[i].mutex);
2887 decomp_param[i].quit = true;
2888 qemu_cond_signal(&decomp_param[i].cond);
2889 qemu_mutex_unlock(&decomp_param[i].mutex);
2890 }
2891 for (i = 0; i < thread_count; i++) {
2892 if (!decomp_param[i].compbuf) {
2893 break;
2894 }
2895
2896 qemu_thread_join(decompress_threads + i);
2897 qemu_mutex_destroy(&decomp_param[i].mutex);
2898 qemu_cond_destroy(&decomp_param[i].cond);
2899 inflateEnd(&decomp_param[i].stream);
2900 g_free(decomp_param[i].compbuf);
2901 decomp_param[i].compbuf = NULL;
2902 }
2903 g_free(decompress_threads);
2904 g_free(decomp_param);
2905 decompress_threads = NULL;
2906 decomp_param = NULL;
2907 decomp_file = NULL;
2908 }
2909
2910 static int compress_threads_load_setup(QEMUFile *f)
2911 {
2912 int i, thread_count;
2913
2914 if (!migrate_use_compression()) {
2915 return 0;
2916 }
2917
2918 thread_count = migrate_decompress_threads();
2919 decompress_threads = g_new0(QemuThread, thread_count);
2920 decomp_param = g_new0(DecompressParam, thread_count);
2921 qemu_mutex_init(&decomp_done_lock);
2922 qemu_cond_init(&decomp_done_cond);
2923 decomp_file = f;
2924 for (i = 0; i < thread_count; i++) {
2925 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2926 goto exit;
2927 }
2928
2929 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2930 qemu_mutex_init(&decomp_param[i].mutex);
2931 qemu_cond_init(&decomp_param[i].cond);
2932 decomp_param[i].done = true;
2933 decomp_param[i].quit = false;
2934 qemu_thread_create(decompress_threads + i, "decompress",
2935 do_data_decompress, decomp_param + i,
2936 QEMU_THREAD_JOINABLE);
2937 }
2938 return 0;
2939 exit:
2940 compress_threads_load_cleanup();
2941 return -1;
2942 }
2943
2944 static void decompress_data_with_multi_threads(QEMUFile *f,
2945 void *host, int len)
2946 {
2947 int idx, thread_count;
2948
2949 thread_count = migrate_decompress_threads();
2950 qemu_mutex_lock(&decomp_done_lock);
2951 while (true) {
2952 for (idx = 0; idx < thread_count; idx++) {
2953 if (decomp_param[idx].done) {
2954 decomp_param[idx].done = false;
2955 qemu_mutex_lock(&decomp_param[idx].mutex);
2956 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2957 decomp_param[idx].des = host;
2958 decomp_param[idx].len = len;
2959 qemu_cond_signal(&decomp_param[idx].cond);
2960 qemu_mutex_unlock(&decomp_param[idx].mutex);
2961 break;
2962 }
2963 }
2964 if (idx < thread_count) {
2965 break;
2966 } else {
2967 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2968 }
2969 }
2970 qemu_mutex_unlock(&decomp_done_lock);
2971 }
2972
2973 /*
2974 * colo cache: this is for the secondary VM, where we cache the whole
2975 * memory of the secondary VM. The caller needs to hold the global
2976 * lock to call this helper.
2977 */
2978 int colo_init_ram_cache(void)
2979 {
2980 RAMBlock *block;
2981
2982 WITH_RCU_READ_LOCK_GUARD() {
2983 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2984 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2985 NULL,
2986 false);
2987 if (!block->colo_cache) {
2988 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2989 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2990 block->used_length);
2991 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2992 if (block->colo_cache) {
2993 qemu_anon_ram_free(block->colo_cache, block->used_length);
2994 block->colo_cache = NULL;
2995 }
2996 }
2997 return -errno;
2998 }
2999 }
3000 }
3001
3002 /*
3003 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3004 * to decide which pages in the cache should be flushed into the SVM's
3005 * RAM. Here we use the same name 'ram_bitmap' as for migration.
3006 */
3007 if (ram_bytes_total()) {
3008 RAMBlock *block;
3009
3010 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3011 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3012 block->bmap = bitmap_new(pages);
3013 }
3014 }
3015
3016 ram_state_init(&ram_state);
3017 return 0;
3018 }
3019
3020 /* TODO: duplicated with ram_init_bitmaps */
3021 void colo_incoming_start_dirty_log(void)
3022 {
3023 RAMBlock *block = NULL;
3024 /* For memory_global_dirty_log_start below. */
3025 qemu_mutex_lock_iothread();
3026 qemu_mutex_lock_ramlist();
3027
3028 memory_global_dirty_log_sync();
3029 WITH_RCU_READ_LOCK_GUARD() {
3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031 ramblock_sync_dirty_bitmap(ram_state, block);
3032 /* Discard this dirty bitmap record */
3033 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3034 }
3035 memory_global_dirty_log_start();
3036 }
3037 ram_state->migration_dirty_pages = 0;
3038 qemu_mutex_unlock_ramlist();
3039 qemu_mutex_unlock_iothread();
3040 }
3041
3042 /* The caller needs to hold the global lock to call this helper */
3043 void colo_release_ram_cache(void)
3044 {
3045 RAMBlock *block;
3046
3047 memory_global_dirty_log_stop();
3048 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3049 g_free(block->bmap);
3050 block->bmap = NULL;
3051 }
3052
3053 WITH_RCU_READ_LOCK_GUARD() {
3054 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3055 if (block->colo_cache) {
3056 qemu_anon_ram_free(block->colo_cache, block->used_length);
3057 block->colo_cache = NULL;
3058 }
3059 }
3060 }
3061 ram_state_cleanup(&ram_state);
3062 }
3063
3064 /**
3065 * ram_load_setup: Setup RAM for migration incoming side
3066 *
3067 * Returns zero to indicate success and negative for error
3068 *
3069 * @f: QEMUFile where to receive the data
3070 * @opaque: RAMState pointer
3071 */
3072 static int ram_load_setup(QEMUFile *f, void *opaque)
3073 {
3074 if (compress_threads_load_setup(f)) {
3075 return -1;
3076 }
3077
3078 xbzrle_load_setup();
3079 ramblock_recv_map_init();
3080
3081 return 0;
3082 }
3083
3084 static int ram_load_cleanup(void *opaque)
3085 {
3086 RAMBlock *rb;
3087
3088 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3089 qemu_ram_block_writeback(rb);
3090 }
3091
3092 xbzrle_load_cleanup();
3093 compress_threads_load_cleanup();
3094
3095 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3096 g_free(rb->receivedmap);
3097 rb->receivedmap = NULL;
3098 }
3099
3100 return 0;
3101 }
3102
3103 /**
3104 * ram_postcopy_incoming_init: allocate postcopy data structures
3105 *
3106 * Returns 0 for success and negative if there was one error
3107 *
3108 * @mis: current migration incoming state
3109 *
3110 * Allocate data structures etc needed by incoming migration with
3111 * postcopy-ram. postcopy-ram's similarly named
3112 * postcopy_ram_incoming_init does the work.
3113 */
3114 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3115 {
3116 return postcopy_ram_incoming_init(mis);
3117 }
3118
3119 /**
3120 * ram_load_postcopy: load a page in postcopy case
3121 *
3122 * Returns 0 for success or -errno in case of error
3123 *
3124 * Called in postcopy mode by ram_load().
3125 * rcu_read_lock is taken prior to this being called.
3126 *
3127 * @f: QEMUFile where to send the data
3128 */
3129 static int ram_load_postcopy(QEMUFile *f)
3130 {
3131 int flags = 0, ret = 0;
3132 bool place_needed = false;
3133 bool matches_target_page_size = false;
3134 MigrationIncomingState *mis = migration_incoming_get_current();
3135 /* Temporary page that is later 'placed' */
3136 void *postcopy_host_page = mis->postcopy_tmp_page;
3137 void *this_host = NULL;
3138 bool all_zero = false;
3139 int target_pages = 0;
3140
3141 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3142 ram_addr_t addr;
3143 void *host = NULL;
3144 void *page_buffer = NULL;
3145 void *place_source = NULL;
3146 RAMBlock *block = NULL;
3147 uint8_t ch;
3148 int len;
3149
3150 addr = qemu_get_be64(f);
3151
3152 /*
3153 * If there is a qemu file error, we should stop here; "addr"
3154 * may be invalid.
3155 */
3156 ret = qemu_file_get_error(f);
3157 if (ret) {
3158 break;
3159 }
3160
3161 flags = addr & ~TARGET_PAGE_MASK;
3162 addr &= TARGET_PAGE_MASK;
3163
3164 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3165 place_needed = false;
3166 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3167 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3168 block = ram_block_from_stream(f, flags);
3169
3170 host = host_from_ram_block_offset(block, addr);
3171 if (!host) {
3172 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3173 ret = -EINVAL;
3174 break;
3175 }
3176 target_pages++;
3177 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3178 /*
3179 * Postcopy requires that we place whole host pages atomically;
3180 * these may be huge pages for RAMBlocks that are backed by
3181 * hugetlbfs.
3182 * To make it atomic, the data is read into a temporary page
3183 * that's moved into place later.
3184 * The migration protocol uses, possibly smaller, target pages;
3185 * however, the source ensures it always sends all the components
3186 * of a host page in one chunk.
3187 */
3188 page_buffer = postcopy_host_page +
3189 ((uintptr_t)host & (block->page_size - 1));
3190 /* If all TP are zero then we can optimise the place */
3191 if (target_pages == 1) {
3192 all_zero = true;
3193 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3194 block->page_size);
3195 } else {
3196 /* not the 1st TP within the HP */
3197 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3198 (uintptr_t)this_host) {
3199 error_report("Non-same host page %p/%p",
3200 host, this_host);
3201 ret = -EINVAL;
3202 break;
3203 }
3204 }
3205
3206 /*
3207 * If it's the last part of a host page then we place the host
3208 * page
3209 */
3210 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3211 place_needed = true;
3212 target_pages = 0;
3213 }
3214 place_source = postcopy_host_page;
3215 }
3216
3217 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3218 case RAM_SAVE_FLAG_ZERO:
3219 ch = qemu_get_byte(f);
3220 /*
3221 * We can skip setting page_buffer when
3222 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3223 */
3224 if (ch || !matches_target_page_size) {
3225 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3226 }
3227 if (ch) {
3228 all_zero = false;
3229 }
3230 break;
3231
3232 case RAM_SAVE_FLAG_PAGE:
3233 all_zero = false;
3234 if (!matches_target_page_size) {
3235 /* For huge pages, we always use temporary buffer */
3236 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3237 } else {
3238 /*
3239 * For small pages that matches target page size, we
3240 * avoid the qemu_file copy. Instead we directly use
3241 * the buffer of QEMUFile to place the page. Note: we
3242 * cannot do any QEMUFile operation before using that
3243 * buffer to make sure the buffer is valid when
3244 * placing the page.
3245 */
3246 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3247 TARGET_PAGE_SIZE);
3248 }
3249 break;
3250 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3251 all_zero = false;
3252 len = qemu_get_be32(f);
3253 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3254 error_report("Invalid compressed data length: %d", len);
3255 ret = -EINVAL;
3256 break;
3257 }
3258 decompress_data_with_multi_threads(f, page_buffer, len);
3259 break;
3260
3261 case RAM_SAVE_FLAG_EOS:
3262 /* normal exit */
3263 multifd_recv_sync_main();
3264 break;
3265 default:
3266 error_report("Unknown combination of migration flags: %#x"
3267 " (postcopy mode)", flags);
3268 ret = -EINVAL;
3269 break;
3270 }
3271
3272 /* Got the whole host page, wait for decompress before placing. */
3273 if (place_needed) {
3274 ret |= wait_for_decompress_done();
3275 }
3276
3277 /* Detect for any possible file errors */
3278 if (!ret && qemu_file_get_error(f)) {
3279 ret = qemu_file_get_error(f);
3280 }
3281
3282 if (!ret && place_needed) {
3283 /* This gets called at the last target page in the host page */
3284 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3285 block->page_size);
3286
3287 if (all_zero) {
3288 ret = postcopy_place_page_zero(mis, place_dest,
3289 block);
3290 } else {
3291 ret = postcopy_place_page(mis, place_dest,
3292 place_source, block);
3293 }
3294 }
3295 }
3296
3297 return ret;
3298 }
3299
3300 static bool postcopy_is_advised(void)
3301 {
3302 PostcopyState ps = postcopy_state_get();
3303 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3304 }
3305
3306 static bool postcopy_is_running(void)
3307 {
3308 PostcopyState ps = postcopy_state_get();
3309 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3310 }
3311
3312 /*
3313 * Flush content of RAM cache into SVM's memory.
3314 * Only flush the pages that have been dirtied by the PVM or SVM or both.
3315 */
3316 static void colo_flush_ram_cache(void)
3317 {
3318 RAMBlock *block = NULL;
3319 void *dst_host;
3320 void *src_host;
3321 unsigned long offset = 0;
3322
3323 memory_global_dirty_log_sync();
3324 WITH_RCU_READ_LOCK_GUARD() {
3325 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3326 ramblock_sync_dirty_bitmap(ram_state, block);
3327 }
3328 }
3329
3330 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3331 WITH_RCU_READ_LOCK_GUARD() {
3332 block = QLIST_FIRST_RCU(&ram_list.blocks);
3333
3334 while (block) {
3335 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3336
3337 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3338 >= block->used_length) {
3339 offset = 0;
3340 block = QLIST_NEXT_RCU(block, next);
3341 } else {
3342 migration_bitmap_clear_dirty(ram_state, block, offset);
3343 dst_host = block->host
3344 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3345 src_host = block->colo_cache
3346 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3347 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3348 }
3349 }
3350 }
3351 trace_colo_flush_ram_cache_end();
3352 }
3353
3354 /**
3355 * ram_load_precopy: load pages in precopy case
3356 *
3357 * Returns 0 for success or -errno in case of error
3358 *
3359 * Called in precopy mode by ram_load().
3360 * rcu_read_lock is taken prior to this being called.
3361 *
3362 * @f: QEMUFile where to send the data
3363 */
3364 static int ram_load_precopy(QEMUFile *f)
3365 {
3366 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3367 /* ADVISE is sent earlier; it shows that the source has the postcopy capability on */
3368 bool postcopy_advised = postcopy_is_advised();
3369 if (!migrate_use_compression()) {
3370 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3371 }
3372
3373 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3374 ram_addr_t addr, total_ram_bytes;
3375 void *host = NULL, *host_bak = NULL;
3376 uint8_t ch;
3377
3378 /*
3379 * Yield periodically to let the main loop run, but an iteration of
3380 * the main loop is expensive, so only do it every few iterations.
3381 */
3382 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3383 aio_co_schedule(qemu_get_current_aio_context(),
3384 qemu_coroutine_self());
3385 qemu_coroutine_yield();
3386 }
3387 i++;
3388
3389 addr = qemu_get_be64(f);
3390 flags = addr & ~TARGET_PAGE_MASK;
3391 addr &= TARGET_PAGE_MASK;
3392
3393 if (flags & invalid_flags) {
3394 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3395 error_report("Received an unexpected compressed page");
3396 }
3397
3398 ret = -EINVAL;
3399 break;
3400 }
3401
3402 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3403 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3404 RAMBlock *block = ram_block_from_stream(f, flags);
3405
3406 host = host_from_ram_block_offset(block, addr);
3407 /*
3408 * After going into the COLO stage, we should not load pages
3409 * into the SVM's memory directly; we put them into colo_cache first.
3410 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3411 * Previously, we copied all this memory in the COLO preparing stage
3412 * while the VM was stopped, which is a time-consuming process.
3413 * Here we optimize it by backing up every page during the migration
3414 * process while COLO is enabled. Though it affects the speed of the
3415 * migration, it obviously reduces the downtime caused by backing up
3416 * all of the SVM's memory in the COLO preparing stage.
3417 */
3418 if (migration_incoming_colo_enabled()) {
3419 if (migration_incoming_in_colo_state()) {
3420 /* In COLO stage, put all pages into cache temporarily */
3421 host = colo_cache_from_block_offset(block, addr, true);
3422 } else {
3423 /*
3424 * In migration stage but before COLO stage,
3425 * put all pages into both the cache and the SVM's memory.
3426 */
3427 host_bak = colo_cache_from_block_offset(block, addr, false);
3428 }
3429 }
3430 if (!host) {
3431 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3432 ret = -EINVAL;
3433 break;
3434 }
3435 if (!migration_incoming_in_colo_state()) {
3436 ramblock_recv_bitmap_set(block, host);
3437 }
3438
3439 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3440 }
3441
3442 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3443 case RAM_SAVE_FLAG_MEM_SIZE:
3444 /* Synchronize RAM block list */
3445 total_ram_bytes = addr;
3446 while (!ret && total_ram_bytes) {
3447 RAMBlock *block;
3448 char id[256];
3449 ram_addr_t length;
3450
3451 len = qemu_get_byte(f);
3452 qemu_get_buffer(f, (uint8_t *)id, len);
3453 id[len] = 0;
3454 length = qemu_get_be64(f);
3455
3456 block = qemu_ram_block_by_name(id);
3457 if (block && !qemu_ram_is_migratable(block)) {
3458 error_report("block %s should not be migrated !", id);
3459 ret = -EINVAL;
3460 } else if (block) {
3461 if (length != block->used_length) {
3462 Error *local_err = NULL;
3463
3464 ret = qemu_ram_resize(block, length,
3465 &local_err);
3466 if (local_err) {
3467 error_report_err(local_err);
3468 }
3469 }
3470 /* For postcopy we need to check hugepage sizes match */
3471 if (postcopy_advised &&
3472 block->page_size != qemu_host_page_size) {
3473 uint64_t remote_page_size = qemu_get_be64(f);
3474 if (remote_page_size != block->page_size) {
3475 error_report("Mismatched RAM page size %s "
3476 "(local) %zd != %" PRId64,
3477 id, block->page_size,
3478 remote_page_size);
3479 ret = -EINVAL;
3480 }
3481 }
3482 if (migrate_ignore_shared()) {
3483 hwaddr addr = qemu_get_be64(f);
3484 if (ramblock_is_ignored(block) &&
3485 block->mr->addr != addr) {
3486 error_report("Mismatched GPAs for block %s "
3487 "%" PRId64 "!= %" PRId64,
3488 id, (uint64_t)addr,
3489 (uint64_t)block->mr->addr);
3490 ret = -EINVAL;
3491 }
3492 }
3493 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3494 block->idstr);
3495 } else {
3496 error_report("Unknown ramblock \"%s\", cannot "
3497 "accept migration", id);
3498 ret = -EINVAL;
3499 }
3500
3501 total_ram_bytes -= length;
3502 }
3503 break;
3504
3505 case RAM_SAVE_FLAG_ZERO:
3506 ch = qemu_get_byte(f);
3507 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3508 break;
3509
3510 case RAM_SAVE_FLAG_PAGE:
3511 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3512 break;
3513
3514 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3515 len = qemu_get_be32(f);
3516 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3517 error_report("Invalid compressed data length: %d", len);
3518 ret = -EINVAL;
3519 break;
3520 }
3521 decompress_data_with_multi_threads(f, host, len);
3522 break;
3523
3524 case RAM_SAVE_FLAG_XBZRLE:
3525 if (load_xbzrle(f, addr, host) < 0) {
3526 error_report("Failed to decompress XBZRLE page at "
3527 RAM_ADDR_FMT, addr);
3528 ret = -EINVAL;
3529 break;
3530 }
3531 break;
3532 case RAM_SAVE_FLAG_EOS:
3533 /* normal exit */
3534 multifd_recv_sync_main();
3535 break;
3536 default:
3537 if (flags & RAM_SAVE_FLAG_HOOK) {
3538 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3539 } else {
3540 error_report("Unknown combination of migration flags: %#x",
3541 flags);
3542 ret = -EINVAL;
3543 }
3544 }
3545 if (!ret) {
3546 ret = qemu_file_get_error(f);
3547 }
3548 if (!ret && host_bak) {
3549 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3550 }
3551 }
3552
3553 ret |= wait_for_decompress_done();
3554 return ret;
3555 }
3556
3557 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3558 {
3559 int ret = 0;
3560 static uint64_t seq_iter;
3561 /*
3562 * If system is running in postcopy mode, page inserts to host memory must
3563 * be atomic
3564 */
3565 bool postcopy_running = postcopy_is_running();
3566
3567 seq_iter++;
3568
3569 if (version_id != 4) {
3570 return -EINVAL;
3571 }
3572
3573 /*
3574 * This RCU critical section can be very long running.
3575 * When RCU reclaims in the code start to become numerous,
3576 * it will be necessary to reduce the granularity of this
3577 * critical section.
3578 */
3579 WITH_RCU_READ_LOCK_GUARD() {
3580 if (postcopy_running) {
3581 ret = ram_load_postcopy(f);
3582 } else {
3583 ret = ram_load_precopy(f);
3584 }
3585 }
3586 trace_ram_load_complete(ret, seq_iter);
3587
3588 if (!ret && migration_incoming_in_colo_state()) {
3589 colo_flush_ram_cache();
3590 }
3591 return ret;
3592 }
3593
3594 static bool ram_has_postcopy(void *opaque)
3595 {
3596 RAMBlock *rb;
3597 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3598 if (ramblock_is_pmem(rb)) {
3599 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
3600 "is not supported now!", rb->idstr, rb->host);
3601 return false;
3602 }
3603 }
3604
3605 return migrate_postcopy_ram();
3606 }
3607
3608 /* Sync all the dirty bitmap with destination VM. */
3609 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3610 {
3611 RAMBlock *block;
3612 QEMUFile *file = s->to_dst_file;
3613 int ramblock_count = 0;
3614
3615 trace_ram_dirty_bitmap_sync_start();
3616
3617 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3618 qemu_savevm_send_recv_bitmap(file, block->idstr);
3619 trace_ram_dirty_bitmap_request(block->idstr);
3620 ramblock_count++;
3621 }
3622
3623 trace_ram_dirty_bitmap_sync_wait();
3624
3625 /* Wait until all the ramblocks' dirty bitmap synced */
3626 while (ramblock_count--) {
3627 qemu_sem_wait(&s->rp_state.rp_sem);
3628 }
3629
3630 trace_ram_dirty_bitmap_sync_complete();
3631
3632 return 0;
3633 }
3634
3635 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3636 {
3637 qemu_sem_post(&s->rp_state.rp_sem);
3638 }
3639
3640 /*
3641 * Read the received bitmap, revert it as the initial dirty bitmap.
3642 * This is only used when the postcopy migration is paused but wants
3643 * to resume from a middle point.
3644 */
3645 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3646 {
3647 int ret = -EINVAL;
3648 QEMUFile *file = s->rp_state.from_dst_file;
3649 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3650 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3651 uint64_t size, end_mark;
3652
3653 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3654
3655 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3656 error_report("%s: incorrect state %s", __func__,
3657 MigrationStatus_str(s->state));
3658 return -EINVAL;
3659 }
3660
3661 /*
3662 * Note: see comments in ramblock_recv_bitmap_send() on why we
3663 * need the endianness conversion, and the paddings.
3664 */
3665 local_size = ROUND_UP(local_size, 8);
3666
3667 /* Add paddings */
3668 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3669
3670 size = qemu_get_be64(file);
3671
3672 /* The size of the bitmap should match with our ramblock */
3673 if (size != local_size) {
3674 error_report("%s: ramblock '%s' bitmap size mismatch "
3675 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3676 block->idstr, size, local_size);
3677 ret = -EINVAL;
3678 goto out;
3679 }
3680
3681 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3682 end_mark = qemu_get_be64(file);
3683
3684 ret = qemu_file_get_error(file);
3685 if (ret || size != local_size) {
3686 error_report("%s: read bitmap failed for ramblock '%s': %d"
3687 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3688 __func__, block->idstr, ret, local_size, size);
3689 ret = -EIO;
3690 goto out;
3691 }
3692
3693 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3694 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3695 __func__, block->idstr, end_mark);
3696 ret = -EINVAL;
3697 goto out;
3698 }
3699
3700 /*
3701 * Endianness conversion. We are in postcopy (though paused).
3702 * The dirty bitmap won't change. We can directly modify it.
3703 */
3704 bitmap_from_le(block->bmap, le_bitmap, nbits);
3705
3706 /*
3707 * What we received is "received bitmap". Revert it as the initial
3708 * dirty bitmap for this ramblock.
3709 */
3710 bitmap_complement(block->bmap, block->bmap, nbits);
3711
3712 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3713
3714 /*
3715 * We succeeded in syncing the bitmap for the current ramblock. If this is
3716 * the last one to sync, we need to notify the main send thread.
3717 */
3718 ram_dirty_bitmap_reload_notify(s);
3719
3720 ret = 0;
3721 out:
3722 g_free(le_bitmap);
3723 return ret;
3724 }
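
/*
 * A minimal standalone sketch (compiled out, not QEMU code) of the "revert"
 * step above: pages the destination reports as received are exactly the
 * pages that are no longer dirty, so complementing the received bitmap
 * yields the initial dirty bitmap for the resumed postcopy migration.
 */
#if 0
#include <stdio.h>

int main(void)
{
    /* destination reports pages 0, 1, 2 and 5 as received */
    unsigned char received = 0x27;              /* 0b00100111 */
    unsigned char dirty = (unsigned char)~received;

    for (int i = 0; i < 8; i++) {
        printf("page %d: received=%d -> dirty=%d\n",
               i, (received >> i) & 1, (dirty >> i) & 1);
    }
    return 0;
}
#endif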
3725
3726 static int ram_resume_prepare(MigrationState *s, void *opaque)
3727 {
3728 RAMState *rs = *(RAMState **)opaque;
3729 int ret;
3730
3731 ret = ram_dirty_bitmap_sync_all(s, rs);
3732 if (ret) {
3733 return ret;
3734 }
3735
3736 ram_state_resume_prepare(rs, s->to_dst_file);
3737
3738 return 0;
3739 }
3740
3741 static SaveVMHandlers savevm_ram_handlers = {
3742 .save_setup = ram_save_setup,
3743 .save_live_iterate = ram_save_iterate,
3744 .save_live_complete_postcopy = ram_save_complete,
3745 .save_live_complete_precopy = ram_save_complete,
3746 .has_postcopy = ram_has_postcopy,
3747 .save_live_pending = ram_save_pending,
3748 .load_state = ram_load,
3749 .save_cleanup = ram_save_cleanup,
3750 .load_setup = ram_load_setup,
3751 .load_cleanup = ram_load_cleanup,
3752 .resume_prepare = ram_resume_prepare,
3753 };
3754
3755 void ram_mig_init(void)
3756 {
3757 qemu_mutex_init(&XBZRLE.lock);
3758 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3759 }