git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
vfio/mlx5: Add REINIT support to VFIO_MIG_GET_PRECOPY_INFO
author Yishai Hadas <yishaih@nvidia.com>
Tue, 17 Mar 2026 16:17:53 +0000 (18:17 +0200)
committer Alex Williamson <alex@shazbot.org>
Thu, 19 Mar 2026 18:32:10 +0000 (12:32 -0600)
When userspace opts into VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2, the
driver may report the VFIO_PRECOPY_INFO_REINIT output flag in response
to the VFIO_MIG_GET_PRECOPY_INFO ioctl, along with a new initial_bytes
value.

The presence of the VFIO_PRECOPY_INFO_REINIT flag indicates to the
caller that new initial data is available in the migration stream.

If the firmware reports a new initial-data chunk, any previously dirty
bytes in memory are treated as initial bytes, since the caller must read
both sets before reaching the end of the initial-data region.

In this case, the driver issues a new SAVE command to fetch the data and
prepare it for a subsequent read() from userspace.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20260317161753.18964-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
drivers/vfio/pci/mlx5/cmd.c
drivers/vfio/pci/mlx5/cmd.h
drivers/vfio/pci/mlx5/main.c

index 18b8d85940703205d7599a3ec157d4bc1cbea70a..5fe0621b5fbd80d66d3a05f2310d43a8074552a3 100644 (file)
@@ -87,7 +87,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
                                          size_t *state_size, u64 *total_size,
-                                         u8 query_flags)
+                                         u8 *mig_state, u8 query_flags)
 {
        u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
        u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@@ -152,6 +152,10 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
                        MLX5_GET64(query_vhca_migration_state_out, out,
                                   remaining_total_size) : *state_size;
 
+       if (mig_state && mvdev->mig_state_cap)
+               *mig_state = MLX5_GET(query_vhca_migration_state_out, out,
+                                     migration_state);
+
        return 0;
 }
 
@@ -277,6 +281,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
        if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
                mvdev->chunk_mode = 1;
 
+       if (MLX5_CAP_GEN_2(mvdev->mdev, migration_state))
+               mvdev->mig_state_cap = 1;
+
 end:
        mlx5_vf_put_core_dev(mvdev->mdev);
 }
@@ -555,6 +562,7 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
 {
        spin_lock_irq(&buf->migf->list_lock);
        buf->stop_copy_chunk_num = 0;
+       buf->pre_copy_init_bytes_chunk = false;
        list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
        spin_unlock_irq(&buf->migf->list_lock);
 }
@@ -689,7 +697,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
                                !next_required_umem_size;
                if (async_data->header_buf) {
                        status = add_buf_header(async_data->header_buf, image_size,
-                                               initial_pre_copy);
+                                               initial_pre_copy ||
+                                               async_data->buf->pre_copy_init_bytes_chunk);
                        if (status)
                                goto err;
                }
@@ -708,9 +717,12 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
                        }
                }
                spin_unlock_irqrestore(&migf->list_lock, flags);
-               if (initial_pre_copy) {
+               if (initial_pre_copy || async_data->buf->pre_copy_init_bytes_chunk) {
                        migf->pre_copy_initial_bytes += image_size;
-                       migf->state = MLX5_MIGF_STATE_PRE_COPY;
+                       if (initial_pre_copy)
+                               migf->state = MLX5_MIGF_STATE_PRE_COPY;
+                       if (async_data->buf->pre_copy_init_bytes_chunk)
+                               async_data->buf->pre_copy_init_bytes_chunk = false;
                }
                if (stop_copy_last_chunk)
                        migf->state = MLX5_MIGF_STATE_COMPLETE;
index 7d2c10be2e60f0113cca724769623b86d413e3a7..deed0f132f39fe07851d2bb8f3ce4341fcb24f6c 100644 (file)
@@ -62,6 +62,7 @@ struct mlx5_vhca_data_buffer {
        u32 *mkey_in;
        enum dma_data_direction dma_dir;
        u8 stop_copy_chunk_num;
+       bool pre_copy_init_bytes_chunk;
        struct list_head buf_elm;
        struct mlx5_vf_migration_file *migf;
 };
@@ -97,6 +98,7 @@ struct mlx5_vf_migration_file {
        u32 record_tag;
        u64 stop_copy_prep_size;
        u64 pre_copy_initial_bytes;
+       u64 pre_copy_initial_bytes_start;
        size_t next_required_umem_size;
        u8 num_ready_chunks;
        /* Upon chunk mode preserve another set of buffers for stop_copy phase */
@@ -175,6 +177,7 @@ struct mlx5vf_pci_core_device {
        u8 mdev_detach:1;
        u8 log_active:1;
        u8 chunk_mode:1;
+       u8 mig_state_cap:1;
        struct completion tracker_comp;
        /* protect migration state */
        struct mutex state_mutex;
@@ -199,7 +202,7 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
                                          size_t *state_size, u64 *total_size,
-                                         u8 query_flags);
+                                         u8 *migration_state, u8 query_flags);
 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
                               const struct vfio_migration_ops *mig_ops,
                               const struct vfio_log_ops *log_ops);
index 68e051c48d4012888158f6f071415640f99c6fc5..de306dee1d1ad9daf47ca5a562077f591ec6c342 100644 (file)
@@ -464,8 +464,10 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
        struct mlx5_vhca_data_buffer *buf;
        struct vfio_precopy_info info = {};
        loff_t *pos = &filp->f_pos;
+       u8 migration_state = 0;
        size_t inc_length = 0;
-       bool end_of_data = false;
+       bool reinit_state;
+       bool end_of_data;
        int ret;
 
        ret = vfio_check_precopy_ioctl(&mvdev->core_device.vdev, cmd, arg,
@@ -492,7 +494,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
                 * As so, the other code below is safe with the proper locks.
                 */
                ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
-                                                           NULL, MLX5VF_QUERY_INC);
+                                                           NULL, &migration_state,
+                                                           MLX5VF_QUERY_INC);
                if (ret)
                        goto err_state_unlock;
        }
@@ -503,41 +506,67 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
                goto err_migf_unlock;
        }
 
-       if (migf->pre_copy_initial_bytes > *pos) {
-               info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+       /*
+        * opt-in for VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 serves
+        * as opt-in for VFIO_PRECOPY_INFO_REINIT as well
+        */
+       reinit_state = mvdev->core_device.vdev.precopy_info_v2 &&
+                       migration_state == MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_INIT;
+       end_of_data = !(migf->max_pos - *pos);
+       if (reinit_state) {
+               /*
+                * Any bytes already present in memory are treated as initial
+                * bytes, since the caller is required to read them before
+                * reaching the new initial-bytes region.
+                */
+               migf->pre_copy_initial_bytes_start = *pos;
+               migf->pre_copy_initial_bytes = migf->max_pos - *pos;
+               info.initial_bytes = migf->pre_copy_initial_bytes + inc_length;
+               info.flags |= VFIO_PRECOPY_INFO_REINIT;
        } else {
-               info.dirty_bytes = migf->max_pos - *pos;
-               if (!info.dirty_bytes)
-                       end_of_data = true;
-               info.dirty_bytes += inc_length;
+               if (migf->pre_copy_initial_bytes_start +
+                   migf->pre_copy_initial_bytes > *pos) {
+                       WARN_ON_ONCE(end_of_data);
+                       info.initial_bytes = migf->pre_copy_initial_bytes_start +
+                               migf->pre_copy_initial_bytes - *pos;
+               } else {
+                       info.dirty_bytes = (migf->max_pos - *pos) + inc_length;
+               }
        }
+       mutex_unlock(&migf->lock);
 
-       if (!end_of_data || !inc_length) {
-               mutex_unlock(&migf->lock);
-               goto done;
-       }
+       if ((reinit_state || end_of_data) && inc_length) {
+               /*
+                * In case we finished transferring the current state and the
+                * device has a dirty state, or that the device has a new init
+                * state, save a new state to be ready for.
+                */
+               buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
+                                            DMA_FROM_DEVICE);
+               if (IS_ERR(buf)) {
+                       ret = PTR_ERR(buf);
+                       mlx5vf_mark_err(migf);
+                       goto err_state_unlock;
+               }
 
-       mutex_unlock(&migf->lock);
-       /*
-        * We finished transferring the current state and the device has a
-        * dirty state, save a new state to be ready for.
-        */
-       buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
-                                    DMA_FROM_DEVICE);
-       if (IS_ERR(buf)) {
-               ret = PTR_ERR(buf);
-               mlx5vf_mark_err(migf);
-               goto err_state_unlock;
-       }
+               buf->pre_copy_init_bytes_chunk = reinit_state;
+               ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+               if (ret) {
+                       mlx5vf_mark_err(migf);
+                       mlx5vf_put_data_buffer(buf);
+                       goto err_state_unlock;
+               }
 
-       ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
-       if (ret) {
-               mlx5vf_mark_err(migf);
-               mlx5vf_put_data_buffer(buf);
-               goto err_state_unlock;
+               /*
+                * SAVE appends a header record via add_buf_header(),
+                * let's account it as well.
+                */
+               if (reinit_state)
+                       info.initial_bytes += sizeof(struct mlx5_vf_migration_header);
+               else
+                       info.dirty_bytes += sizeof(struct mlx5_vf_migration_header);
        }
 
-done:
        mlx5vf_state_mutex_unlock(mvdev);
        if (copy_to_user((void __user *)arg, &info,
                         offsetofend(struct vfio_precopy_info, dirty_bytes)))
@@ -570,7 +599,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                return -ENODEV;
 
-       ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
+       ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, NULL,
                                MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
        if (ret)
                goto err;
@@ -636,7 +665,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
        if (ret)
                goto out;
 
-       ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
+       ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, NULL, 0);
        if (ret)
                goto out_pd;
 
@@ -1123,7 +1152,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
                enum mlx5_vf_migf_state state;
                size_t size;
 
-               ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
+               ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, NULL,
                                        MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
                if (ret)
                        return ERR_PTR(ret);
@@ -1248,7 +1277,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
 
        mutex_lock(&mvdev->state_mutex);
        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
-                                                   &total_size, 0);
+                                                   &total_size, NULL, 0);
        if (!ret)
                *stop_copy_length = total_size;
        mlx5vf_state_mutex_unlock(mvdev);