vhost: Reintroduce kthread API and add mode selection

author Cindy Lu <lulu@redhat.com>

Mon, 14 Jul 2025 07:12:32 +0000 (15:12 +0800)

committer Michael S. Tsirkin <mst@redhat.com>

Fri, 1 Aug 2025 13:11:09 +0000 (09:11 -0400)
author Cindy Lu <lulu@redhat.com>
Mon, 14 Jul 2025 07:12:32 +0000 (15:12 +0800)
committer Michael S. Tsirkin <mst@redhat.com>
Fri, 1 Aug 2025 13:11:09 +0000 (09:11 -0400)
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig

index 020d4fbb947ca036f0c1c4b8e6995099e57e7f68..bc0f385744974df677ef5a6af1c8bfec617cbf43 100644 (file)
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
  
           If unsure, say "N".
  
+config VHOST_ENABLE_FORK_OWNER_CONTROL
+       bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
+       default y
+       help
+         This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
+         VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
+         to modify the vhost worker mode for vhost devices.
+
+         It also exposes the module parameter 'fork_from_owner_default',
+         which lets users configure the default mode for vhost workers.
+
+         By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`, so
+         users can change the worker thread mode as needed.
+         If this config is disabled (n), the related IOCTLs and parameters
+         will be unavailable.
+
+         If unsure, say "Y".
+
  endif
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c

index 4390e3a142186f0b5e08e3c46975982ac556cccc..f4c1bc6adeda050639ce2ef983cfcb81b92a4d25 100644 (file)
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,6 +22,7 @@
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  #include <linux/kthread.h>
+#include <linux/cgroup.h>
  #include <linux/module.h>
  #include <linux/sort.h>
  #include <linux/sched/mm.h>
@@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
  module_param(max_iotlb_entries, int, 0444);
  MODULE_PARM_DESC(max_iotlb_entries,
         "Maximum number of iotlb entries. (default: 2048)");
+static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
+
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+module_param(fork_from_owner_default, bool, 0444);
+MODULE_PARM_DESC(fork_from_owner_default,
+                "Set task mode as the default(default: Y)");
+#endif
  
  enum {
         VHOST_MEMORY_F_LOG = 0x1,
@@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
                  * test_and_set_bit() implies a memory barrier.
                  */
                 llist_add(&work->node, &worker->work_list);
-               vhost_task_wake(worker->vtsk);
+               worker->ops->wakeup(worker);
         }
  }
  
@@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
         __vhost_vq_meta_reset(vq);
  }
  
+static int vhost_run_work_kthread_list(void *data)
+{
+       struct vhost_worker *worker = data;
+       struct vhost_work *work, *work_next;
+       struct vhost_dev *dev = worker->dev;
+       struct llist_node *node;
+
+       kthread_use_mm(dev->mm);
+
+       for (;;) {
+               /* mb paired w/ kthread_stop */
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+               node = llist_del_all(&worker->work_list);
+               if (!node)
+                       schedule();
+
+               node = llist_reverse_order(node);
+               /* make sure flag is seen after deletion */
+               smp_wmb();
+               llist_for_each_entry_safe(work, work_next, node, node) {
+                       clear_bit(VHOST_WORK_QUEUED, &work->flags);
+                       __set_current_state(TASK_RUNNING);
+                       kcov_remote_start_common(worker->kcov_handle);
+                       work->fn(work);
+                       kcov_remote_stop();
+                       cond_resched();
+               }
+       }
+       kthread_unuse_mm(dev->mm);
+
+       return 0;
+}
+
  static bool vhost_run_work_list(void *data)
  {
         struct vhost_worker *worker = data;
@@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev,
         dev->byte_weight = byte_weight;
         dev->use_worker = use_worker;
         dev->msg_handler = msg_handler;
+       dev->fork_owner = fork_from_owner_default;
         init_waitqueue_head(&dev->wait);
         INIT_LIST_HEAD(&dev->read_list);
         INIT_LIST_HEAD(&dev->pending_list);
@@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
  }
  EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
  
+struct vhost_attach_cgroups_struct {
+       struct vhost_work work;
+       struct task_struct *owner;
+       int ret;
+};
+
+static void vhost_attach_cgroups_work(struct vhost_work *work)
+{
+       struct vhost_attach_cgroups_struct *s;
+
+       s = container_of(work, struct vhost_attach_cgroups_struct, work);
+       s->ret = cgroup_attach_task_all(s->owner, current);
+}
+
+static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
+{
+       struct vhost_attach_cgroups_struct attach;
+       int saved_cnt;
+
+       attach.owner = current;
+
+       vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+       vhost_worker_queue(worker, &attach.work);
+
+       mutex_lock(&worker->mutex);
+
+       /*
+        * Bypass attachment_cnt check in __vhost_worker_flush:
+        * Temporarily change it to INT_MAX to bypass the check
+        */
+       saved_cnt = worker->attachment_cnt;
+       worker->attachment_cnt = INT_MAX;
+       __vhost_worker_flush(worker);
+       worker->attachment_cnt = saved_cnt;
+
+       mutex_unlock(&worker->mutex);
+
+       return attach.ret;
+}
+
  /* Caller should have device mutex */
  bool vhost_dev_has_owner(struct vhost_dev *dev)
  {
@@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
  
         WARN_ON(!llist_empty(&worker->work_list));
         xa_erase(&dev->worker_xa, worker->id);
-       vhost_task_stop(worker->vtsk);
+       worker->ops->stop(worker);
         kfree(worker);
  }
  
@@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
         xa_destroy(&dev->worker_xa);
  }
  
+static void vhost_task_wakeup(struct vhost_worker *worker)
+{
+       return vhost_task_wake(worker->vtsk);
+}
+
+static void vhost_kthread_wakeup(struct vhost_worker *worker)
+{
+       wake_up_process(worker->kthread_task);
+}
+
+static void vhost_task_do_stop(struct vhost_worker *worker)
+{
+       return vhost_task_stop(worker->vtsk);
+}
+
+static void vhost_kthread_do_stop(struct vhost_worker *worker)
+{
+       kthread_stop(worker->kthread_task);
+}
+
+static int vhost_task_worker_create(struct vhost_worker *worker,
+                                   struct vhost_dev *dev, const char *name)
+{
+       struct vhost_task *vtsk;
+       u32 id;
+       int ret;
+
+       vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+                                worker, name);
+       if (IS_ERR(vtsk))
+               return PTR_ERR(vtsk);
+
+       worker->vtsk = vtsk;
+       vhost_task_start(vtsk);
+       ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+       if (ret < 0) {
+               vhost_task_do_stop(worker);
+               return ret;
+       }
+       worker->id = id;
+       return 0;
+}
+
+static int vhost_kthread_worker_create(struct vhost_worker *worker,
+                                      struct vhost_dev *dev, const char *name)
+{
+       struct task_struct *task;
+       u32 id;
+       int ret;
+
+       task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       worker->kthread_task = task;
+       wake_up_process(task);
+       /* Attach first so a failure cannot leave a stale entry in worker_xa. */
+       ret = vhost_attach_task_to_cgroups(worker);
+       if (ret)
+               goto stop_worker;
+       ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+       if (ret < 0)
+               goto stop_worker;
+
+       worker->id = id;
+       return 0;
+
+stop_worker:
+       vhost_kthread_do_stop(worker);
+       return ret;
+}
+
+static const struct vhost_worker_ops kthread_ops = {
+       .create = vhost_kthread_worker_create,
+       .stop = vhost_kthread_do_stop,
+       .wakeup = vhost_kthread_wakeup,
+};
+
+static const struct vhost_worker_ops vhost_task_ops = {
+       .create = vhost_task_worker_create,
+       .stop = vhost_task_do_stop,
+       .wakeup = vhost_task_wakeup,
+};
+
  static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
  {
         struct vhost_worker *worker;
-       struct vhost_task *vtsk;
         char name[TASK_COMM_LEN];
         int ret;
-       u32 id;
+       const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
+                                                              &kthread_ops;
  
         worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
         if (!worker)
                 return NULL;
  
         worker->dev = dev;
+       worker->ops = ops;
         snprintf(name, sizeof(name), "vhost-%d", current->pid);
  
-       vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
-                                worker, name);
-       if (IS_ERR(vtsk))
-               goto free_worker;
-
         mutex_init(&worker->mutex);
         init_llist_head(&worker->work_list);
         worker->kcov_handle = kcov_common_handle();
-       worker->vtsk = vtsk;
-
-       vhost_task_start(vtsk);
-
-       ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+       ret = ops->create(worker, dev, name);
         if (ret < 0)
-               goto stop_worker;
-       worker->id = id;
+               goto free_worker;
  
         return worker;
  
-stop_worker:
-       vhost_task_stop(vtsk);
  free_worker:
         kfree(worker);
         return NULL;
@@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
         switch (ioctl) {
         /* dev worker ioctls */
         case VHOST_NEW_WORKER:
+               /*
+                * vhost_tasks will account for worker threads under the parent's
+                * NPROC value but kthreads do not. To avoid userspace overflowing
+                * the system with worker threads, fork_owner must be true.
+                */
+               if (!dev->fork_owner)
+                       return -EFAULT;
+
                 ret = vhost_new_worker(dev, &state);
                 if (!ret && copy_to_user(argp, &state, sizeof(state)))
                         ret = -EFAULT;
@@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
  
         vhost_dev_cleanup(dev);
  
+       dev->fork_owner = fork_from_owner_default;
         dev->umem = umem;
         /* We don't need VQ locks below since vhost_dev_cleanup makes sure
          * VQs aren't running.
@@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
                 goto done;
         }
  
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+       if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
+               /* Only allow modification before owner is set */
+               if (vhost_dev_has_owner(d)) {
+                       r = -EBUSY;
+                       goto done;
+               }
+               u8 fork_owner_val;
+
+               if (get_user(fork_owner_val, (u8 __user *)argp)) {
+                       r = -EFAULT;
+                       goto done;
+               }
+               if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+                   fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+                       r = -EINVAL;
+                       goto done;
+               }
+               d->fork_owner = !!fork_owner_val;
+               r = 0;
+               goto done;
+       }
+       if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
+               u8 fork_owner_val = d->fork_owner;
+
+               if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+                   fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+                       r = -EINVAL;
+                       goto done;
+               }
+               if (put_user(fork_owner_val, (u8 __user *)argp)) {
+                       r = -EFAULT;
+                       goto done;
+               }
+               r = 0;
+               goto done;
+       }
+#endif
+
         /* You must be the owner to do anything else */
         r = vhost_dev_check_owner(d);
         if (r)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h

index bb75a292d50cd3f96bc6be019574a6d6379523d6..ab704d84fb3446c47501ba8b3ee7adfa45c584a5 100644 (file)
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -26,7 +26,18 @@ struct vhost_work {
         unsigned long           flags;
  };
  
+struct vhost_worker;
+struct vhost_dev;
+
+struct vhost_worker_ops {
+       int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
+                     const char *name);
+       void (*stop)(struct vhost_worker *worker);
+       void (*wakeup)(struct vhost_worker *worker);
+};
+
  struct vhost_worker {
+       struct task_struct *kthread_task;
         struct vhost_task       *vtsk;
         struct vhost_dev        *dev;
         /* Used to serialize device wide flushing with worker swapping. */
@@ -36,6 +47,7 @@ struct vhost_worker {
         u32                     id;
         int                     attachment_cnt;
         bool                    killed;
+       const struct vhost_worker_ops *ops;
  };
  
  /* Poll a file (eventfd or socket) */
@@ -176,6 +188,16 @@ struct vhost_dev {
         int byte_weight;
         struct xarray worker_xa;
         bool use_worker;
+       /*
+        * If fork_owner is true we use vhost_tasks to create
+        * the worker so all settings/limits like cgroups, NPROC,
+        * scheduler, etc are inherited from the owner. If false,
+        * we use kthreads and only attach to the same cgroups
+        * as the owner for compat with older kernels.
+        * The initial value is taken from fork_from_owner_default,
+        * which itself defaults to true.
+        */
+       bool fork_owner;
         int (*msg_handler)(struct vhost_dev *dev, u32 asid,
                            struct vhost_iotlb_msg *msg);
  };
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h

index d4b3e2ae1314d1fc55f3de081ca147532d25947c..e72f2655459e45db7a61958e245a1cb93f4e6527 100644 (file)
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -235,4 +235,33 @@
   */
  #define VHOST_VDPA_GET_VRING_SIZE      _IOWR(VHOST_VIRTIO, 0x82,       \
                                               struct vhost_vring_state)
+
+/* fork_owner values for vhost */
+#define VHOST_FORK_OWNER_KTHREAD 0
+#define VHOST_FORK_OWNER_TASK 1
+
+/**
+ * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device.
+ * This ioctl must be called before VHOST_SET_OWNER.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @param fork_owner: An 8-bit value that determines the vhost thread mode
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_TASK(default value):
+ *   - Vhost will create vhost workers as tasks forked from the owner,
+ *     inheriting all of the owner's attributes.
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
+ *   - Vhost will create vhost workers as kernel threads.
+ */
+#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8)
+
+/**
+ * VHOST_GET_FORK_FROM_OWNER - Get the current fork_owner flag for the vhost
+ * device. Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @return: An 8-bit value indicating the current thread mode.
+ */
+#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8)
+
  #endif
author	Cindy Lu <lulu@redhat.com>
	Mon, 14 Jul 2025 07:12:32 +0000 (15:12 +0800)
committer	Michael S. Tsirkin <mst@redhat.com>
	Fri, 1 Aug 2025 13:11:09 +0000 (09:11 -0400)
drivers/vhost/Kconfig		patch \| blob \| blame \| history
drivers/vhost/vhost.c		patch \| blob \| blame \| history
drivers/vhost/vhost.h		patch \| blob \| blame \| history
include/uapi/linux/vhost.h		patch \| blob \| blame \| history