]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
KVM: x86: switch hugepage recovery thread to vhost_task
authorPaolo Bonzini <pbonzini@redhat.com>
Fri, 8 Nov 2024 09:56:31 +0000 (04:56 -0500)
committerPaolo Bonzini <pbonzini@redhat.com>
Thu, 14 Nov 2024 18:20:04 +0000 (13:20 -0500)
kvm_vm_create_worker_thread() is meant to be used for kthreads that
can consume significant amounts of CPU time on behalf of a VM or in
response to how the VM behaves (for example how it accesses its memory).
Therefore it wants to charge the CPU time consumed by that work to
the VM's container.

However, because of these threads, cgroups which have kvm instances
inside never complete freezing.  This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 -nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path but
kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never
calls into the signal delivery path and thus never gets frozen. Because
the cgroup freezer determines whether a given cgroup is frozen by
comparing the number of frozen threads to the total number of threads
in the cgroup, the cgroup never becomes frozen and users waiting for
the state transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if
it behaves similarly to user tasks as much as possible, including
being able to send SIGSTOP and SIGCONT.  In fact, vhost_task is all
that kvm_vm_create_worker_thread() wanted to be and more: not only it
inherits the userspace process's cgroups, it has other niceties like
being parented properly in the process tree.  Use it instead of the
homegrown alternative.

Incidentally, the new code is also better behaved when you flip recovery
back and forth to disabled and back to enabled.  If your recovery period
is 1 minute, it will run the next recovery after 1 minute independent
of how many times you flipped the parameter.

(Commit message based on emails from Tejun).

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Luca Boccassi <bluca@debian.org>
Acked-by: Tejun Heo <tj@kernel.org>
Tested-by: Luca Boccassi <bluca@debian.org>
Cc: stable@vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/Kconfig
arch/x86/kvm/mmu/mmu.c
include/linux/kvm_host.h
virt/kvm/kvm_main.c

index 3e8afc82ae2fb1273f1193a6eb39a0e74bc3b248..e159e44a6a1b61dba500b5f4709031c93828794d 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>
 
 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1442,7 +1443,8 @@ struct kvm_arch {
        bool sgx_provisioning_allowed;
 
        struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-       struct task_struct *nx_huge_page_recovery_thread;
+       struct vhost_task *nx_huge_page_recovery_thread;
+       u64 nx_huge_page_last;
 
 #ifdef CONFIG_X86_64
        /* The number of TDP MMU pages across all roots. */
index 1ed1e4f5d51cdd7cb339929776511571d14df1fb..d93af539034188cbfce94844ed40adc456839858 100644 (file)
@@ -30,6 +30,7 @@ config KVM_X86
        select HAVE_KVM_IRQ_BYPASS
        select HAVE_KVM_IRQ_ROUTING
        select HAVE_KVM_READONLY_MEM
+       select VHOST_TASK
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
index d7b391fe2c239fafeaed6553f6356d02cbfa1cb0..22e7ad235123136faf054138d4df3df40de844da 100644 (file)
@@ -7162,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
                        kvm_mmu_zap_all_fast(kvm);
                        mutex_unlock(&kvm->slots_lock);
 
-                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                       vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
                }
                mutex_unlock(&kvm_lock);
        }
@@ -7291,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
                mutex_lock(&kvm_lock);
 
                list_for_each_entry(kvm, &vm_list, vm_list)
-                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                       vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 
                mutex_unlock(&kvm_lock);
        }
@@ -7394,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-       bool enabled;
-       uint period;
-
-       enabled = calc_nx_huge_pages_recovery_period(&period);
-
-       return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-                      : MAX_SCHEDULE_TIMEOUT;
 }
 
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
-       u64 start_time;
+       struct kvm *kvm = data;
+       bool enabled;
+       uint period;
        long remaining_time;
 
-       while (true) {
-               start_time = get_jiffies_64();
-               remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-               set_current_state(TASK_INTERRUPTIBLE);
-               while (!kthread_should_stop() && remaining_time > 0) {
-                       schedule_timeout(remaining_time);
-                       remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-                       set_current_state(TASK_INTERRUPTIBLE);
-               }
-
-               set_current_state(TASK_RUNNING);
-
-               if (kthread_should_stop())
-                       return 0;
+       enabled = calc_nx_huge_pages_recovery_period(&period);
+       if (!enabled)
+               return false;
 
-               kvm_recover_nx_huge_pages(kvm);
+       remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+               - get_jiffies_64();
+       if (remaining_time > 0) {
+               schedule_timeout(remaining_time);
+               /* check for signals and come back */
+               return true;
        }
+
+       __set_current_state(TASK_RUNNING);
+       kvm_recover_nx_huge_pages(kvm);
+       kvm->arch.nx_huge_page_last = get_jiffies_64();
+       return true;
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-       int err;
-
        if (nx_hugepage_mitigation_hard_disabled)
                return 0;
 
-       err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-                                         "kvm-nx-lpage-recovery",
-                                         &kvm->arch.nx_huge_page_recovery_thread);
-       if (!err)
-               kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+       kvm->arch.nx_huge_page_last = get_jiffies_64();
+       kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+               kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+               kvm, "kvm-nx-lpage-recovery");
 
-       return err;
+       if (!kvm->arch.nx_huge_page_recovery_thread)
+               return -ENOMEM;
+
+       vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+       return 0;
 }
 
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
        if (kvm->arch.nx_huge_page_recovery_thread)
-               kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+               vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
index 03e4d26e3bcc1b5dfde2b43940d3d5c6697ae70f..401439bb21e3e6c1ef0e6824cca4be61487faca1 100644 (file)
@@ -2425,12 +2425,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
 
-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                               uintptr_t data, const char *name,
-                               struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {
index 27186b06518a493c05a3e45172c551ec74451149..de2c11dae23163c057c625e8eb3f593978f0548f 100644 (file)
@@ -6426,106 +6426,3 @@ void kvm_exit(void)
        kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-       struct kvm *kvm;
-       struct task_struct *parent;
-       struct completion init_done;
-       kvm_vm_thread_fn_t thread_fn;
-       uintptr_t data;
-       int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-       /*
-        * The init_context is allocated on the stack of the parent thread, so
-        * we have to locally copy anything that is needed beyond initialization
-        */
-       struct kvm_vm_worker_thread_context *init_context = context;
-       struct task_struct *parent;
-       struct kvm *kvm = init_context->kvm;
-       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-       uintptr_t data = init_context->data;
-       int err;
-
-       err = kthread_park(current);
-       /* kthread_park(current) is never supposed to return an error */
-       WARN_ON(err != 0);
-       if (err)
-               goto init_complete;
-
-       err = cgroup_attach_task_all(init_context->parent, current);
-       if (err) {
-               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-                       __func__, err);
-               goto init_complete;
-       }
-
-       set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-       init_context->err = err;
-       complete(&init_context->init_done);
-       init_context = NULL;
-
-       if (err)
-               goto out;
-
-       /* Wait to be woken up by the spawner before proceeding. */
-       kthread_parkme();
-
-       if (!kthread_should_stop())
-               err = thread_fn(kvm, data);
-
-out:
-       /*
-        * Move kthread back to its original cgroup to prevent it lingering in
-        * the cgroup of the VM process, after the latter finishes its
-        * execution.
-        *
-        * kthread_stop() waits on the 'exited' completion condition which is
-        * set in exit_mm(), via mm_release(), in do_exit(). However, the
-        * kthread is removed from the cgroup in the cgroup_exit() which is
-        * called after the exit_mm(). This causes the kthread_stop() to return
-        * before the kthread actually quits the cgroup.
-        */
-       rcu_read_lock();
-       parent = rcu_dereference(current->real_parent);
-       get_task_struct(parent);
-       rcu_read_unlock();
-       cgroup_attach_task_all(parent, current);
-       put_task_struct(parent);
-
-       return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                               uintptr_t data, const char *name,
-                               struct task_struct **thread_ptr)
-{
-       struct kvm_vm_worker_thread_context init_context = {};
-       struct task_struct *thread;
-
-       *thread_ptr = NULL;
-       init_context.kvm = kvm;
-       init_context.parent = current;
-       init_context.thread_fn = thread_fn;
-       init_context.data = data;
-       init_completion(&init_context.init_done);
-
-       thread = kthread_run(kvm_vm_worker_thread, &init_context,
-                            "%s-%d", name, task_pid_nr(current));
-       if (IS_ERR(thread))
-               return PTR_ERR(thread);
-
-       /* kthread_run is never supposed to return NULL */
-       WARN_ON(thread == NULL);
-
-       wait_for_completion(&init_context.init_done);
-
-       if (!init_context.err)
-               *thread_ptr = thread;
-
-       return init_context.err;
-}