From 5f8ca05ea99183ab2b69c7fd9617961211d194e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:14 -0700 Subject: [PATCH] KVM: Add irqfd to KVM's list via the vfs_poll() callback Add the irqfd structure to KVM's list of irqfds in kvm_irqfd_register(), i.e. via the vfs_poll() callback. This will allow taking irqfds.lock across the entire registration sequence (add to waitqueue, add to list), and more importantly will allow inserting into KVM's list if and only if adding to the waitqueue succeeds (spoiler alert), without needing to juggle return codes in weird ways. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 98 ++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7b7c5269cf18..170a67aad983 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,9 +245,32 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } +static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + int n_entries; + + lockdep_assert_held(&kvm->irqfds.lock); + + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); + + write_seqcount_begin(&irqfd->irq_entry_sc); + + e = entries; + if (n_entries == 1) + irqfd->irq_entry = *e; + else + irqfd->irq_entry.type = 0; + + write_seqcount_end(&irqfd->irq_entry_sc); +} + struct kvm_irqfd_pt { struct kvm_kernel_irqfd *irqfd; + struct kvm *kvm; poll_table pt; + int ret; }; static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, @@ -255,6 +278,25 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, { struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct 
kvm_kernel_irqfd *irqfd = p->irqfd; + struct kvm_kernel_irqfd *tmp; + struct kvm *kvm = p->kvm; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + p->ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + return; + } + + irqfd_update(kvm, irqfd); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + + spin_unlock_irq(&kvm->irqfds.lock); /* * Add the irqfd as a priority waiter on the eventfd, with a custom @@ -264,26 +306,7 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); add_wait_queue_priority(wqh, &irqfd->wait); -} - -/* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - int n_entries; - - n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); - - write_seqcount_begin(&irqfd->irq_entry_sc); - - e = entries; - if (n_entries == 1) - irqfd->irq_entry = *e; - else - irqfd->irq_entry.type = 0; - - write_seqcount_end(&irqfd->irq_entry_sc); + p->ret = 0; } #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) @@ -308,7 +331,7 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_kernel_irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; struct kvm_irqfd_pt irqfd_pt; int ret; @@ -407,32 +430,22 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ idx = srcu_read_lock(&kvm->irq_srcu); - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. 
*/ - ret = -EBUSY; - goto fail_duplicate; - } - - irqfd_update(kvm, irqfd); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); - /* - * Register the irqfd with the eventfd by polling on the eventfd. If - * there was en event pending on the eventfd prior to registering, - * manually trigger IRQ injection. + * Register the irqfd with the eventfd by polling on the eventfd, and + * simultaneously add the irqfd to KVM's list. If there was an event + * pending on the eventfd prior to registering, manually trigger IRQ + * injection. */ irqfd_pt.irqfd = irqfd; + irqfd_pt.kvm = kvm; init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); events = vfs_poll(fd_file(f), &irqfd_pt.pt); + + ret = irqfd_pt.ret; + if (ret) + goto fail_poll; + if (events & EPOLLIN) schedule_work(&irqfd->inject); @@ -452,8 +465,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) srcu_read_unlock(&kvm->irq_srcu, idx); return 0; -fail_duplicate: - spin_unlock_irq(&kvm->irqfds.lock); +fail_poll: srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) -- 2.47.2