From: Sicong Huang Date: Tue, 19 May 2026 11:20:18 +0000 (+0800) Subject: virt: acrn: Fix irqfd use-after-free during eventfd shutdown X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=666c7f9e07925aa0863348f960e09bb89f8f05a3;p=thirdparty%2Fkernel%2Fstable.git virt: acrn: Fix irqfd use-after-free during eventfd shutdown acrn_irqfd_deassign() and the eventfd EPOLLHUP wakeup can race and free the same struct hsm_irqfd: CPU0 CPU1 ---- ---- eventfd_release() wake_up_poll(EPOLLHUP) hsm_irqfd_wakeup() queue_work(&irqfd->shutdown) acrn_irqfd_deassign() hsm_irqfd_shutdown() list_del_init() eventfd_ctx_remove_wait_queue() eventfd_ctx_put() kfree(irqfd) hsm_irqfd_shutdown_work() container_of(work, ..., shutdown) irqfd->vm <-- use-after-free The deassign path freed the irqfd while a shutdown work item was already queued by EPOLLHUP (or vice versa), so the work item could resurrect a dangling pointer through container_of(). Switch to the lifetime model used by KVM irqfds: - Deassign/deinit only deactivate the irqfd: remove it from vm->irqfds under irqfds_lock and queue the cleanup work. - hsm_irqfd_shutdown_work() becomes the sole owner that unhooks the eventfd waitqueue entry, drops the eventfd reference and frees the irqfd. - A new HSM_IRQFD_FLAG_SHUTDOWN bit guarded by test_and_set_bit() ensures the cleanup work is queued at most once, no matter how many of {EPOLLHUP, deassign, deinit} fire concurrently. This is safe to call from the waitqueue callback, which runs with wqh->lock held and IRQs disabled and therefore cannot take irqfds_lock. - acrn_irqfd_deassign() flushes vm->irqfd_wq before returning so the eventfd is fully detached on return. acrn_irqfd_deinit() deactivates every irqfd, flushes the workqueue and only then destroys it, so no path can queue_work() onto a torn-down workqueue. - acrn_irqfd_assign() now installs the eventfd waitqueue entry and publishes the irqfd to vm->irqfds under irqfds_lock, so the irqfd is never visible to deassign/deinit before its waitqueue entry is in place, and any EPOLLHUP that fires in the assign window queues cleanup work that blocks on irqfds_lock until publication is done. Signed-off-by: Sicong Huang Reviewed-by: Fei Li Link: https://patch.msgid.link/20260519112018.2135000-2-congei42@163.com Signed-off-by: Greg Kroah-Hartman --- diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c index acf8cd5f8f8c2..feeba7eda4941 100644 --- a/drivers/virt/acrn/irqfd.c +++ b/drivers/virt/acrn/irqfd.c @@ -16,6 +16,9 @@ #include "acrn_drv.h" +/* Cleanup work has been queued; set via test_and_set_bit(). */ +#define HSM_IRQFD_FLAG_SHUTDOWN 0 + /** * struct hsm_irqfd - Properties of HSM irqfd * @vm: Associated VM pointer @@ -25,6 +28,7 @@ * @list: Entry within &acrn_vm.irqfds of irqfds of a VM * @pt: Structure for select/poll on the associated eventfd * @msi: MSI data + * @flags: Internal lifecycle flags (HSM_IRQFD_FLAG_*) */ struct hsm_irqfd { struct acrn_vm *vm; @@ -34,6 +38,7 @@ struct hsm_irqfd { struct list_head list; poll_table pt; struct acrn_msi_entry msi; + unsigned long flags; }; static void acrn_irqfd_inject(struct hsm_irqfd *irqfd) @@ -44,30 +49,29 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd) irqfd->msi.msi_data); } -static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd) +/* Queue the cleanup work at most once. Safe from atomic context. */ +static void hsm_irqfd_queue_shutdown(struct hsm_irqfd *irqfd) { - u64 cnt; - - lockdep_assert_held(&irqfd->vm->irqfds_lock); - - /* remove from wait queue */ - list_del_init(&irqfd->list); - eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); - eventfd_ctx_put(irqfd->eventfd); - kfree(irqfd); + if (!test_and_set_bit(HSM_IRQFD_FLAG_SHUTDOWN, &irqfd->flags)) + queue_work(irqfd->vm->irqfd_wq, &irqfd->shutdown); } +/* Sole owner of @irqfd: unhook waitqueue, drop eventfd ref, free. */ static void hsm_irqfd_shutdown_work(struct work_struct *work) { - struct hsm_irqfd *irqfd; - struct acrn_vm *vm; + struct hsm_irqfd *irqfd = container_of(work, struct hsm_irqfd, + shutdown); + struct acrn_vm *vm = irqfd->vm; + u64 cnt; - irqfd = container_of(work, struct hsm_irqfd, shutdown); - vm = irqfd->vm; mutex_lock(&vm->irqfds_lock); if (!list_empty(&irqfd->list)) - hsm_irqfd_shutdown(irqfd); + list_del_init(&irqfd->list); mutex_unlock(&vm->irqfds_lock); + + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); } /* Called with wqh->lock held and interrupts disabled */ @@ -76,17 +80,16 @@ static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, { unsigned long poll_bits = (unsigned long)key; struct hsm_irqfd *irqfd; - struct acrn_vm *vm; irqfd = container_of(wait, struct hsm_irqfd, wait); - vm = irqfd->vm; + if (poll_bits & POLLIN) /* An event has been signaled, inject an interrupt */ acrn_irqfd_inject(irqfd); if (poll_bits & POLLHUP) - /* Do shutdown work in thread to hold wqh->lock */ - queue_work(vm->irqfd_wq, &irqfd->shutdown); + /* Defer teardown to the cleanup work; can't sleep here. */ + hsm_irqfd_queue_shutdown(irqfd); return 0; } @@ -142,6 +145,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args) init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup); init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func); + /* + * Hold irqfds_lock across waitqueue install and list_add so the + * irqfd is not visible to deassign/deinit before its waitqueue + * entry is in place, and any racing EPOLLHUP cleanup work blocks + * on irqfds_lock until publication completes. + */ mutex_lock(&vm->irqfds_lock); list_for_each_entry(tmp, &vm->irqfds, list) { if (irqfd->eventfd != tmp->eventfd) @@ -150,14 +159,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args) mutex_unlock(&vm->irqfds_lock); goto fail; } - list_add_tail(&irqfd->list, &vm->irqfds); - mutex_unlock(&vm->irqfds_lock); - /* Check the pending event in this stage */ events = vfs_poll(fd_file(f), &irqfd->pt); - + list_add_tail(&irqfd->list, &vm->irqfds); if (events & EPOLLIN) acrn_irqfd_inject(irqfd); + mutex_unlock(&vm->irqfds_lock); return 0; fail: @@ -180,13 +187,17 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm, mutex_lock(&vm->irqfds_lock); list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) { if (irqfd->eventfd == eventfd) { - hsm_irqfd_shutdown(irqfd); + list_del_init(&irqfd->list); + hsm_irqfd_queue_shutdown(irqfd); break; } } mutex_unlock(&vm->irqfds_lock); eventfd_ctx_put(eventfd); + /* Wait for cleanup work to finish so the eventfd is fully detached. */ + flush_workqueue(vm->irqfd_wq); + return 0; } @@ -219,9 +230,15 @@ void acrn_irqfd_deinit(struct acrn_vm *vm) struct hsm_irqfd *irqfd, *next; dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid); - destroy_workqueue(vm->irqfd_wq); + mutex_lock(&vm->irqfds_lock); - list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) - hsm_irqfd_shutdown(irqfd); + list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) { + list_del_init(&irqfd->list); + hsm_irqfd_queue_shutdown(irqfd); + } mutex_unlock(&vm->irqfds_lock); + + /* Drain all cleanup work before tearing the workqueue down. */ + flush_workqueue(vm->irqfd_wq); + destroy_workqueue(vm->irqfd_wq); }