git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
vfio/pci: Use RCU for error/request triggers to avoid circular locking
author: Alex Williamson <alex.williamson@nvidia.com>
Mon, 24 Nov 2025 22:36:22 +0000 (15:36 -0700)
committer: Alex Williamson <alex@shazbot.org>
Fri, 28 Nov 2025 17:04:27 +0000 (10:04 -0700)
Thanks to a device generating an ACS violation during bus reset,
lockdep reported the following circular locking issue:

CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock
CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem
CPU2: AER: holds pci_bus_sem, acquires igate

This results in a potential 3-way deadlock.

Remove the pci_bus_sem->igate leg of the triangle by using RCU
to peek at the eventfd rather than locking it with igate.

Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup")
Signed-off-by: Alex Williamson <alex.williamson@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org
Signed-off-by: Alex Williamson <alex@shazbot.org>
drivers/vfio/pci/vfio_pci_core.c
drivers/vfio/pci/vfio_pci_intrs.c
drivers/vfio/pci/vfio_pci_priv.h
include/linux/vfio_pci_core.h

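diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 79a1a50a4ef7c088d3b720e228d2c5b9030946ba..2b01bfbce3eae037b6036b43d5b73ae981caf0f0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c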
@@ -42,6 +42,40 @@ static bool nointxmask;
 static bool disable_vga;
 static bool disable_idle_d3;
 
+static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
+{
+       struct vfio_pci_eventfd *eventfd =
+               container_of(rcu, struct vfio_pci_eventfd, rcu);
+
+       eventfd_ctx_put(eventfd->ctx);
+       kfree(eventfd);
+}
+
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+                                   struct vfio_pci_eventfd __rcu **peventfd,
+                                   struct eventfd_ctx *ctx)
+{
+       struct vfio_pci_eventfd *new = NULL;
+       struct vfio_pci_eventfd *old;
+
+       lockdep_assert_held(&vdev->igate);
+
+       if (ctx) {
+               new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+               if (!new)
+                       return -ENOMEM;
+
+               new->ctx = ctx;
+       }
+
+       old = rcu_replace_pointer(*peventfd, new,
+                                 lockdep_is_held(&vdev->igate));
+       if (old)
+               call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free);
+
+       return 0;
+}
+
 /* List of PF's that vfio_pci_core_sriov_configure() has been called on */
 static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
 static LIST_HEAD(vfio_pci_sriov_pfs);
@@ -697,14 +731,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
        vfio_pci_dma_buf_cleanup(vdev);
 
        mutex_lock(&vdev->igate);
-       if (vdev->err_trigger) {
-               eventfd_ctx_put(vdev->err_trigger);
-               vdev->err_trigger = NULL;
-       }
-       if (vdev->req_trigger) {
-               eventfd_ctx_put(vdev->req_trigger);
-               vdev->req_trigger = NULL;
-       }
+       vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
+       vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
        mutex_unlock(&vdev->igate);
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
@@ -1784,21 +1812,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
        struct vfio_pci_core_device *vdev =
                container_of(core_vdev, struct vfio_pci_core_device, vdev);
        struct pci_dev *pdev = vdev->pdev;
+       struct vfio_pci_eventfd *eventfd;
 
-       mutex_lock(&vdev->igate);
-
-       if (vdev->req_trigger) {
+       rcu_read_lock();
+       eventfd = rcu_dereference(vdev->req_trigger);
+       if (eventfd) {
                if (!(count % 10))
                        pci_notice_ratelimited(pdev,
                                "Relaying device request to user (#%u)\n",
                                count);
-               eventfd_signal(vdev->req_trigger);
+               eventfd_signal(eventfd->ctx);
        } else if (count == 0) {
                pci_warn(pdev,
                        "No device request channel registered, blocked until released by user\n");
        }
-
-       mutex_unlock(&vdev->igate);
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_request);
 
@@ -2216,13 +2244,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
                                                pci_channel_state_t state)
 {
        struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+       struct vfio_pci_eventfd *eventfd;
 
-       mutex_lock(&vdev->igate);
-
-       if (vdev->err_trigger)
-               eventfd_signal(vdev->err_trigger);
-
-       mutex_unlock(&vdev->igate);
+       rcu_read_lock();
+       eventfd = rcu_dereference(vdev->err_trigger);
+       if (eventfd)
+               eventfd_signal(eventfd->ctx);
+       rcu_read_unlock();
 
        return PCI_ERS_RESULT_CAN_RECOVER;
 }
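diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 30d3e921cb0deb221ddca68ee70c4c9755421568..c76e753b3cecd6fb0d48d64026fa9cdbac7f4cef 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c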
@@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
        return 0;
 }
 
-static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
+                                          struct vfio_pci_eventfd __rcu **peventfd,
                                           unsigned int count, uint32_t flags,
                                           void *data)
 {
        /* DATA_NONE/DATA_BOOL enables loopback testing */
        if (flags & VFIO_IRQ_SET_DATA_NONE) {
-               if (*ctx) {
-                       if (count) {
-                               eventfd_signal(*ctx);
-                       } else {
-                               eventfd_ctx_put(*ctx);
-                               *ctx = NULL;
-                       }
+               struct vfio_pci_eventfd *eventfd;
+
+               eventfd = rcu_dereference_protected(*peventfd,
+                                               lockdep_is_held(&vdev->igate));
+
+               if (!eventfd)
+                       return -EINVAL;
+
+               if (count) {
+                       eventfd_signal(eventfd->ctx);
                        return 0;
                }
+
+               return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
        } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
                uint8_t trigger;
 
@@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
                        return -EINVAL;
 
                trigger = *(uint8_t *)data;
-               if (trigger && *ctx)
-                       eventfd_signal(*ctx);
+
+               if (trigger) {
+                       struct vfio_pci_eventfd *eventfd =
+                                       rcu_dereference_protected(*peventfd,
+                                       lockdep_is_held(&vdev->igate));
+
+                       if (eventfd)
+                               eventfd_signal(eventfd->ctx);
+               }
 
                return 0;
        } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
 
                fd = *(int32_t *)data;
                if (fd == -1) {
-                       if (*ctx)
-                               eventfd_ctx_put(*ctx);
-                       *ctx = NULL;
+                       return vfio_pci_eventfd_replace_locked(vdev,
+                                                              peventfd, NULL);
                } else if (fd >= 0) {
                        struct eventfd_ctx *efdctx;
+                       int ret;
 
                        efdctx = eventfd_ctx_fdget(fd);
                        if (IS_ERR(efdctx))
                                return PTR_ERR(efdctx);
 
-                       if (*ctx)
-                               eventfd_ctx_put(*ctx);
+                       ret = vfio_pci_eventfd_replace_locked(vdev,
+                                                             peventfd, efdctx);
+                       if (ret)
+                               eventfd_ctx_put(efdctx);
 
-                       *ctx = efdctx;
+                       return ret;
                }
-               return 0;
        }
 
        return -EINVAL;
@@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
        if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
                return -EINVAL;
 
-       return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+       return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
                                               count, flags, data);
 }
 
@@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
        if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
                return -EINVAL;
 
-       return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+       return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
                                               count, flags, data);
 }
 
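diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 28a405f8b97c9da8af88b63538b26ef35c79d234..6681389518a7832b2e2bdfff561e554bfde8b47b 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h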
@@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd {
 bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
 void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
 
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+                                   struct vfio_pci_eventfd __rcu **peventfd,
+                                   struct eventfd_ctx *ctx);
+
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
                            unsigned index, unsigned start, unsigned count,
                            void *data);
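diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 88fd2fd895d0afcbcee743b8fb71cfce779d43b3..a1eddd55dab888dd9eb6e58e7629644378659cb1 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h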
@@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/vfio.h>
 #include <linux/irqbypass.h>
+#include <linux/rcupdate.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/notifier.h>
@@ -29,6 +30,11 @@ struct vfio_pci_region;
 struct p2pdma_provider;
 struct dma_buf_phys_vec;
 
+struct vfio_pci_eventfd {
+       struct eventfd_ctx      *ctx;
+       struct rcu_head         rcu;
+};
+
 struct vfio_pci_regops {
        ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
                      size_t count, loff_t *ppos, bool iswrite);
@@ -124,8 +130,8 @@ struct vfio_pci_core_device {
        struct pci_saved_state  *pci_saved_state;
        struct pci_saved_state  *pm_save;
        int                     ioeventfds_nr;
-       struct eventfd_ctx      *err_trigger;
-       struct eventfd_ctx      *req_trigger;
+       struct vfio_pci_eventfd __rcu *err_trigger;
+       struct vfio_pci_eventfd __rcu *req_trigger;
        struct eventfd_ctx      *pm_wake_eventfd_ctx;
        struct list_head        dummy_resources_list;
        struct mutex            ioeventfds_lock;