xfs-reset-xfs_attr_incomplete-filter-on-node-removal.patch
xfs-remove-conditional-building-of-rt-geometry-validator-functions.patch
btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch
+vfio-introduce-interface-to-flush-virqfd-inject-workqueue.patch
+vfio-pci-create-persistent-intx-handler.patch
+vfio-platform-create-persistent-irq-handlers.patch
+vfio-fsl-mc-block-calling-interrupt-handler-without-trigger.patch
+x86-kconfig-remove-config_amd_mem_encrypt_active_by_default.patch
+x86-sev-fix-position-dependent-variable-references-in-startup-code.patch
--- /dev/null
+From 7447d911af699a15f8d050dfcb7c680a86f87012 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Fri, 8 Mar 2024 16:05:28 -0700
+Subject: vfio/fsl-mc: Block calling interrupt handler without trigger
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit 7447d911af699a15f8d050dfcb7c680a86f87012 upstream.
+
+The eventfd_ctx trigger pointer of the vfio_fsl_mc_irq object is
+initially NULL and may become NULL if the user sets the trigger
+eventfd to -1. The interrupt handler itself is guaranteed a valid
+trigger between request_irq() and free_irq(), but the loopback
+testing mechanisms that invoke the handler function need to test
+the trigger. The triggering and setting ioctl paths
+both make use of igate and are therefore mutually exclusive.
+
+The vfio-fsl-mc driver does not make use of irqfds, nor does it
+support any sort of masking operations, therefore unlike vfio-pci
+and vfio-platform, the flow can remain essentially unchanged.
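+
+[ Ed. note: in sketch form, both loopback paths below reduce to the
+  same guarded signal; igate is already held on these paths, so
+  irq->trigger cannot change underneath the test:
+
+	if (irq->trigger)
+		eventfd_signal(irq->trigger, 1);
+]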
+
+Cc: Diana Craciun <diana.craciun@oss.nxp.com>
+Cc: <stable@vger.kernel.org>
+Fixes: cc0ee20bd969 ("vfio/fsl-mc: trigger an interrupt via eventfd")
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Link: https://lore.kernel.org/r/20240308230557.805580-8-alex.williamson@redhat.com
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
++++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+@@ -141,13 +141,14 @@ static int vfio_fsl_mc_set_irq_trigger(s
+ irq = &vdev->mc_irqs[index];
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+- vfio_fsl_mc_irq_handler(hwirq, irq);
++ if (irq->trigger)
++			eventfd_signal(irq->trigger, 1);
+
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ u8 trigger = *(u8 *)data;
+
+- if (trigger)
+- vfio_fsl_mc_irq_handler(hwirq, irq);
++ if (trigger && irq->trigger)
++			eventfd_signal(irq->trigger, 1);
+ }
+
+ return 0;
--- /dev/null
+From b620ecbd17a03cacd06f014a5d3f3a11285ce053 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Fri, 8 Mar 2024 16:05:24 -0700
+Subject: vfio: Introduce interface to flush virqfd inject workqueue
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit b620ecbd17a03cacd06f014a5d3f3a11285ce053 upstream.
+
+In order to synchronize changes that can affect the thread callback,
+introduce an interface to force a flush of the inject workqueue. The
+irqfd pointer is only valid under spinlock, but the workqueue cannot
+be flushed under spinlock. Therefore the flush work for the irqfd is
+queued under spinlock. The vfio_irqfd_cleanup_wq workqueue is re-used
+for queuing this work such that flushing the workqueue is also ordered
+relative to shutdown.
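+
+[ Ed. note: a condensed sketch of the new interface (mirroring the
+  hunk below): the flush work is queued while holding the spinlock
+  that keeps the irqfd pointer valid, and the subsequent unlocked
+  flush_workqueue() waits for that work, and therefore for any
+  in-flight inject work it flushes:
+
+	spin_lock_irqsave(&virqfd_lock, flags);
+	if (*pvirqfd && (*pvirqfd)->thread)
+		queue_work(vfio_irqfd_cleanup_wq, &(*pvirqfd)->flush_inject);
+	spin_unlock_irqrestore(&virqfd_lock, flags);
+
+	flush_workqueue(vfio_irqfd_cleanup_wq);	/* no lock held here */
+]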
+
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Link: https://lore.kernel.org/r/20240308230557.805580-4-alex.williamson@redhat.com
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Stable-dep-of: 18c198c96a81 ("vfio/pci: Create persistent INTx handler")
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vfio/virqfd.c | 21 +++++++++++++++++++++
+ include/linux/vfio.h | 2 ++
+ 2 files changed, 23 insertions(+)
+
+--- a/drivers/vfio/virqfd.c
++++ b/drivers/vfio/virqfd.c
+@@ -101,6 +101,13 @@ static void virqfd_inject(struct work_st
+ virqfd->thread(virqfd->opaque, virqfd->data);
+ }
+
++static void virqfd_flush_inject(struct work_struct *work)
++{
++ struct virqfd *virqfd = container_of(work, struct virqfd, flush_inject);
++
++ flush_work(&virqfd->inject);
++}
++
+ int vfio_virqfd_enable(void *opaque,
+ int (*handler)(void *, void *),
+ void (*thread)(void *, void *),
+@@ -124,6 +131,7 @@ int vfio_virqfd_enable(void *opaque,
+
+ INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
+ INIT_WORK(&virqfd->inject, virqfd_inject);
++ INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject);
+
+ irqfd = fdget(fd);
+ if (!irqfd.file) {
+@@ -213,3 +221,16 @@ void vfio_virqfd_disable(struct virqfd *
+ flush_workqueue(vfio_irqfd_cleanup_wq);
+ }
+ EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
++
++void vfio_virqfd_flush_thread(struct virqfd **pvirqfd)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&virqfd_lock, flags);
++ if (*pvirqfd && (*pvirqfd)->thread)
++ queue_work(vfio_irqfd_cleanup_wq, &(*pvirqfd)->flush_inject);
++ spin_unlock_irqrestore(&virqfd_lock, flags);
++
++ flush_workqueue(vfio_irqfd_cleanup_wq);
++}
++EXPORT_SYMBOL_GPL(vfio_virqfd_flush_thread);
+--- a/include/linux/vfio.h
++++ b/include/linux/vfio.h
+@@ -349,6 +349,7 @@ struct virqfd {
+ wait_queue_entry_t wait;
+ poll_table pt;
+ struct work_struct shutdown;
++ struct work_struct flush_inject;
+ struct virqfd **pvirqfd;
+ };
+
+@@ -356,5 +357,6 @@ int vfio_virqfd_enable(void *opaque, int
+ void (*thread)(void *, void *), void *data,
+ struct virqfd **pvirqfd, int fd);
+ void vfio_virqfd_disable(struct virqfd **pvirqfd);
++void vfio_virqfd_flush_thread(struct virqfd **pvirqfd);
+
+ #endif /* VFIO_H */
--- /dev/null
+From 18c198c96a815c962adc2b9b77909eec0be7df4d Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Fri, 8 Mar 2024 16:05:25 -0700
+Subject: vfio/pci: Create persistent INTx handler
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit 18c198c96a815c962adc2b9b77909eec0be7df4d upstream.
+
+A vulnerability exists where the eventfd for INTx signaling can be
+deconfigured, which unregisters the IRQ handler but still allows
+eventfds to be signaled with a NULL context through the SET_IRQS ioctl
+or through unmask irqfd if the device interrupt is pending.
+
+Ideally this could be solved with some additional locking; the igate
+mutex serializes the ioctl and config space accesses, and the interrupt
+handler is unregistered relative to the trigger, but the irqfd path
+runs asynchronous to those. The igate mutex cannot be acquired from the
+atomic context of the eventfd wake function. Disabling the irqfd
+relative to the eventfd registration is potentially incompatible with
+existing userspace.
+
+As a result, the solution implemented here moves configuration of the
+INTx interrupt handler to track the lifetime of the INTx context object
+and irq_type configuration, rather than registration of a particular
+trigger eventfd. Synchronization is added between the ioctl path and
+eventfd_signal() wrapper such that the eventfd trigger can be
+dynamically updated relative to in-flight interrupts or irqfd callbacks.
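+
+[ Ed. note: the publish/consume pairing, in condensed sketch form
+  (names as in the hunks below):
+
+	/* signaling side: IRQ handler or irqfd callback, no igate */
+	trigger = READ_ONCE(ctx->trigger);
+	if (likely(trigger))
+		eventfd_signal(trigger, 1);
+
+	/* update side: under igate; swap, then drain in-flight users */
+	old = ctx->trigger;
+	WRITE_ONCE(ctx->trigger, trigger);
+	if (old) {
+		synchronize_irq(pdev->irq);
+		vfio_virqfd_flush_thread(&ctx->unmask);
+		eventfd_ctx_put(old);
+	}
+]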
+
+Cc: <stable@vger.kernel.org>
+Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
+Reported-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Link: https://lore.kernel.org/r/20240308230557.805580-5-alex.williamson@redhat.com
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vfio/pci/vfio_pci_intrs.c | 145 ++++++++++++++++++++------------------
+ 1 file changed, 78 insertions(+), 67 deletions(-)
+
+--- a/drivers/vfio/pci/vfio_pci_intrs.c
++++ b/drivers/vfio/pci/vfio_pci_intrs.c
+@@ -90,11 +90,15 @@ static void vfio_send_intx_eventfd(void
+
+ if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
+ struct vfio_pci_irq_ctx *ctx;
++ struct eventfd_ctx *trigger;
+
+ ctx = vfio_irq_ctx_get(vdev, 0);
+ if (WARN_ON_ONCE(!ctx))
+ return;
+- eventfd_signal(ctx->trigger, 1);
++
++ trigger = READ_ONCE(ctx->trigger);
++ if (likely(trigger))
++ eventfd_signal(trigger, 1);
+ }
+ }
+
+@@ -253,100 +257,100 @@ static irqreturn_t vfio_intx_handler(int
+ return ret;
+ }
+
+-static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
++static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
++ struct eventfd_ctx *trigger)
+ {
++ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
++ unsigned long irqflags;
++ char *name;
++ int ret;
+
+ if (!is_irq_none(vdev))
+ return -EINVAL;
+
+- if (!vdev->pdev->irq)
++ if (!pdev->irq)
+ return -ENODEV;
+
++ name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
++ if (!name)
++ return -ENOMEM;
++
+ ctx = vfio_irq_ctx_alloc(vdev, 0);
+ if (!ctx)
+ return -ENOMEM;
+
++ ctx->name = name;
++ ctx->trigger = trigger;
++
+ /*
+- * If the virtual interrupt is masked, restore it. Devices
+- * supporting DisINTx can be masked at the hardware level
+- * here, non-PCI-2.3 devices will have to wait until the
+- * interrupt is enabled.
++ * Fill the initial masked state based on virq_disabled. After
++ * enable, changing the DisINTx bit in vconfig directly changes INTx
++ * masking. igate prevents races during setup, once running masked
++ * is protected via irqlock.
++ *
++ * Devices supporting DisINTx also reflect the current mask state in
++ * the physical DisINTx bit, which is not affected during IRQ setup.
++ *
++ * Devices without DisINTx support require an exclusive interrupt.
++ * IRQ masking is performed at the IRQ chip. Again, igate protects
++ * against races during setup and IRQ handlers and irqfds are not
++ * yet active, therefore masked is stable and can be used to
++ * conditionally auto-enable the IRQ.
++ *
++ * irq_type must be stable while the IRQ handler is registered,
++ * therefore it must be set before request_irq().
+ */
+ ctx->masked = vdev->virq_disabled;
+- if (vdev->pci_2_3)
+- pci_intx(vdev->pdev, !ctx->masked);
++ if (vdev->pci_2_3) {
++ pci_intx(pdev, !ctx->masked);
++ irqflags = IRQF_SHARED;
++ } else {
++ irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
++ }
+
+ vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+
++ ret = request_irq(pdev->irq, vfio_intx_handler,
++ irqflags, ctx->name, vdev);
++ if (ret) {
++ vdev->irq_type = VFIO_PCI_NUM_IRQS;
++ kfree(name);
++ vfio_irq_ctx_free(vdev, ctx, 0);
++ return ret;
++ }
++
+ return 0;
+ }
+
+-static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
++static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
++ struct eventfd_ctx *trigger)
+ {
+ struct pci_dev *pdev = vdev->pdev;
+- unsigned long irqflags = IRQF_SHARED;
+ struct vfio_pci_irq_ctx *ctx;
+- struct eventfd_ctx *trigger;
+- unsigned long flags;
+- int ret;
++ struct eventfd_ctx *old;
+
+ ctx = vfio_irq_ctx_get(vdev, 0);
+ if (WARN_ON_ONCE(!ctx))
+ return -EINVAL;
+
+- if (ctx->trigger) {
+- free_irq(pdev->irq, vdev);
+- kfree(ctx->name);
+- eventfd_ctx_put(ctx->trigger);
+- ctx->trigger = NULL;
+- }
+-
+- if (fd < 0) /* Disable only */
+- return 0;
++ old = ctx->trigger;
+
+- ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
+- pci_name(pdev));
+- if (!ctx->name)
+- return -ENOMEM;
++ WRITE_ONCE(ctx->trigger, trigger);
+
+- trigger = eventfd_ctx_fdget(fd);
+- if (IS_ERR(trigger)) {
+- kfree(ctx->name);
+- return PTR_ERR(trigger);
++ /* Releasing an old ctx requires synchronizing in-flight users */
++ if (old) {
++ synchronize_irq(pdev->irq);
++ vfio_virqfd_flush_thread(&ctx->unmask);
++ eventfd_ctx_put(old);
+ }
+
+- ctx->trigger = trigger;
+-
+- /*
+- * Devices without DisINTx support require an exclusive interrupt,
+- * IRQ masking is performed at the IRQ chip. The masked status is
+- * protected by vdev->irqlock. Setup the IRQ without auto-enable and
+- * unmask as necessary below under lock. DisINTx is unmodified by
+- * the IRQ configuration and may therefore use auto-enable.
+- */
+- if (!vdev->pci_2_3)
+- irqflags = IRQF_NO_AUTOEN;
+-
+- ret = request_irq(pdev->irq, vfio_intx_handler,
+- irqflags, ctx->name, vdev);
+- if (ret) {
+- ctx->trigger = NULL;
+- kfree(ctx->name);
+- eventfd_ctx_put(trigger);
+- return ret;
+- }
+-
+- spin_lock_irqsave(&vdev->irqlock, flags);
+- if (!vdev->pci_2_3 && !ctx->masked)
+- enable_irq(pdev->irq);
+- spin_unlock_irqrestore(&vdev->irqlock, flags);
+-
+ return 0;
+ }
+
+ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
+ {
++ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
+
+ ctx = vfio_irq_ctx_get(vdev, 0);
+@@ -354,10 +358,13 @@ static void vfio_intx_disable(struct vfi
+ if (ctx) {
+ vfio_virqfd_disable(&ctx->unmask);
+ vfio_virqfd_disable(&ctx->mask);
++ free_irq(pdev->irq, vdev);
++ if (ctx->trigger)
++ eventfd_ctx_put(ctx->trigger);
++ kfree(ctx->name);
++ vfio_irq_ctx_free(vdev, ctx, 0);
+ }
+- vfio_intx_set_signal(vdev, -1);
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+- vfio_irq_ctx_free(vdev, ctx, 0);
+ }
+
+ /*
+@@ -641,19 +648,23 @@ static int vfio_pci_set_intx_trigger(str
+ return -EINVAL;
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
++ struct eventfd_ctx *trigger = NULL;
+ int32_t fd = *(int32_t *)data;
+ int ret;
+
++ if (fd >= 0) {
++ trigger = eventfd_ctx_fdget(fd);
++ if (IS_ERR(trigger))
++ return PTR_ERR(trigger);
++ }
++
+ if (is_intx(vdev))
+- return vfio_intx_set_signal(vdev, fd);
++ ret = vfio_intx_set_signal(vdev, trigger);
++ else
++ ret = vfio_intx_enable(vdev, trigger);
+
+- ret = vfio_intx_enable(vdev);
+- if (ret)
+- return ret;
+-
+- ret = vfio_intx_set_signal(vdev, fd);
+- if (ret)
+- vfio_intx_disable(vdev);
++ if (ret && trigger)
++ eventfd_ctx_put(trigger);
+
+ return ret;
+ }
--- /dev/null
+From 675daf435e9f8e5a5eab140a9864dfad6668b375 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Fri, 8 Mar 2024 16:05:27 -0700
+Subject: vfio/platform: Create persistent IRQ handlers
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit 675daf435e9f8e5a5eab140a9864dfad6668b375 upstream.
+
+The vfio-platform SET_IRQS ioctl currently allows loopback triggering of
+an interrupt before a signaling eventfd has been configured by the user,
+which thereby allows a NULL pointer dereference.
+
+Rather than register the IRQ relative to a valid trigger, register all
+IRQs in a disabled state in the device open path. This allows mask
+operations on the IRQ to nest within the overall enable state governed
+by a valid eventfd signal. This decouples @masked, protected by the
+@locked spinlock, from @trigger, protected via the @igate mutex.
+
+In doing so, it's guaranteed that changes to @trigger cannot race the
+IRQ handlers because the IRQ handler is synchronously disabled before
+modifying the trigger, and loopback triggering of the IRQ via ioctl is
+safe due to serialization with trigger changes via igate.
+
+For compatibility, request_irq() failures are maintained to be local to
+the SET_IRQS ioctl rather than a fatal error in the open device path.
+This allows, for example, a userspace driver with polling mode support
+to continue to work regardless of moving the request_irq() call site.
+This necessarily blocks all SET_IRQS access to the failed index.
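+
+[ Ed. note: the resulting handler lifecycle, heavily condensed from
+  the hunks below (error handling and the disable-only case omitted):
+
+	/* open path: register every handler, but leave the IRQ off */
+	ret = request_irq(hwirq, handler, IRQF_NO_AUTOEN, name, irq);
+
+	/* SET_IRQS path: the handler never runs across a trigger swap */
+	disable_irq(irq->hwirq);
+	if (irq->trigger)
+		eventfd_ctx_put(irq->trigger);
+	irq->trigger = eventfd_ctx_fdget(fd);
+	enable_irq(irq->hwirq);
+]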
+
+Cc: Eric Auger <eric.auger@redhat.com>
+Cc: <stable@vger.kernel.org>
+Fixes: 57f972e2b341 ("vfio/platform: trigger an interrupt via eventfd")
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Link: https://lore.kernel.org/r/20240308230557.805580-7-alex.williamson@redhat.com
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vfio/platform/vfio_platform_irq.c | 100 ++++++++++++++++++++----------
+ 1 file changed, 68 insertions(+), 32 deletions(-)
+
+--- a/drivers/vfio/platform/vfio_platform_irq.c
++++ b/drivers/vfio/platform/vfio_platform_irq.c
+@@ -136,6 +136,16 @@ static int vfio_platform_set_irq_unmask(
+ return 0;
+ }
+
++/*
++ * The trigger eventfd is guaranteed valid in the interrupt path
++ * and protected by the igate mutex when triggered via ioctl.
++ */
++static void vfio_send_eventfd(struct vfio_platform_irq *irq_ctx)
++{
++ if (likely(irq_ctx->trigger))
++ eventfd_signal(irq_ctx->trigger, 1);
++}
++
+ static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
+ {
+ struct vfio_platform_irq *irq_ctx = dev_id;
+@@ -155,7 +165,7 @@ static irqreturn_t vfio_automasked_irq_h
+ spin_unlock_irqrestore(&irq_ctx->lock, flags);
+
+ if (ret == IRQ_HANDLED)
+- eventfd_signal(irq_ctx->trigger, 1);
++ vfio_send_eventfd(irq_ctx);
+
+ return ret;
+ }
+@@ -164,52 +174,40 @@ static irqreturn_t vfio_irq_handler(int
+ {
+ struct vfio_platform_irq *irq_ctx = dev_id;
+
+- eventfd_signal(irq_ctx->trigger, 1);
++ vfio_send_eventfd(irq_ctx);
+
+ return IRQ_HANDLED;
+ }
+
+ static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
+- int fd, irq_handler_t handler)
++ int fd)
+ {
+ struct vfio_platform_irq *irq = &vdev->irqs[index];
+ struct eventfd_ctx *trigger;
+- int ret;
+
+ if (irq->trigger) {
+- irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN);
+- free_irq(irq->hwirq, irq);
+- kfree(irq->name);
++ disable_irq(irq->hwirq);
+ eventfd_ctx_put(irq->trigger);
+ irq->trigger = NULL;
+ }
+
+ if (fd < 0) /* Disable only */
+ return 0;
+- irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)",
+- irq->hwirq, vdev->name);
+- if (!irq->name)
+- return -ENOMEM;
+
+ trigger = eventfd_ctx_fdget(fd);
+- if (IS_ERR(trigger)) {
+- kfree(irq->name);
++ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+- }
+
+ irq->trigger = trigger;
+
+- irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN);
+- ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
+- if (ret) {
+- kfree(irq->name);
+- eventfd_ctx_put(trigger);
+- irq->trigger = NULL;
+- return ret;
+- }
+-
+- if (!irq->masked)
+- enable_irq(irq->hwirq);
++ /*
++ * irq->masked effectively provides nested disables within the overall
++ * enable relative to trigger. Specifically request_irq() is called
++ * with NO_AUTOEN, therefore the IRQ is initially disabled. The user
++	 * may only further disable the IRQ with a MASK operation because
++ * irq->masked is initially false.
++ */
++ enable_irq(irq->hwirq);
+
+ return 0;
+ }
+@@ -228,7 +226,7 @@ static int vfio_platform_set_irq_trigger
+ handler = vfio_irq_handler;
+
+ if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+- return vfio_set_trigger(vdev, index, -1, handler);
++ return vfio_set_trigger(vdev, index, -1);
+
+ if (start != 0 || count != 1)
+ return -EINVAL;
+@@ -236,7 +234,7 @@ static int vfio_platform_set_irq_trigger
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int32_t fd = *(int32_t *)data;
+
+- return vfio_set_trigger(vdev, index, fd, handler);
++ return vfio_set_trigger(vdev, index, fd);
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+@@ -260,6 +258,14 @@ int vfio_platform_set_irqs_ioctl(struct
+ unsigned start, unsigned count, uint32_t flags,
+ void *data) = NULL;
+
++ /*
++ * For compatibility, errors from request_irq() are local to the
++ * SET_IRQS path and reflected in the name pointer. This allows,
++ * for example, polling mode fallback for an exclusive IRQ failure.
++ */
++ if (IS_ERR(vdev->irqs[index].name))
++ return PTR_ERR(vdev->irqs[index].name);
++
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ func = vfio_platform_set_irq_mask;
+@@ -280,7 +286,7 @@ int vfio_platform_set_irqs_ioctl(struct
+
+ int vfio_platform_irq_init(struct vfio_platform_device *vdev)
+ {
+- int cnt = 0, i;
++ int cnt = 0, i, ret = 0;
+
+ while (vdev->get_irq(vdev, cnt) >= 0)
+ cnt++;
+@@ -292,29 +298,54 @@ int vfio_platform_irq_init(struct vfio_p
+
+ for (i = 0; i < cnt; i++) {
+ int hwirq = vdev->get_irq(vdev, i);
++ irq_handler_t handler = vfio_irq_handler;
+
+- if (hwirq < 0)
++ if (hwirq < 0) {
++ ret = -EINVAL;
+ goto err;
++ }
+
+ spin_lock_init(&vdev->irqs[i].lock);
+
+ vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
+
+- if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
++ if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) {
+ vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE
+ | VFIO_IRQ_INFO_AUTOMASKED;
++ handler = vfio_automasked_irq_handler;
++ }
+
+ vdev->irqs[i].count = 1;
+ vdev->irqs[i].hwirq = hwirq;
+ vdev->irqs[i].masked = false;
++ vdev->irqs[i].name = kasprintf(GFP_KERNEL_ACCOUNT,
++ "vfio-irq[%d](%s)", hwirq,
++ vdev->name);
++ if (!vdev->irqs[i].name) {
++ ret = -ENOMEM;
++ goto err;
++ }
++
++ ret = request_irq(hwirq, handler, IRQF_NO_AUTOEN,
++ vdev->irqs[i].name, &vdev->irqs[i]);
++ if (ret) {
++ kfree(vdev->irqs[i].name);
++ vdev->irqs[i].name = ERR_PTR(ret);
++ }
+ }
+
+ vdev->num_irqs = cnt;
+
+ return 0;
+ err:
++ for (--i; i >= 0; i--) {
++ if (!IS_ERR(vdev->irqs[i].name)) {
++ free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]);
++ kfree(vdev->irqs[i].name);
++ }
++ }
+ kfree(vdev->irqs);
+- return -EINVAL;
++ return ret;
+ }
+
+ void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
+@@ -324,7 +355,12 @@ void vfio_platform_irq_cleanup(struct vf
+ for (i = 0; i < vdev->num_irqs; i++) {
+ vfio_virqfd_disable(&vdev->irqs[i].mask);
+ vfio_virqfd_disable(&vdev->irqs[i].unmask);
+- vfio_set_trigger(vdev, i, -1, NULL);
++ if (!IS_ERR(vdev->irqs[i].name)) {
++ free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]);
++ if (vdev->irqs[i].trigger)
++ eventfd_ctx_put(vdev->irqs[i].trigger);
++ kfree(vdev->irqs[i].name);
++ }
+ }
+
+ vdev->num_irqs = 0;
--- /dev/null
+From 29956748339aa8757a7e2f927a8679dd08f24bb6 Mon Sep 17 00:00:00 2001
+From: "Borislav Petkov (AMD)" <bp@alien8.de>
+Date: Fri, 2 Feb 2024 17:29:32 +0100
+Subject: x86/Kconfig: Remove CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+
+From: Borislav Petkov (AMD) <bp@alien8.de>
+
+commit 29956748339aa8757a7e2f927a8679dd08f24bb6 upstream.
+
+It was meant well at the time but nothing's using it so get rid of it.
+
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Link: https://lore.kernel.org/r/20240202163510.GDZb0Zvj8qOndvFOiZ@fat_crate.local
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/admin-guide/kernel-parameters.txt | 4 +---
+ Documentation/arch/x86/amd-memory-encryption.rst | 16 ++++++++--------
+ arch/x86/Kconfig | 13 -------------
+ arch/x86/mm/mem_encrypt_identity.c | 11 +----------
+ 4 files changed, 10 insertions(+), 34 deletions(-)
+
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -3269,9 +3269,7 @@
+
+ mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control
+ Valid arguments: on, off
+- Default (depends on kernel configuration option):
+- on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+- off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
++ Default: off
+ mem_encrypt=on: Activate SME
+ mem_encrypt=off: Do not activate SME
+
+--- a/Documentation/arch/x86/amd-memory-encryption.rst
++++ b/Documentation/arch/x86/amd-memory-encryption.rst
+@@ -87,14 +87,14 @@ The state of SME in the Linux kernel can
+ kernel is non-zero).
+
+ SME can also be enabled and activated in the BIOS. If SME is enabled and
+-activated in the BIOS, then all memory accesses will be encrypted and it will
+-not be necessary to activate the Linux memory encryption support. If the BIOS
+-merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate
+-memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+-by supplying mem_encrypt=on on the kernel command line. However, if BIOS does
+-not enable SME, then Linux will not be able to activate memory encryption, even
+-if configured to do so by default or the mem_encrypt=on command line parameter
+-is specified.
++activated in the BIOS, then all memory accesses will be encrypted and it
++will not be necessary to activate the Linux memory encryption support.
++
++If the BIOS merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG),
++then memory encryption can be enabled by supplying mem_encrypt=on on the
++kernel command line. However, if BIOS does not enable SME, then Linux
++will not be able to activate memory encryption, even if configured to do
++so by default or the mem_encrypt=on command line parameter is specified.
+
+ Secure Nested Paging (SNP)
+ ==========================
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1514,19 +1514,6 @@ config AMD_MEM_ENCRYPT
+ This requires an AMD processor that supports Secure Memory
+ Encryption (SME).
+
+-config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+- bool "Activate AMD Secure Memory Encryption (SME) by default"
+- depends on AMD_MEM_ENCRYPT
+- help
+- Say yes to have system memory encrypted by default if running on
+- an AMD processor that supports Secure Memory Encryption (SME).
+-
+- If set to Y, then the encryption of system memory can be
+- deactivated with the mem_encrypt=off command line option.
+-
+- If set to N, then the encryption of system memory can be
+- activated with the mem_encrypt=on command line option.
+-
+ # Common NUMA Features
+ config NUMA
+ bool "NUMA Memory Allocation and Scheduler Support"
+--- a/arch/x86/mm/mem_encrypt_identity.c
++++ b/arch/x86/mm/mem_encrypt_identity.c
+@@ -97,7 +97,6 @@ static char sme_workarea[2 * PMD_SIZE] _
+
+ static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+ static char sme_cmdline_on[] __initdata = "on";
+-static char sme_cmdline_off[] __initdata = "off";
+
+ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+ {
+@@ -504,7 +503,7 @@ void __init sme_encrypt_kernel(struct bo
+
+ void __init sme_enable(struct boot_params *bp)
+ {
+- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
++ const char *cmdline_ptr, *cmdline_arg, *cmdline_on;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ unsigned long me_mask;
+@@ -587,12 +586,6 @@ void __init sme_enable(struct boot_param
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+- asm ("lea sme_cmdline_off(%%rip), %0"
+- : "=r" (cmdline_off)
+- : "p" (sme_cmdline_off));
+-
+- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+- sme_me_mask = me_mask;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+@@ -602,8 +595,6 @@ void __init sme_enable(struct boot_param
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+- sme_me_mask = 0;
+
+ out:
+ if (sme_me_mask) {
--- /dev/null
+From 1c811d403afd73f04bde82b83b24c754011bd0e8 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Sat, 3 Feb 2024 13:53:06 +0100
+Subject: x86/sev: Fix position dependent variable references in startup code
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit 1c811d403afd73f04bde82b83b24c754011bd0e8 upstream.
+
+The early startup code executes from a 1:1 mapping of memory, which
+differs from the mapping that the code was linked and/or relocated to
+run at. The latter mapping is not active yet at this point, and so
+symbol references that rely on it will fault.
+
+Given that the core kernel is built without -fPIC, symbol references are
+typically emitted as absolute, and so any such references occurring in
+the early startup code will therefore crash the kernel.
+
+While an attempt was made to work around this for the early SEV/SME
+startup code, by forcing RIP-relative addressing for certain global
+SEV/SME variables via inline assembly (see snp_cpuid_get_table() for
+example), RIP-relative addressing must be pervasively enforced for
+SEV/SME global variables when accessed prior to page table fixups.
+
+__startup_64() already handles this issue for select non-SEV/SME global
+variables using fixup_pointer(), which adjusts the pointer relative to a
+`physaddr` argument. To avoid having to pass around this `physaddr`
+argument across all functions needing to apply pointer fixups, introduce
+a macro RIP_REL_REF() which generates a RIP-relative reference to
+a given global variable. It is used where necessary to force
+RIP-relative accesses to global variables.
+
+For backporting purposes, this patch makes no attempt at cleaning up
+other occurrences of this pattern, involving either inline asm or
+fixup_pointer(). Those will be addressed later.
+
+ [ bp: Call it "rip_rel_ref" everywhere like other code shortens
+ "rIP-relative reference" and make the asm wrapper __always_inline. ]
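+
+[ Ed. note: usage sketch: with RIP_REL_REF(), an early-boot test such
+  as
+
+	if (sev_status & MSR_AMD64_SEV_SNP_ENABLED)
+
+  which compiles to an absolute reference and faults under the 1:1
+  mapping, becomes
+
+	if (RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)
+
+  where the address is formed with a %rip-relative lea and therefore
+  resolves correctly from either mapping. ]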
+
+Co-developed-by: Kevin Loughlin <kevinloughlin@google.com>
+Signed-off-by: Kevin Loughlin <kevinloughlin@google.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: <stable@kernel.org>
+Link: https://lore.kernel.org/all/20240130220845.1978329-1-kevinloughlin@google.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/coco/core.c | 7 +------
+ arch/x86/include/asm/asm.h | 14 ++++++++++++++
+ arch/x86/include/asm/coco.h | 8 +++++++-
+ arch/x86/include/asm/mem_encrypt.h | 15 +++++++++------
+ arch/x86/kernel/sev-shared.c | 12 ++++++------
+ arch/x86/kernel/sev.c | 4 ++--
+ arch/x86/mm/mem_encrypt_identity.c | 27 ++++++++++++---------------
+ 7 files changed, 51 insertions(+), 36 deletions(-)
+
+--- a/arch/x86/coco/core.c
++++ b/arch/x86/coco/core.c
+@@ -14,7 +14,7 @@
+ #include <asm/processor.h>
+
+ enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
+-static u64 cc_mask __ro_after_init;
++u64 cc_mask __ro_after_init;
+
+ static bool noinstr intel_cc_platform_has(enum cc_attr attr)
+ {
+@@ -148,8 +148,3 @@ u64 cc_mkdec(u64 val)
+ }
+ }
+ EXPORT_SYMBOL_GPL(cc_mkdec);
+-
+-__init void cc_set_mask(u64 mask)
+-{
+- cc_mask = mask;
+-}
+--- a/arch/x86/include/asm/asm.h
++++ b/arch/x86/include/asm/asm.h
+@@ -113,6 +113,20 @@
+
+ #endif
+
++#ifndef __ASSEMBLY__
++#ifndef __pic__
++static __always_inline __pure void *rip_rel_ptr(void *p)
++{
++ asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
++
++ return p;
++}
++#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))
++#else
++#define RIP_REL_REF(var) (var)
++#endif
++#endif
++
+ /*
+ * Macros to generate condition code outputs from inline assembly,
+ * The output operand must be type "bool".
+--- a/arch/x86/include/asm/coco.h
++++ b/arch/x86/include/asm/coco.h
+@@ -2,6 +2,7 @@
+ #ifndef _ASM_X86_COCO_H
+ #define _ASM_X86_COCO_H
+
++#include <asm/asm.h>
+ #include <asm/types.h>
+
+ enum cc_vendor {
+@@ -11,9 +12,14 @@ enum cc_vendor {
+ };
+
+ extern enum cc_vendor cc_vendor;
++extern u64 cc_mask;
+
+ #ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+-void cc_set_mask(u64 mask);
++static inline void cc_set_mask(u64 mask)
++{
++ RIP_REL_REF(cc_mask) = mask;
++}
++
+ u64 cc_mkenc(u64 val);
+ u64 cc_mkdec(u64 val);
+ #else
+--- a/arch/x86/include/asm/mem_encrypt.h
++++ b/arch/x86/include/asm/mem_encrypt.h
+@@ -15,7 +15,8 @@
+ #include <linux/init.h>
+ #include <linux/cc_platform.h>
+
+-#include <asm/bootparam.h>
++#include <asm/asm.h>
++struct boot_params;
+
+ #ifdef CONFIG_X86_MEM_ENCRYPT
+ void __init mem_encrypt_init(void);
+@@ -57,6 +58,11 @@ void __init mem_encrypt_free_decrypted_m
+
+ void __init sev_es_init_vc_handling(void);
+
++static inline u64 sme_get_me_mask(void)
++{
++ return RIP_REL_REF(sme_me_mask);
++}
++
+ #define __bss_decrypted __section(".bss..decrypted")
+
+ #else /* !CONFIG_AMD_MEM_ENCRYPT */
+@@ -89,6 +95,8 @@ early_set_mem_enc_dec_hypercall(unsigned
+
+ static inline void mem_encrypt_free_decrypted_mem(void) { }
+
++static inline u64 sme_get_me_mask(void) { return 0; }
++
+ #define __bss_decrypted
+
+ #endif /* CONFIG_AMD_MEM_ENCRYPT */
+@@ -106,11 +114,6 @@ void add_encrypt_protection_map(void);
+
+ extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
+-static inline u64 sme_get_me_mask(void)
+-{
+- return sme_me_mask;
+-}
+-
+ #endif /* __ASSEMBLY__ */
+
+ #endif /* __X86_MEM_ENCRYPT_H__ */
+--- a/arch/x86/kernel/sev-shared.c
++++ b/arch/x86/kernel/sev-shared.c
+@@ -556,9 +556,9 @@ static int snp_cpuid(struct ghcb *ghcb,
+ leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+ /* Skip post-processing for out-of-range zero leafs. */
+- if (!(leaf->fn <= cpuid_std_range_max ||
+- (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+- (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
++ if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) ||
++ (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) ||
++ (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max))))
+ return 0;
+ }
+
+@@ -1063,11 +1063,11 @@ static void __init setup_cpuid_table(con
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x0)
+- cpuid_std_range_max = fn->eax;
++ RIP_REL_REF(cpuid_std_range_max) = fn->eax;
+ else if (fn->eax_in == 0x40000000)
+- cpuid_hyp_range_max = fn->eax;
++ RIP_REL_REF(cpuid_hyp_range_max) = fn->eax;
+ else if (fn->eax_in == 0x80000000)
+- cpuid_ext_range_max = fn->eax;
++ RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
+ }
+ }
+
+--- a/arch/x86/kernel/sev.c
++++ b/arch/x86/kernel/sev.c
+@@ -748,7 +748,7 @@ void __init early_snp_set_memory_private
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
++ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+@@ -767,7 +767,7 @@ void __init early_snp_set_memory_shared(
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
++ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /* Ask hypervisor to mark the memory pages shared in the RMP table. */
+--- a/arch/x86/mm/mem_encrypt_identity.c
++++ b/arch/x86/mm/mem_encrypt_identity.c
+@@ -304,7 +304,8 @@ void __init sme_encrypt_kernel(struct bo
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
+ */
+- if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
++ if (!sme_get_me_mask() ||
++ RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
+ return;
+
+ /*
+@@ -541,11 +542,11 @@ void __init sme_enable(struct boot_param
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check the SEV MSR whether SEV or SME is enabled */
+- sev_status = __rdmsr(MSR_AMD64_SEV);
+- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
++ RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
++ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
+- if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
++ if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
+ snp_abort();
+
+ /* Check if memory encryption is enabled */
+@@ -571,7 +572,6 @@ void __init sme_enable(struct boot_param
+ return;
+ } else {
+ /* SEV state cannot be controlled by a command line option */
+- sme_me_mask = me_mask;
+ goto out;
+ }
+
+@@ -590,16 +590,13 @@ void __init sme_enable(struct boot_param
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
+- goto out;
+-
+- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+- sme_me_mask = me_mask;
++ if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0 ||
++ strncmp(buffer, cmdline_on, sizeof(buffer)))
++ return;
+
+ out:
+- if (sme_me_mask) {
+- physical_mask &= ~sme_me_mask;
+- cc_vendor = CC_VENDOR_AMD;
+- cc_set_mask(sme_me_mask);
+- }
++ RIP_REL_REF(sme_me_mask) = me_mask;
++ physical_mask &= ~me_mask;
++ cc_vendor = CC_VENDOR_AMD;
++ cc_set_mask(me_mask);
+ }