From: Michael Brown Date: Thu, 9 Oct 2025 15:01:51 +0000 (+0100) Subject: [pci] Map all MSI-X interrupts to a dummy target address by default X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4224f574daa2cea6e8f402e766cdbb35a0178487;p=thirdparty%2Fipxe.git [pci] Map all MSI-X interrupts to a dummy target address by default Interrupts as such are not used in iPXE, which operates in polling mode. However, some network cards (such as the Intel 40GbE and 100GbE NICs) will defer writing out completions until the point of asserting an MSI-X interrupt. From the point of view of the PCI device, asserting an MSI-X interrupt is just a 32-bit DMA write of an opaque value to an opaque target address. The PCI device has no know to know whether or not the target address corresponds to a real APIC. We can therefore trick the PCI device into believing that it is asserting an MSI-X interrupt, by configuring it to write an opaque 32-bit value to a dummy target address in host memory. This is sufficient to trigger the associated write of the completions to host memory. Allocate a dummy target address when enabling MSI-X on a PCI device, and map all interrupts to this target address by default. Signed-off-by: Michael Brown --- diff --git a/src/drivers/bus/pcimsix.c b/src/drivers/bus/pcimsix.c index eb0450d91..f55488ad7 100644 --- a/src/drivers/bus/pcimsix.c +++ b/src/drivers/bus/pcimsix.c @@ -33,6 +33,38 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); * * PCI MSI-X interrupts * + * Interrupts as such are not used in iPXE, which operates in polling + * mode. However, some network cards (such as the Intel 40GbE and + * 100GbE NICs) will defer writing out completions until the point of + * asserting an MSI-X interrupt. + * + * From the point of view of the PCI device, asserting an MSI-X + * interrupt is just a 32-bit DMA write of an opaque value to an + * opaque target address. The PCI device has no know to know whether + * or not the target address corresponds to a real APIC. + * + * We can therefore trick the PCI device into believing that it is + * asserting an MSI-X interrupt, by configuring it to write an opaque + * 32-bit value to a dummy target address in host memory. This is + * sufficient to trigger the associated write of the completions to + * host memory. + * + * When running in a virtual machine, the hypervisor will intercept + * our attempt to configure MSI-X on the PCI device. The physical + * hardware will be configured to raise an interrupt under the + * hypervisor's control, which will then be reflected back into the + * virtual machine. The opaque value that we write will be assumed to + * indicate an interrupt vector number (as would normally be the case + * when configuring MSI-X), and the opaque address will generally be + * ignored. The reflected interrupt will be ignored (since it is not + * enabled within the virtual machine), but the device still asserts + * an MSI-X interrupt and so still triggers the associated write of + * the completions to host memory. + * + * Note that since the opaque target address will generally be ignored + * by the hypervisor, we cannot examine the value present at the dummy + * target address to find out whether or not an interrupt has been + * raised. */ /** @@ -103,6 +135,8 @@ static void * pci_msix_ioremap ( struct pci_device *pci, struct pci_msix *msix, */ int pci_msix_enable ( struct pci_device *pci, struct pci_msix *msix ) { uint16_t ctrl; + physaddr_t msg; + unsigned int i; int rc; /* Locate capability */ @@ -134,6 +168,19 @@ int pci_msix_enable ( struct pci_device *pci, struct pci_msix *msix ) { goto err_pba; } + /* Allocate dummy target */ + msix->msg = dma_alloc ( &pci->dma, &msix->map, sizeof ( *msix->msg ), + sizeof ( *msix->msg ) ); + if ( ! msix->msg ) { + rc = -ENOMEM; + goto err_msg; + } + + /* Map all interrupts to dummy target by default */ + msg = dma ( &msix->map, msix->msg ); + for ( i = 0 ; i < msix->count ; i++ ) + pci_msix_map ( msix, i, msg, 0 ); + /* Enable MSI-X */ ctrl &= ~PCI_MSIX_CTRL_MASK; ctrl |= PCI_MSIX_CTRL_ENABLE; @@ -141,6 +188,8 @@ int pci_msix_enable ( struct pci_device *pci, struct pci_msix *msix ) { return 0; + dma_free ( &msix->map, msix->msg, sizeof ( *msix->msg ) ); + err_msg: iounmap ( msix->pba ); err_pba: iounmap ( msix->table ); @@ -163,6 +212,9 @@ void pci_msix_disable ( struct pci_device *pci, struct pci_msix *msix ) { ctrl &= ~PCI_MSIX_CTRL_ENABLE; pci_write_config_word ( pci, ( msix->cap + PCI_MSIX_CTRL ), ctrl ); + /* Free dummy target */ + dma_free ( &msix->map, msix->msg, sizeof ( *msix->msg ) ); + /* Unmap pending bit array */ iounmap ( msix->pba ); diff --git a/src/include/ipxe/pcimsix.h b/src/include/ipxe/pcimsix.h index aa2aaf017..b40c6c357 100644 --- a/src/include/ipxe/pcimsix.h +++ b/src/include/ipxe/pcimsix.h @@ -40,6 +40,10 @@ struct pci_msix { void *table; /** Pending bit array */ void *pba; + /** Dummy message target */ + uint32_t *msg; + /** Dummy message target mapping */ + struct dma_mapping map; }; extern int pci_msix_enable ( struct pci_device *pci, struct pci_msix *msix );