* such as '0004:04:00.0', and tells whether the device is an NVLink2
* bridge.
*/
-static ATTRIBUTE_UNUSED bool
+static bool
ppc64VFIODeviceIsNV2Bridge(const char *device)
{
const char *nvlink2Files[] = {"ibm,gpu", "ibm,nvlink",
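As a rough, self-contained illustration of this kind of check (not the patch code), the sketch below probes for NVLink2-related device-tree properties under the device's sysfs of_node directory. The helper name and the sysfs path layout are assumptions made for the sketch, and only the two property names visible above are taken from the patch; the patch's list continues past this excerpt.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical standalone check: returns true only if every listed
 * device-tree property is exposed for the PCI device at @device,
 * e.g. "0004:04:00.0". */
static bool
sketchDeviceIsNV2Bridge(const char *device)
{
    /* only the first two entries are visible in the excerpt above */
    const char *nvlink2Files[] = {"ibm,gpu", "ibm,nvlink"};
    size_t i;

    for (i = 0; i < sizeof(nvlink2Files) / sizeof(nvlink2Files[0]); i++) {
        char path[256];

        /* assumed sysfs layout for the device's device-tree node */
        snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/of_node/%s",
                 device, nvlink2Files[i]);
        if (access(path, F_OK) != 0)
            return false;
    }

    return true;
}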
unsigned long long maxMemory = 0;
unsigned long long passthroughLimit = 0;
size_t i, nPCIHostBridges = 0;
+ virPCIDeviceAddressPtr pciAddr;
bool usesVFIO = false;
+ bool nvlink2Capable = false;
for (i = 0; i < def->ncontrollers; i++) {
virDomainControllerDefPtr cont = def->controllers[i];
dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
usesVFIO = true;
- break;
+
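+ /* Do not stop at the first VFIO device: keep scanning so that a
+ * later hostdev can still be identified as an NVLink2 bridge, which
+ * switches the passthroughLimit calculation below to the NVLink2 case. */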
+ pciAddr = &dev->source.subsys.u.pci.addr;
+ if (virPCIDeviceAddressIsValid(pciAddr, false)) {
+ VIR_AUTOFREE(char *) pciAddrStr = NULL;
+
+ pciAddrStr = virPCIDeviceAddressAsString(pciAddr);
+ if (ppc64VFIODeviceIsNV2Bridge(pciAddrStr)) {
+ nvlink2Capable = true;
+ break;
+ }
+ }
}
}
4096 * nPCIHostBridges +
8192;
- /* passthroughLimit := max( 2 GiB * #PHBs, (c)
- * memory (d)
- * + memory * 1/512 * #PHBs + 8 MiB ) (e)
+ /* NVLink2 support in QEMU is a special case of the passthrough
+ * mechanics explained in the usesVFIO case below. The GPU RAM
+ * is placed with a gap after maxMemory. The current QEMU
+ * implementation puts the NVIDIA RAM above the PCI MMIO window, which
+ * starts at 32TiB, just past the area reserved for the guest main RAM.
*
- * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
- * rather than 1 GiB
+ * This window ends at 64TiB, and this is where the GPUs are being
+ * placed. The next available window size is 128TiB, and
+ * 64TiB..128TiB will fit all possible NVIDIA GPUs.
*
- * (d) is the with-DDW (and memory pre-registration and related
- * features) DMA window accounting - assuming that we only account RAM
- * once, even if mapped to multiple PHBs
+ * The same assumption as the most common case applies here:
+ * the guest will request a 64-bit DMA window, per PHB, that is
+ * big enough to map all its RAM, which now extends up to 128TiB
+ * due to the GPUs.
*
- * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
- * window. This is based a bit on expected guest behaviour, but there
- * really isn't a way to completely avoid that. We assume the guest
- * requests a 64-bit DMA window (per PHB) just big enough to map all
- * its RAM. 4 kiB page size gives the 1/512; it will be less with 64
- * kiB pages, less still if the guest is mapped with hugepages (unlike
- * the default 32-bit DMA window, DDW windows can use large IOMMU
- * pages). 8 MiB is for second and further level overheads, like (b) */
- if (usesVFIO)
+ * Note that the NVIDIA RAM window must be accounted for in the TCE
+ * table size, but *not* in the main RAM (maxMemory). This gives
+ * us the following passthroughLimit for the NVLink2 case:
+ *
+ * passthroughLimit = maxMemory +
+ * 128TiB/512KiB * #PHBs + 8 MiB */
+ if (nvlink2Capable) {
+ passthroughLimit = maxMemory +
+ 128 * (1ULL<<30) / 512 * nPCIHostBridges +
+ 8192;
+ } else if (usesVFIO) {
+ /* For regular VFIO passthrough (no NVLink2 devices present), the value
+ * of passthroughLimit is:
+ *
+ * passthroughLimit := max( 2 GiB * #PHBs, (c)
+ * memory (d)
+ * + memory * 1/512 * #PHBs + 8 MiB ) (e)
+ *
+ * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2
+ * GiB rather than 1 GiB
+ *
+ * (d) is the with-DDW (and memory pre-registration and related
+ * features) DMA window accounting - assuming that we only account
+ * RAM once, even if mapped to multiple PHBs
+ *
+ * (e) is the with-DDW userspace view and overhead for the 64-bit
+ * DMA window. This is based a bit on expected guest behaviour, but
+ * there really isn't a way to completely avoid that. We assume the
+ * guest requests a 64-bit DMA window (per PHB) just big enough to
+ * map all its RAM. 4 kiB page size gives the 1/512; it will be
+ * less with 64 kiB pages, less still if the guest is mapped with
+ * hugepages (unlike the default 32-bit DMA window, DDW windows
+ * can use large IOMMU pages). 8 MiB is for second and further level
+ * overheads, like (b) */
passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
memory +
memory / 512 * nPCIHostBridges + 8192);
+ }
memKB = baseLimit + passthroughLimit;
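To make the two passthroughLimit formulas above concrete, here is a small, self-contained sketch (not part of the patch) that evaluates both branches for a hypothetical guest: 128 GiB of RAM, a 512 GiB hotplug ceiling (maxMemory) and a single PCI host bridge. All quantities are in KiB, matching the variables in the code above; the guest sizes are invented purely for illustration.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* hypothetical guest, all values in KiB */
    unsigned long long memory = 128ULL << 20;     /* 128 GiB of RAM */
    unsigned long long maxMemory = 512ULL << 20;  /* 512 GiB hotplug ceiling */
    unsigned long long nPCIHostBridges = 1;
    unsigned long long passthroughLimit;

    /* NVLink2 case: maxMemory + 128TiB/512KiB * #PHBs + 8 MiB */
    passthroughLimit = maxMemory +
                       128 * (1ULL << 30) / 512 * nPCIHostBridges +
                       8192;
    printf("NVLink2 VFIO: %llu KiB (~%llu GiB)\n",
           passthroughLimit, passthroughLimit >> 20);

    /* plain VFIO case: max(2 GiB * #PHBs, memory + memory/512 * #PHBs + 8 MiB) */
    passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
                           memory + memory / 512 * nPCIHostBridges + 8192);
    printf("plain VFIO:   %llu KiB (~%llu GiB)\n",
           passthroughLimit, passthroughLimit >> 20);

    return 0;
}

For this hypothetical guest the NVLink2 branch comes out to roughly 768 GiB plus 8 MiB, while the plain VFIO branch comes out to roughly 128 GiB plus 264 MiB; baseLimit is then added on top in either case.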