nvme-pci: make PRP list DMA pools per-NUMA-node
author     Caleb Sander Mateos <csander@purestorage.com>
           Mon, 12 May 2025 13:50:40 +0000 (15:50 +0200)
committer  Christoph Hellwig <hch@lst.de>
           Tue, 20 May 2025 03:34:27 +0000 (05:34 +0200)
NVMe commands with over 8 KB of discontiguous data allocate PRP list
pages from the per-nvme_device dma_pool prp_page_pool or prp_small_pool.
Each call to dma_pool_alloc() and dma_pool_free() takes the per-dma_pool
spinlock. These device-global spinlocks are a significant source of
contention when many CPUs are submitting to the same NVMe devices. On a
workload issuing 32 KB reads from 16 CPUs (8 hypertwin pairs) across 2
NUMA nodes to 23 NVMe devices, we observed 2.4% of CPU time spent in
_raw_spin_lock_irqsave called from dma_pool_alloc and dma_pool_free.
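
(Arithmetic behind the 8 KB figure, assuming the default 4 KB
NVME_CTRL_PAGE_SIZE: PRP1 and PRP2 can describe at most two pages inline,
so any larger discontiguous transfer needs at least one PRP list
allocation. Lists of up to 256 / 8 = 32 entries, enough for I/Os in
roughly the 4k to 128k range per the pool's own comment, come from
prp_small_pool; anything bigger uses prp_page_pool.)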

Ideally, the dma_pools would be per-hctx to minimize contention. But
that could impose considerable resource costs in a system with many NVMe
devices and CPUs.
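
(A rough sense of that cost, assuming one hardware queue per CPU on the
system above: per-hctx pools would mean on the order of 23 devices * 16
queues * 2 pools = 736 dma_pools, versus 23 * 2 NUMA nodes * 2 = 92 with
the per-node scheme.)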

As a compromise, allocate per-NUMA-node PRP list DMA pools. Map each
nvme_queue to the set of DMA pools corresponding to its device and its
hctx's NUMA node. This reduces the _raw_spin_lock_irqsave overhead by
about half, to 1.2%. Preventing the sharing of PRP list pages across
NUMA nodes also makes them cheaper to initialize.
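
In outline (a condensed, non-compilable sketch of the hunks below; every
identifier is taken from the patch itself):

struct nvme_prp_dma_pools {
        struct dma_pool *large;    /* NVME_CTRL_PAGE_SIZE PRP list pages */
        struct dma_pool *small;    /* 256-byte lists for smaller transfers */
};

/*
 * struct nvme_dev grows a flexible array with nr_node_ids entries:
 *         struct nvme_prp_dma_pools prp_pools[];
 * nvme_setup_prp_pools(dev, node) creates that node's pair on first use via
 * dma_pool_create_node(), and nvme_init_hctx_common() copies it into the
 * nvme_queue:
 *         nvmeq->prp_pools = dev->prp_pools[hctx->numa_node];
 * so the hot-path dma_pool_alloc()/dma_pool_free() calls only contend with
 * CPUs submitting from the same NUMA node.
 */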

Link: https://lore.kernel.org/linux-nvme/CADUfDZqa=OOTtTTznXRDmBQo1WrFcDw1hBA7XwM7hzJ-hpckcA@mail.gmail.com/T/#u
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/nvme/host/pci.c

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 30abf134c886613514a0de3d1e3ceab02983fb20..e6b6b6ee08786f596423d6765765a9b0994cd299 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/nodemask.h>
 #include <linux/once.h>
 #include <linux/pci.h>
 #include <linux/suspend.h>
@@ -112,6 +113,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 static void nvme_delete_io_queues(struct nvme_dev *dev);
 static void nvme_update_attrs(struct nvme_dev *dev);
 
+struct nvme_prp_dma_pools {
+       struct dma_pool *large;
+       struct dma_pool *small;
+};
+
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -121,8 +127,6 @@ struct nvme_dev {
        struct blk_mq_tag_set admin_tagset;
        u32 __iomem *dbs;
        struct device *dev;
-       struct dma_pool *prp_page_pool;
-       struct dma_pool *prp_small_pool;
        unsigned online_queues;
        unsigned max_qid;
        unsigned io_queues[HCTX_MAX_TYPES];
@@ -162,6 +166,7 @@ struct nvme_dev {
        unsigned int nr_allocated_queues;
        unsigned int nr_write_queues;
        unsigned int nr_poll_queues;
+       struct nvme_prp_dma_pools prp_pools[];
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -191,6 +196,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
  */
 struct nvme_queue {
        struct nvme_dev *dev;
+       struct nvme_prp_dma_pools prp_pools;
        spinlock_t sq_lock;
        void *sq_cmds;
         /* only used for poll queues: */
@@ -397,15 +403,64 @@ static int nvme_pci_npages_prp(void)
        return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
 }
 
+static struct nvme_prp_dma_pools *
+nvme_setup_prp_pools(struct nvme_dev *dev, unsigned numa_node)
+{
+       struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[numa_node];
+       size_t small_align = 256;
+
+       if (prp_pools->small)
+               return prp_pools; /* already initialized */
+
+       prp_pools->large = dma_pool_create_node("prp list page", dev->dev,
+                                               NVME_CTRL_PAGE_SIZE,
+                                               NVME_CTRL_PAGE_SIZE, 0,
+                                               numa_node);
+       if (!prp_pools->large)
+               return ERR_PTR(-ENOMEM);
+
+       if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+               small_align = 512;
+
+       /* Optimisation for I/Os between 4k and 128k */
+       prp_pools->small = dma_pool_create_node("prp list 256", dev->dev,
+                                               256, small_align, 0, numa_node);
+       if (!prp_pools->small) {
+               dma_pool_destroy(prp_pools->large);
+               prp_pools->large = NULL;
+               return ERR_PTR(-ENOMEM);
+       }
+
+       return prp_pools;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+       unsigned i;
+
+       for (i = 0; i < nr_node_ids; i++) {
+               struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[i];
+
+               dma_pool_destroy(prp_pools->large);
+               dma_pool_destroy(prp_pools->small);
+       }
+}
+
 static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
                unsigned qid)
 {
        struct nvme_dev *dev = to_nvme_dev(data);
        struct nvme_queue *nvmeq = &dev->queues[qid];
+       struct nvme_prp_dma_pools *prp_pools;
        struct blk_mq_tags *tags;
 
        tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
        WARN_ON(tags != hctx->tags);
+       prp_pools = nvme_setup_prp_pools(dev, hctx->numa_node);
+       if (IS_ERR(prp_pools))
+               return PTR_ERR(prp_pools);
+
+       nvmeq->prp_pools = *prp_pools;
        hctx->driver_data = nvmeq;
        return 0;
 }
@@ -539,7 +594,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
        return true;
 }
 
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req)
 {
        const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -550,12 +605,13 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
                __le64 *prp_list = iod->list[i].prp_list;
                dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
 
-               dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+               dma_pool_free(nvmeq->prp_pools.large, prp_list, dma_addr);
                dma_addr = next_dma_addr;
        }
 }
 
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+                           struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
@@ -570,13 +626,13 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
        dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
 
        if (iod->nr_allocations == 0)
-               dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
+               dma_pool_free(nvmeq->prp_pools.small, iod->list[0].sg_list,
                              iod->first_dma);
        else if (iod->nr_allocations == 1)
-               dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
+               dma_pool_free(nvmeq->prp_pools.large, iod->list[0].sg_list,
                              iod->first_dma);
        else
-               nvme_free_prps(dev, req);
+               nvme_free_prps(nvmeq, req);
        mempool_free(iod->sgt.sgl, dev->iod_mempool);
 }
 
@@ -594,7 +650,7 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
        }
 }
 
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
                struct request *req, struct nvme_rw_command *cmnd)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -630,10 +686,10 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 
        nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
        if (nprps <= (256 / 8)) {
-               pool = dev->prp_small_pool;
+               pool = nvmeq->prp_pools.small;
                iod->nr_allocations = 0;
        } else {
-               pool = dev->prp_page_pool;
+               pool = nvmeq->prp_pools.large;
                iod->nr_allocations = 1;
        }
 
@@ -675,7 +731,7 @@ done:
        cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
        return BLK_STS_OK;
 free_prps:
-       nvme_free_prps(dev, req);
+       nvme_free_prps(nvmeq, req);
        return BLK_STS_RESOURCE;
 bad_sgl:
        WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
@@ -700,7 +756,7 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
        sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
 }
 
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
                struct request *req, struct nvme_rw_command *cmd)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -720,10 +776,10 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
        }
 
        if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
-               pool = dev->prp_small_pool;
+               pool = nvmeq->prp_pools.small;
                iod->nr_allocations = 0;
        } else {
-               pool = dev->prp_page_pool;
+               pool = nvmeq->prp_pools.large;
                iod->nr_allocations = 1;
        }
 
@@ -787,12 +843,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct nvme_command *cmnd)
 {
+       struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        blk_status_t ret = BLK_STS_RESOURCE;
        int rc;
 
        if (blk_rq_nr_phys_segments(req) == 1) {
-               struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
                struct bio_vec bv = req_bvec(req);
 
                if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -827,9 +883,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
        }
 
        if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
-               ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+               ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
        else
-               ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+               ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
        if (ret != BLK_STS_OK)
                goto out_unmap_sg;
        return BLK_STS_OK;
@@ -844,6 +900,7 @@ out_free_sg:
 static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
                                             struct request *req)
 {
+       struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_rw_command *cmnd = &iod->cmd.rw;
        struct nvme_sgl_desc *sg_list;
@@ -867,7 +924,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
        if (rc)
                goto out_free_sg;
 
-       sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+       sg_list = dma_pool_alloc(nvmeq->prp_pools.small, GFP_ATOMIC, &sgl_dma);
        if (!sg_list)
                goto out_unmap_sg;
 
@@ -949,7 +1006,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
        return BLK_STS_OK;
 out_unmap_data:
        if (blk_rq_nr_phys_segments(req))
-               nvme_unmap_data(dev, req);
+               nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
 out_free_cmd:
        nvme_cleanup_cmd(req);
        return ret;
@@ -1039,6 +1096,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
 }
 
 static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+                                               struct nvme_queue *nvmeq,
                                                struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1050,7 +1108,7 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
                return;
        }
 
-       dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+       dma_pool_free(nvmeq->prp_pools.small, iod->meta_list.sg_list,
                      iod->meta_dma);
        dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
        mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
@@ -1062,10 +1120,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
        struct nvme_dev *dev = nvmeq->dev;
 
        if (blk_integrity_rq(req))
-               nvme_unmap_metadata(dev, req);
+               nvme_unmap_metadata(dev, nvmeq, req);
 
        if (blk_rq_nr_phys_segments(req))
-               nvme_unmap_data(dev, req);
+               nvme_unmap_data(dev, nvmeq, req);
 }
 
 static void nvme_pci_complete_rq(struct request *req)
@@ -2842,35 +2900,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
        return 0;
 }
 
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
-       size_t small_align = 256;
-
-       dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
-                                               NVME_CTRL_PAGE_SIZE,
-                                               NVME_CTRL_PAGE_SIZE, 0);
-       if (!dev->prp_page_pool)
-               return -ENOMEM;
-
-       if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
-               small_align = 512;
-
-       /* Optimisation for I/Os between 4k and 128k */
-       dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
-                                               256, small_align, 0);
-       if (!dev->prp_small_pool) {
-               dma_pool_destroy(dev->prp_page_pool);
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
-       dma_pool_destroy(dev->prp_page_pool);
-       dma_pool_destroy(dev->prp_small_pool);
-}
-
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
        size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
@@ -3185,7 +3214,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
        struct nvme_dev *dev;
        int ret = -ENOMEM;
 
-       dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+       dev = kzalloc_node(sizeof(*dev) + nr_node_ids * sizeof(*dev->prp_pools),
+                          GFP_KERNEL, node);
        if (!dev)
                return ERR_PTR(-ENOMEM);
        INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@@ -3260,13 +3290,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto out_uninit_ctrl;
 
-       result = nvme_setup_prp_pools(dev);
-       if (result)
-               goto out_dev_unmap;
-
        result = nvme_pci_alloc_iod_mempool(dev);
        if (result)
-               goto out_release_prp_pools;
+               goto out_dev_unmap;
 
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
@@ -3342,8 +3368,6 @@ out_disable:
 out_release_iod_mempool:
        mempool_destroy(dev->iod_mempool);
        mempool_destroy(dev->iod_meta_mempool);
-out_release_prp_pools:
-       nvme_release_prp_pools(dev);
 out_dev_unmap:
        nvme_dev_unmap(dev);
 out_uninit_ctrl: