From: Christoph Hellwig
Date: Mon, 7 Jul 2025 12:52:23 +0000 (+0200)
Subject: nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b8b7570a7ec872f2a27b775c4f8710ca8a357adf;p=thirdparty%2Fkernel%2Flinux.git

nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping

The current version of the blk_rq_dma_map support in nvme-pci tries to
reconstruct the DMA mappings from the on-the-wire descriptors if they
are needed for unmapping.  While this is not the case for the direct
mapping fast path and the IOVA path, it is needed for the non-IOVA slow
path, e.g. when the interconnect is not DMA coherent, when using
swiotlb bounce buffering, or when using an IOMMU mapping that can't
coalesce.

While the reconstruction is easy and works fine for the SGL path, where
the on-the-wire representation maps 1:1 to DMA mappings, the code to
reconstruct the DMA mapping ranges from PRPs can't always work, as a
given PRP layout can come from different DMA mappings, and the current
code doesn't even always get that right.

Give up on this approach and instead track the actual DMA mappings when
they will be needed again for unmapping.

Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
Reported-by: Ben Copeland
Signed-off-by: Christoph Hellwig
Reviewed-by: Keith Busch
Tested-by: Jens Axboe
Link: https://lore.kernel.org/r/20250707125223.3022531-1-hch@lst.de
Signed-off-by: Jens Axboe
---

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a509dc1f1d140..e236a9091a2e1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -173,6 +173,7 @@ struct nvme_dev {
 	bool hmb;
 	struct sg_table *hmb_sgt;
 
+	mempool_t *dmavec_mempool;
 	mempool_t *iod_meta_mempool;
 
 	/* shadow doorbell buffer support: */
@@ -262,6 +263,11 @@ enum nvme_iod_flags {
 	IOD_SINGLE_SEGMENT	= 1U << 2,
 };
 
+struct nvme_dma_vec {
+	dma_addr_t addr;
+	unsigned int len;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  */
@@ -274,6 +280,8 @@ struct nvme_iod {
 	unsigned int total_len;
 	struct dma_iova_state dma_state;
 	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+	struct nvme_dma_vec *dma_vecs;
+	unsigned int nr_dma_vecs;
 
 	dma_addr_t meta_dma;
 	struct sg_table meta_sgt;
@@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct device *dma_dev = nvmeq->dev->dev;
-	enum dma_data_direction dir = rq_dma_dir(req);
-	int length = iod->total_len;
-	dma_addr_t dma_addr;
-	int i, desc;
-	__le64 *prp_list;
-	u32 dma_len;
-
-	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
-	dma_len = min_t(u32, length,
-		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
-	length -= dma_len;
-	if (!length) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		return;
-	}
-
-	if (length <= NVME_CTRL_PAGE_SIZE) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
-		dma_unmap_page(dma_dev, dma_addr, length, dir);
-		return;
-	}
-
-	i = 0;
-	desc = 0;
-	prp_list = iod->descriptors[desc];
-	do {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-			prp_list = iod->descriptors[++desc];
-			i = 0;
-		}
+	unsigned int i;
 
-		dma_addr = le64_to_cpu(prp_list[i++]);
-		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
-		length -= dma_len;
-	} while (length);
+	for (i = 0; i < iod->nr_dma_vecs; i++)
+		dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+				iod->dma_vecs[i].len, rq_dma_dir(req));
+	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
 }
 
 static void nvme_free_sgls(struct request *req)
@@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
 	nvme_free_descriptors(req);
 }
 
+static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
+		struct blk_dma_iter *iter)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	if (iter->len)
+		return true;
+	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
+		return false;
+	if (dma_need_unmap(dma_dev)) {
+		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
+		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
+		iod->nr_dma_vecs++;
+	}
+	return true;
+}
+
 static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 		struct blk_dma_iter *iter)
 {
@@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 	unsigned int prp_len, i;
 	__le64 *prp_list;
 
+	if (dma_need_unmap(nvmeq->dev->dev)) {
+		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
+				GFP_ATOMIC);
+		if (!iod->dma_vecs)
+			return BLK_STS_RESOURCE;
+		iod->dma_vecs[0].addr = iter->addr;
+		iod->dma_vecs[0].len = iter->len;
+		iod->nr_dma_vecs = 1;
+	}
+
 	/*
 	 * PRP1 always points to the start of the DMA transfers.
 	 *
@@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 	if (!length)
 		goto done;
 
-	if (!iter->len) {
-		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-				&iod->dma_state, iter)) {
-			if (WARN_ON_ONCE(!iter->status))
-				goto bad_sgl;
-			goto done;
-		}
+	if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+		if (WARN_ON_ONCE(!iter->status))
+			goto bad_sgl;
+		goto done;
 	}
 
 	/*
@@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 		if (!length)
 			break;
 
-		if (iter->len == 0) {
-			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-					&iod->dma_state, iter)) {
-				if (WARN_ON_ONCE(!iter->status))
-					goto bad_sgl;
-				goto done;
-			}
+		if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+			if (WARN_ON_ONCE(!iter->status))
+				goto bad_sgl;
+			goto done;
 		}
 
 		/*
@@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
 	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
+	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
+
+	dev->dmavec_mempool = mempool_create_node(1,
+			mempool_kmalloc, mempool_kfree,
+			(void *)alloc_size, GFP_KERNEL,
+			dev_to_node(dev->dev));
+	if (!dev->dmavec_mempool)
+		return -ENOMEM;
 
 	dev->iod_meta_mempool = mempool_create_node(1,
 			mempool_kmalloc, mempool_kfree,
 			(void *)meta_size, GFP_KERNEL,
 			dev_to_node(dev->dev));
 	if (!dev->iod_meta_mempool)
-		return -ENOMEM;
+		goto free;
 	return 0;
+free:
+	mempool_destroy(dev->dmavec_mempool);
+	return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3477,6 +3485,7 @@ out_disable:
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
 out_release_iod_mempool:
+	mempool_destroy(dev->dmavec_mempool);
 	mempool_destroy(dev->iod_meta_mempool);
 out_dev_unmap:
 	nvme_dev_unmap(dev);
@@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove_admin(dev);
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
+	mempool_destroy(dev->dmavec_mempool);
 	mempool_destroy(dev->iod_meta_mempool);
 	nvme_release_descriptor_pools(dev);
 	nvme_dev_unmap(dev);
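
For readers who want the bookkeeping pattern in isolation, below is a minimal
user-space C sketch of the approach the patch switches to: every range handed
out by the mapping iterator is recorded once, and unmapping simply replays the
recorded list instead of re-deriving the ranges from the on-the-wire PRP
entries.  This is not kernel code; the names (record_mapping, unmap_all,
io_state) are made up for illustration and printf() stands in for
dma_unmap_page().

#include <stdio.h>
#include <stdlib.h>

struct dma_vec {
	unsigned long long addr;	/* bus address of one mapped range */
	unsigned int len;		/* length of that range in bytes */
};

struct io_state {
	struct dma_vec *vecs;		/* filled in at map time */
	unsigned int nr_vecs;
};

/* Record one mapped range, as the driver's iterator helper now does. */
static void record_mapping(struct io_state *io, unsigned long long addr,
		unsigned int len)
{
	io->vecs[io->nr_vecs].addr = addr;
	io->vecs[io->nr_vecs].len = len;
	io->nr_vecs++;
}

/* Unmap by replaying the recorded ranges, as the reworked free path does. */
static void unmap_all(struct io_state *io)
{
	unsigned int i;

	for (i = 0; i < io->nr_vecs; i++)
		printf("unmap addr=0x%llx len=%u\n",
		       io->vecs[i].addr, io->vecs[i].len);
	free(io->vecs);
}

int main(void)
{
	struct io_state io = { .vecs = calloc(4, sizeof(struct dma_vec)) };

	if (!io.vecs)
		return 1;
	/* Three ranges as a mapping iterator might return them. */
	record_mapping(&io, 0x80001000ULL, 4096);
	record_mapping(&io, 0x80003000ULL, 8192);
	record_mapping(&io, 0x80007200ULL, 512);
	unmap_all(&io);
	return 0;
}

The point of the pattern is that the unmap side needs no knowledge of how the
ranges were encoded into PRPs, which is exactly the information that cannot be
reliably reconstructed.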