git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
iommu/io-pgtable-arm: Optimise non-coherent unmap
author Ashish Mhetre <amhetre@nvidia.com>
Tue, 6 Aug 2024 10:51:35 +0000 (10:51 +0000)
committer Joerg Roedel <jroedel@suse.de>
Fri, 30 Aug 2024 12:29:32 +0000 (14:29 +0200)
The current __arm_lpae_unmap() function calls dma_sync() on individual
PTEs after clearing them. Overall unmap performance can be improved by
around 25% for large buffer sizes by combining the syncs for adjacent
leaf entries.
Optimize the unmap time by clearing all the leaf entries and issuing a
single dma_sync() for them.
Below is a detailed analysis of average unmap latency (in us) with and
without this optimization, obtained by running dma_map_benchmark for
different buffer sizes.

UnMap Latency(us)
Size Without With % gain with
optimization optimization optimization

4KB 3 3 0
8KB 4 3.8 5
16KB 6.1 5.4 11.48
32KB 10.2 8.5 16.67
64KB 18.5 14.9 19.46
128KB 35 27.5 21.43
256KB 67.5 52.2 22.67
512KB 127.9 97.2 24.00
1MB 248.6 187.4 24.62
2MB 65.5 65.5 0
4MB 119.2 119 0.17

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Ashish Mhetre <amhetre@nvidia.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20240806105135.218089-1-amhetre@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
drivers/iommu/io-pgtable-arm.c

index f5d9fd1f45bf49cdc3db065836f2c7591946ab6b..6fecf3d9fe673460b359d70d80442674a5b9bd60 100644 (file)
@@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
                                   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
 
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
 {
+       for (int i = 0; i < num_entries; i++)
+               ptep[i] = 0;
 
-       *ptep = 0;
-
-       if (!cfg->coherent_walk)
-               __arm_lpae_sync_pte(ptep, 1, cfg);
+       if (!cfg->coherent_walk && num_entries)
+               __arm_lpae_sync_pte(ptep, num_entries, cfg);
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -654,26 +654,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
                max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
                num_entries = min_t(int, pgcount, max_entries);
 
-               while (i < num_entries) {
-                       pte = READ_ONCE(*ptep);
+               /* Find and handle non-leaf entries */
+               for (i = 0; i < num_entries; i++) {
+                       pte = READ_ONCE(ptep[i]);
                        if (WARN_ON(!pte))
                                break;
 
-                       __arm_lpae_clear_pte(ptep, &iop->cfg);
-
                        if (!iopte_leaf(pte, lvl, iop->fmt)) {
+                               __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
+
                                /* Also flush any partial walks */
                                io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
                                                          ARM_LPAE_GRANULE(data));
                                __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
-                       } else if (!iommu_iotlb_gather_queued(gather)) {
-                               io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
                        }
-
-                       ptep++;
-                       i++;
                }
 
+               /* Clear the remaining entries */
+               __arm_lpae_clear_pte(ptep, &iop->cfg, i);
+
+               if (gather && !iommu_iotlb_gather_queued(gather))
+                       for (int j = 0; j < i; j++)
+                               io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
+
                return i * size;
        } else if (iopte_leaf(pte, lvl, iop->fmt)) {
                /*