s390/pci: Use dma-iommu layer
author Niklas Schnelle <schnelle@linux.ibm.com>
Thu, 28 Sep 2023 14:31:37 +0000 (16:31 +0200)
committer Joerg Roedel <jroedel@suse.de>
Mon, 2 Oct 2023 06:43:00 +0000 (08:43 +0200)
While s390 already has a standard IOMMU driver and previous changes have
added I/O TLB flushing operations, this driver is currently only used for
user-space PCI access such as vfio-pci. For the DMA API, s390 instead
utilizes its own implementation in arch/s390/pci/pci_dma.c which drives
the same hardware and shares some code, but requires a complex and
fragile handover between DMA API and IOMMU API use of a device and,
despite the code sharing, still leads to significant duplication and
maintenance effort. Let's utilize the common DMA API implementation from
drivers/iommu/dma-iommu.c instead, allowing us to get rid of
arch/s390/pci/pci_dma.c.
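
From a driver's point of view the switch is transparent; a minimal sketch
(illustrative only, buf/len are placeholders) of unchanged generic DMA API
usage that now resolves to the common dma-iommu ops installed by
iommu_setup_dma_ops() in s390_iommu_probe_finalize(), instead of the
removed s390_pci_dma_ops:

        /* Same driver code before and after this patch. */
        dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
                                           DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, handle))
                return -ENOMEM;
        /* ... let the device DMA ... */
        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);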

Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230928-dma_iommu-v13-3-9e5fc4dacc36@linux.ibm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
13 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/s390/include/asm/pci.h
arch/s390/include/asm/pci_clp.h
arch/s390/include/asm/pci_dma.h
arch/s390/pci/Makefile
arch/s390/pci/pci.c
arch/s390/pci/pci_bus.c
arch/s390/pci/pci_debug.c
arch/s390/pci/pci_dma.c [deleted file]
arch/s390/pci/pci_event.c
arch/s390/pci/pci_sysfs.c
drivers/iommu/Kconfig
drivers/iommu/s390-iommu.c

index 0a1731a0f0ef373421c0594886c065461ade1064..14f56c448edc433c7d1dbb9e554dd2ad9e1987eb 100644 (file)
                          forcing Dual Address Cycle for PCI cards supporting
                          greater than 32-bit addressing.
 
-       iommu.strict=   [ARM64, X86] Configure TLB invalidation behaviour
+       iommu.strict=   [ARM64, X86, S390] Configure TLB invalidation behaviour
                        Format: { "0" | "1" }
                        0 - Lazy mode.
                          Request that DMA unmap operations use deferred
        s390_iommu=     [HW,S390]
                        Set s390 IOTLB flushing mode
                strict
-                       With strict flushing every unmap operation will result in
-                       an IOTLB flush. Default is lazy flushing before reuse,
-                       which is faster.
+                       With strict flushing every unmap operation will result
+                       in an IOTLB flush. Default is lazy flushing before
+                       reuse, which is faster. Deprecated, equivalent to
+                       iommu.strict=1.
 
        s390_iommu_aperture=    [KNL,S390]
                        Specifies the size of the per device DMA address space
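
Example command lines for the entries above (illustrative, not part of the
patch); the deprecated s390-specific spelling and its generic replacement
select the same strict IOTLB flushing behaviour:

        s390_iommu=strict       # deprecated, now emits a warning
        iommu.strict=1          # preferred generic form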
index b248694e00247b57b5be3338535038e9910335d1..3f74f1cf37dfda0ef2a3c1efbbdbd222c2485366 100644 (file)
@@ -159,13 +159,6 @@ struct zpci_dev {
        unsigned long   *dma_table;
        int             tlb_refresh;
 
-       spinlock_t      iommu_bitmap_lock;
-       unsigned long   *iommu_bitmap;
-       unsigned long   *lazy_bitmap;
-       unsigned long   iommu_size;
-       unsigned long   iommu_pages;
-       unsigned int    next_bit;
-
        struct iommu_device iommu_dev;  /* IOMMU core handle */
 
        char res_name[16];
index d6189ed14f84874ac1f135d1480377d64f6f7383..f0c677ddd270606df61e7fd6ccd9b6c17f89f6b9 100644 (file)
@@ -50,6 +50,9 @@ struct clp_fh_list_entry {
 #define CLP_UTIL_STR_LEN       64
 #define CLP_PFIP_NR_SEGMENTS   4
 
+/* PCI function type numbers */
+#define PCI_FUNC_TYPE_ISM      0x5     /* ISM device */
+
 extern bool zpci_unique_uid;
 
 struct clp_rsp_slpc_pci {
index 7119c04c51c5c864677de1ed928b33a8b74b6d74..42d7cc4262ca48d1368cc31ab804f07bf558a7b7 100644 (file)
@@ -82,117 +82,16 @@ enum zpci_ioat_dtype {
 #define ZPCI_TABLE_VALID_MASK          0x20
 #define ZPCI_TABLE_PROT_MASK           0x200
 
-static inline unsigned int calc_rtx(dma_addr_t ptr)
-{
-       return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
-}
-
-static inline unsigned int calc_sx(dma_addr_t ptr)
-{
-       return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK;
-}
-
-static inline unsigned int calc_px(dma_addr_t ptr)
-{
-       return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK;
-}
-
-static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa)
-{
-       *entry &= ZPCI_PTE_FLAG_MASK;
-       *entry |= (pfaa & ZPCI_PTE_ADDR_MASK);
-}
-
-static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto)
-{
-       *entry &= ZPCI_RTE_FLAG_MASK;
-       *entry |= (sto & ZPCI_RTE_ADDR_MASK);
-       *entry |= ZPCI_TABLE_TYPE_RTX;
-}
-
-static inline void set_st_pto(unsigned long *entry, phys_addr_t pto)
-{
-       *entry &= ZPCI_STE_FLAG_MASK;
-       *entry |= (pto & ZPCI_STE_ADDR_MASK);
-       *entry |= ZPCI_TABLE_TYPE_SX;
-}
-
-static inline void validate_rt_entry(unsigned long *entry)
-{
-       *entry &= ~ZPCI_TABLE_VALID_MASK;
-       *entry &= ~ZPCI_TABLE_OFFSET_MASK;
-       *entry |= ZPCI_TABLE_VALID;
-       *entry |= ZPCI_TABLE_LEN_RTX;
-}
-
-static inline void validate_st_entry(unsigned long *entry)
-{
-       *entry &= ~ZPCI_TABLE_VALID_MASK;
-       *entry |= ZPCI_TABLE_VALID;
-}
-
-static inline void invalidate_pt_entry(unsigned long *entry)
-{
-       WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID);
-       *entry &= ~ZPCI_PTE_VALID_MASK;
-       *entry |= ZPCI_PTE_INVALID;
-}
-
-static inline void validate_pt_entry(unsigned long *entry)
-{
-       WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID);
-       *entry &= ~ZPCI_PTE_VALID_MASK;
-       *entry |= ZPCI_PTE_VALID;
-}
-
-static inline void entry_set_protected(unsigned long *entry)
-{
-       *entry &= ~ZPCI_TABLE_PROT_MASK;
-       *entry |= ZPCI_TABLE_PROTECTED;
-}
-
-static inline void entry_clr_protected(unsigned long *entry)
-{
-       *entry &= ~ZPCI_TABLE_PROT_MASK;
-       *entry |= ZPCI_TABLE_UNPROTECTED;
-}
-
-static inline int reg_entry_isvalid(unsigned long entry)
-{
-       return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID;
-}
-
-static inline int pt_entry_isvalid(unsigned long entry)
-{
-       return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
-}
-
-static inline unsigned long *get_rt_sto(unsigned long entry)
-{
-       if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
-               return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
-       else
-               return NULL;
-
-}
-
-static inline unsigned long *get_st_pto(unsigned long entry)
-{
-       if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX)
-               return phys_to_virt(entry & ZPCI_STE_ADDR_MASK);
-       else
-               return NULL;
-}
-
-/* Prototypes */
-void dma_free_seg_table(unsigned long);
-unsigned long *dma_alloc_cpu_table(gfp_t gfp);
-void dma_cleanup_tables(unsigned long *);
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr,
-                                 gfp_t gfp);
-void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags);
-
-extern const struct dma_map_ops s390_pci_dma_ops;
+struct zpci_iommu_ctrs {
+       atomic64_t              mapped_pages;
+       atomic64_t              unmapped_pages;
+       atomic64_t              global_rpcits;
+       atomic64_t              sync_map_rpcits;
+       atomic64_t              sync_rpcits;
+};
+
+struct zpci_dev;
 
+struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev);
 
 #endif
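
A possible caller pattern for the new accessor (a sketch mirroring the
pci_debug.c hunk below; the counters live in the device's s390_domain, so
zpci_get_iommu_ctrs() returns NULL until the device is attached and the
check is mandatory):

        struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(zdev);

        if (ctrs)
                seq_printf(m, "Mapped pages:\t%llu\n",
                           atomic64_read(&ctrs->mapped_pages));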
index 5ae31ca9dd441d6180b13e624c0adaca4e49fc23..0547a10406e72a1a0745a842228130bc0710f1a0 100644 (file)
@@ -3,7 +3,7 @@
 # Makefile for the s390 PCI subsystem.
 #
 
-obj-$(CONFIG_PCI)      += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \
+obj-$(CONFIG_PCI)      += pci.o pci_irq.o pci_clp.o pci_sysfs.o \
                           pci_event.o pci_debug.o pci_insn.o pci_mmio.o \
                           pci_bus.o pci_kvm_hook.o
 obj-$(CONFIG_PCI_IOV)  += pci_iov.o
index d34d5813d00660c1776595e13e8a7df977c9eaf7..563cb72d9ed0f4a5cc9ef4fcd03e2fc8f4555316 100644 (file)
@@ -124,7 +124,11 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
 
        WARN_ON_ONCE(iota & 0x3fff);
        fib.pba = base;
-       fib.pal = limit;
+       /* Work around off by one in ISM virt device */
+       if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base)
+               fib.pal = limit + (1 << 12);
+       else
+               fib.pal = limit;
        fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
        fib.gd = zdev->gisa;
        cc = zpci_mod_fc(req, &fib, status);
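
Worked example for the ISM quirk above (illustrative numbers, assuming the
usual 4K pages): for an ISM function whose reported limit reads one page
short, say base = 0x10000000 and limit = 0x1fffefff, the workaround yields

        fib.pal = 0x1fffefff + (1 << 12) = 0x1fffffff

so the PCI address limit again covers the aperture's real last page, while
all other function types keep fib.pal = limit.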
@@ -582,7 +586,6 @@ int pcibios_device_add(struct pci_dev *pdev)
                pdev->no_vf_scan = 1;
 
        pdev->dev.groups = zpci_attr_groups;
-       pdev->dev.dma_ops = &s390_pci_dma_ops;
        zpci_map_resources(pdev);
 
        for (i = 0; i < PCI_STD_NUM_BARS; i++) {
@@ -756,8 +759,6 @@ int zpci_hot_reset_device(struct zpci_dev *zdev)
        if (zdev->dma_table)
                rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
                                        virt_to_phys(zdev->dma_table), &status);
-       else
-               rc = zpci_dma_init_device(zdev);
        if (rc) {
                zpci_disable_device(zdev);
                return rc;
@@ -865,11 +866,6 @@ int zpci_deconfigure_device(struct zpci_dev *zdev)
        if (zdev->zbus->bus)
                zpci_bus_remove_device(zdev, false);
 
-       if (zdev->dma_table) {
-               rc = zpci_dma_exit_device(zdev);
-               if (rc)
-                       return rc;
-       }
        if (zdev_enabled(zdev)) {
                rc = zpci_disable_device(zdev);
                if (rc)
@@ -918,8 +914,6 @@ void zpci_release_device(struct kref *kref)
        if (zdev->zbus->bus)
                zpci_bus_remove_device(zdev, false);
 
-       if (zdev->dma_table)
-               zpci_dma_exit_device(zdev);
        if (zdev_enabled(zdev))
                zpci_disable_device(zdev);
 
@@ -1109,10 +1103,6 @@ static int __init pci_base_init(void)
        if (rc)
                goto out_irq;
 
-       rc = zpci_dma_init();
-       if (rc)
-               goto out_dma;
-
        rc = clp_scan_pci_devices();
        if (rc)
                goto out_find;
@@ -1122,8 +1112,6 @@ static int __init pci_base_init(void)
        return 0;
 
 out_find:
-       zpci_dma_exit();
-out_dma:
        zpci_irq_exit();
 out_irq:
        zpci_mem_exit();
index 32245b970a0cf1d72ace477e750f5c340d708943..daa5d7450c7d383b254d225d022e23a14c105e41 100644 (file)
@@ -47,11 +47,6 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
                rc = zpci_enable_device(zdev);
                if (rc)
                        return rc;
-               rc = zpci_dma_init_device(zdev);
-               if (rc) {
-                       zpci_disable_device(zdev);
-                       return rc;
-               }
        }
 
        if (!zdev->has_resources) {
index ca6bd98eec136e9a920a250729108730ec652ef5..6dde2263c79d1f57274e016e5867fe3a09ed473c 100644 (file)
@@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = {
 };
 
 static char *pci_sw_names[] = {
-       "Allocated pages",
        "Mapped pages",
        "Unmapped pages",
+       "Global RPCITs",
+       "Sync Map RPCITs",
+       "Sync RPCITs",
 };
 
 static void pci_fmb_show(struct seq_file *m, char *name[], int length,
@@ -69,10 +71,14 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length,
 
 static void pci_sw_counter_show(struct seq_file *m)
 {
-       struct zpci_dev *zdev = m->private;
-       atomic64_t *counter = &zdev->allocated_pages;
+       struct zpci_iommu_ctrs  *ctrs = zpci_get_iommu_ctrs(m->private);
+       atomic64_t *counter;
        int i;
 
+       if (!ctrs)
+               return;
+
+       counter = &ctrs->mapped_pages;
        for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++)
                seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i],
                           atomic64_read(counter));
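
The loop above walks a bare atomic64_t pointer across struct
zpci_iommu_ctrs, so pci_sw_names[] and the struct layout must stay in
lock-step. A build-time guard in this spirit would catch a mismatch (a
sketch, not part of the patch):

        static_assert(sizeof(struct zpci_iommu_ctrs) ==
                      5 * sizeof(atomic64_t)); /* 5 == ARRAY_SIZE(pci_sw_names) */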
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
deleted file mode 100644 (file)
index 2d9b01d..0000000
+++ /dev/null
@@ -1,735 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright IBM Corp. 2012
- *
- * Author(s):
- *   Jan Glauber <jang@linux.vnet.ibm.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/iommu-helper.h>
-#include <linux/dma-map-ops.h>
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <asm/pci_dma.h>
-
-static struct kmem_cache *dma_region_table_cache;
-static struct kmem_cache *dma_page_table_cache;
-static int s390_iommu_strict;
-static u64 s390_iommu_aperture;
-static u32 s390_iommu_aperture_factor = 1;
-
-static int zpci_refresh_global(struct zpci_dev *zdev)
-{
-       return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma,
-                                 zdev->iommu_pages * PAGE_SIZE);
-}
-
-unsigned long *dma_alloc_cpu_table(gfp_t gfp)
-{
-       unsigned long *table, *entry;
-
-       table = kmem_cache_alloc(dma_region_table_cache, gfp);
-       if (!table)
-               return NULL;
-
-       for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++)
-               *entry = ZPCI_TABLE_INVALID;
-       return table;
-}
-
-static void dma_free_cpu_table(void *table)
-{
-       kmem_cache_free(dma_region_table_cache, table);
-}
-
-static unsigned long *dma_alloc_page_table(gfp_t gfp)
-{
-       unsigned long *table, *entry;
-
-       table = kmem_cache_alloc(dma_page_table_cache, gfp);
-       if (!table)
-               return NULL;
-
-       for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++)
-               *entry = ZPCI_PTE_INVALID;
-       return table;
-}
-
-static void dma_free_page_table(void *table)
-{
-       kmem_cache_free(dma_page_table_cache, table);
-}
-
-static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
-{
-       unsigned long old_rte, rte;
-       unsigned long *sto;
-
-       rte = READ_ONCE(*rtep);
-       if (reg_entry_isvalid(rte)) {
-               sto = get_rt_sto(rte);
-       } else {
-               sto = dma_alloc_cpu_table(gfp);
-               if (!sto)
-                       return NULL;
-
-               set_rt_sto(&rte, virt_to_phys(sto));
-               validate_rt_entry(&rte);
-               entry_clr_protected(&rte);
-
-               old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte);
-               if (old_rte != ZPCI_TABLE_INVALID) {
-                       /* Someone else was faster, use theirs */
-                       dma_free_cpu_table(sto);
-                       sto = get_rt_sto(old_rte);
-               }
-       }
-       return sto;
-}
-
-static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
-{
-       unsigned long old_ste, ste;
-       unsigned long *pto;
-
-       ste = READ_ONCE(*step);
-       if (reg_entry_isvalid(ste)) {
-               pto = get_st_pto(ste);
-       } else {
-               pto = dma_alloc_page_table(gfp);
-               if (!pto)
-                       return NULL;
-               set_st_pto(&ste, virt_to_phys(pto));
-               validate_st_entry(&ste);
-               entry_clr_protected(&ste);
-
-               old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste);
-               if (old_ste != ZPCI_TABLE_INVALID) {
-                       /* Someone else was faster, use theirs */
-                       dma_free_page_table(pto);
-                       pto = get_st_pto(old_ste);
-               }
-       }
-       return pto;
-}
-
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr,
-                                 gfp_t gfp)
-{
-       unsigned long *sto, *pto;
-       unsigned int rtx, sx, px;
-
-       rtx = calc_rtx(dma_addr);
-       sto = dma_get_seg_table_origin(&rto[rtx], gfp);
-       if (!sto)
-               return NULL;
-
-       sx = calc_sx(dma_addr);
-       pto = dma_get_page_table_origin(&sto[sx], gfp);
-       if (!pto)
-               return NULL;
-
-       px = calc_px(dma_addr);
-       return &pto[px];
-}
-
-void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags)
-{
-       unsigned long pte;
-
-       pte = READ_ONCE(*ptep);
-       if (flags & ZPCI_PTE_INVALID) {
-               invalidate_pt_entry(&pte);
-       } else {
-               set_pt_pfaa(&pte, page_addr);
-               validate_pt_entry(&pte);
-       }
-
-       if (flags & ZPCI_TABLE_PROTECTED)
-               entry_set_protected(&pte);
-       else
-               entry_clr_protected(&pte);
-
-       xchg(ptep, pte);
-}
-
-static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa,
-                             dma_addr_t dma_addr, size_t size, int flags)
-{
-       unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-       phys_addr_t page_addr = (pa & PAGE_MASK);
-       unsigned long *entry;
-       int i, rc = 0;
-
-       if (!nr_pages)
-               return -EINVAL;
-
-       if (!zdev->dma_table)
-               return -EINVAL;
-
-       for (i = 0; i < nr_pages; i++) {
-               entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr,
-                                          GFP_ATOMIC);
-               if (!entry) {
-                       rc = -ENOMEM;
-                       goto undo_cpu_trans;
-               }
-               dma_update_cpu_trans(entry, page_addr, flags);
-               page_addr += PAGE_SIZE;
-               dma_addr += PAGE_SIZE;
-       }
-
-undo_cpu_trans:
-       if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) {
-               flags = ZPCI_PTE_INVALID;
-               while (i-- > 0) {
-                       page_addr -= PAGE_SIZE;
-                       dma_addr -= PAGE_SIZE;
-                       entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr,
-                                                  GFP_ATOMIC);
-                       if (!entry)
-                               break;
-                       dma_update_cpu_trans(entry, page_addr, flags);
-               }
-       }
-       return rc;
-}
-
-static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
-                          size_t size, int flags)
-{
-       unsigned long irqflags;
-       int ret;
-
-       /*
-        * With zdev->tlb_refresh == 0, rpcit is not required to establish new
-        * translations when previously invalid translation-table entries are
-        * validated. With lazy unmap, rpcit is skipped for previously valid
-        * entries, but a global rpcit is then required before any address can
-        * be re-used, i.e. after each iommu bitmap wrap-around.
-        */
-       if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) {
-               if (!zdev->tlb_refresh)
-                       return 0;
-       } else {
-               if (!s390_iommu_strict)
-                       return 0;
-       }
-
-       ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
-                                PAGE_ALIGN(size));
-       if (ret == -ENOMEM && !s390_iommu_strict) {
-               /* enable the hypervisor to free some resources */
-               if (zpci_refresh_global(zdev))
-                       goto out;
-
-               spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags);
-               bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
-                             zdev->lazy_bitmap, zdev->iommu_pages);
-               bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
-               spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags);
-               ret = 0;
-       }
-out:
-       return ret;
-}
-
-static int dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa,
-                           dma_addr_t dma_addr, size_t size, int flags)
-{
-       int rc;
-
-       rc = __dma_update_trans(zdev, pa, dma_addr, size, flags);
-       if (rc)
-               return rc;
-
-       rc = __dma_purge_tlb(zdev, dma_addr, size, flags);
-       if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))
-               __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID);
-
-       return rc;
-}
-
-void dma_free_seg_table(unsigned long entry)
-{
-       unsigned long *sto = get_rt_sto(entry);
-       int sx;
-
-       for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++)
-               if (reg_entry_isvalid(sto[sx]))
-                       dma_free_page_table(get_st_pto(sto[sx]));
-
-       dma_free_cpu_table(sto);
-}
-
-void dma_cleanup_tables(unsigned long *table)
-{
-       int rtx;
-
-       if (!table)
-               return;
-
-       for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
-               if (reg_entry_isvalid(table[rtx]))
-                       dma_free_seg_table(table[rtx]);
-
-       dma_free_cpu_table(table);
-}
-
-static unsigned long __dma_alloc_iommu(struct device *dev,
-                                      unsigned long start, int size)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-
-       return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages,
-                               start, size, zdev->start_dma >> PAGE_SHIFT,
-                               dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT),
-                               0);
-}
-
-static dma_addr_t dma_alloc_address(struct device *dev, int size)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       unsigned long offset, flags;
-
-       spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
-       offset = __dma_alloc_iommu(dev, zdev->next_bit, size);
-       if (offset == -1) {
-               if (!s390_iommu_strict) {
-                       /* global flush before DMA addresses are reused */
-                       if (zpci_refresh_global(zdev))
-                               goto out_error;
-
-                       bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
-                                     zdev->lazy_bitmap, zdev->iommu_pages);
-                       bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
-               }
-               /* wrap-around */
-               offset = __dma_alloc_iommu(dev, 0, size);
-               if (offset == -1)
-                       goto out_error;
-       }
-       zdev->next_bit = offset + size;
-       spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
-
-       return zdev->start_dma + offset * PAGE_SIZE;
-
-out_error:
-       spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
-       return DMA_MAPPING_ERROR;
-}
-
-static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       unsigned long flags, offset;
-
-       offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT;
-
-       spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
-       if (!zdev->iommu_bitmap)
-               goto out;
-
-       if (s390_iommu_strict)
-               bitmap_clear(zdev->iommu_bitmap, offset, size);
-       else
-               bitmap_set(zdev->lazy_bitmap, offset, size);
-
-out:
-       spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
-}
-
-static inline void zpci_err_dma(unsigned long rc, unsigned long addr)
-{
-       struct {
-               unsigned long rc;
-               unsigned long addr;
-       } __packed data = {rc, addr};
-
-       zpci_err_hex(&data, sizeof(data));
-}
-
-static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
-                                    unsigned long offset, size_t size,
-                                    enum dma_data_direction direction,
-                                    unsigned long attrs)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       unsigned long pa = page_to_phys(page) + offset;
-       int flags = ZPCI_PTE_VALID;
-       unsigned long nr_pages;
-       dma_addr_t dma_addr;
-       int ret;
-
-       /* This rounds up number of pages based on size and offset */
-       nr_pages = iommu_num_pages(pa, size, PAGE_SIZE);
-       dma_addr = dma_alloc_address(dev, nr_pages);
-       if (dma_addr == DMA_MAPPING_ERROR) {
-               ret = -ENOSPC;
-               goto out_err;
-       }
-
-       /* Use rounded up size */
-       size = nr_pages * PAGE_SIZE;
-
-       if (direction == DMA_NONE || direction == DMA_TO_DEVICE)
-               flags |= ZPCI_TABLE_PROTECTED;
-
-       ret = dma_update_trans(zdev, pa, dma_addr, size, flags);
-       if (ret)
-               goto out_free;
-
-       atomic64_add(nr_pages, &zdev->mapped_pages);
-       return dma_addr + (offset & ~PAGE_MASK);
-
-out_free:
-       dma_free_address(dev, dma_addr, nr_pages);
-out_err:
-       zpci_err("map error:\n");
-       zpci_err_dma(ret, pa);
-       return DMA_MAPPING_ERROR;
-}
-
-static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
-                                size_t size, enum dma_data_direction direction,
-                                unsigned long attrs)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       int npages, ret;
-
-       npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
-       dma_addr = dma_addr & PAGE_MASK;
-       ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE,
-                              ZPCI_PTE_INVALID);
-       if (ret) {
-               zpci_err("unmap error:\n");
-               zpci_err_dma(ret, dma_addr);
-               return;
-       }
-
-       atomic64_add(npages, &zdev->unmapped_pages);
-       dma_free_address(dev, dma_addr, npages);
-}
-
-static void *s390_dma_alloc(struct device *dev, size_t size,
-                           dma_addr_t *dma_handle, gfp_t flag,
-                           unsigned long attrs)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       struct page *page;
-       phys_addr_t pa;
-       dma_addr_t map;
-
-       size = PAGE_ALIGN(size);
-       page = alloc_pages(flag | __GFP_ZERO, get_order(size));
-       if (!page)
-               return NULL;
-
-       pa = page_to_phys(page);
-       map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0);
-       if (dma_mapping_error(dev, map)) {
-               __free_pages(page, get_order(size));
-               return NULL;
-       }
-
-       atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages);
-       if (dma_handle)
-               *dma_handle = map;
-       return phys_to_virt(pa);
-}
-
-static void s390_dma_free(struct device *dev, size_t size,
-                         void *vaddr, dma_addr_t dma_handle,
-                         unsigned long attrs)
-{
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-
-       size = PAGE_ALIGN(size);
-       atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages);
-       s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0);
-       free_pages((unsigned long)vaddr, get_order(size));
-}
-
-/* Map a segment into a contiguous dma address area */
-static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
-                            size_t size, dma_addr_t *handle,
-                            enum dma_data_direction dir)
-{
-       unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-       struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-       dma_addr_t dma_addr_base, dma_addr;
-       int flags = ZPCI_PTE_VALID;
-       struct scatterlist *s;
-       phys_addr_t pa = 0;
-       int ret;
-
-       dma_addr_base = dma_alloc_address(dev, nr_pages);
-       if (dma_addr_base == DMA_MAPPING_ERROR)
-               return -ENOMEM;
-
-       dma_addr = dma_addr_base;
-       if (dir == DMA_NONE || dir == DMA_TO_DEVICE)
-               flags |= ZPCI_TABLE_PROTECTED;
-
-       for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) {
-               pa = page_to_phys(sg_page(s));
-               ret = __dma_update_trans(zdev, pa, dma_addr,
-                                        s->offset + s->length, flags);
-               if (ret)
-                       goto unmap;
-
-               dma_addr += s->offset + s->length;
-       }
-       ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags);
-       if (ret)
-               goto unmap;
-
-       *handle = dma_addr_base;
-       atomic64_add(nr_pages, &zdev->mapped_pages);
-
-       return ret;
-
-unmap:
-       dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base,
-                        ZPCI_PTE_INVALID);
-       dma_free_address(dev, dma_addr_base, nr_pages);
-       zpci_err("map error:\n");
-       zpci_err_dma(ret, pa);
-       return ret;
-}
-
-static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
-                          int nr_elements, enum dma_data_direction dir,
-                          unsigned long attrs)
-{
-       struct scatterlist *s = sg, *start = sg, *dma = sg;
-       unsigned int max = dma_get_max_seg_size(dev);
-       unsigned int size = s->offset + s->length;
-       unsigned int offset = s->offset;
-       int count = 0, i, ret;
-
-       for (i = 1; i < nr_elements; i++) {
-               s = sg_next(s);
-
-               s->dma_length = 0;
-
-               if (s->offset || (size & ~PAGE_MASK) ||
-                   size + s->length > max) {
-                       ret = __s390_dma_map_sg(dev, start, size,
-                                               &dma->dma_address, dir);
-                       if (ret)
-                               goto unmap;
-
-                       dma->dma_address += offset;
-                       dma->dma_length = size - offset;
-
-                       size = offset = s->offset;
-                       start = s;
-                       dma = sg_next(dma);
-                       count++;
-               }
-               size += s->length;
-       }
-       ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir);
-       if (ret)
-               goto unmap;
-
-       dma->dma_address += offset;
-       dma->dma_length = size - offset;
-
-       return count + 1;
-unmap:
-       for_each_sg(sg, s, count, i)
-               s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
-                                    dir, attrs);
-
-       return ret;
-}
-
-static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-                             int nr_elements, enum dma_data_direction dir,
-                             unsigned long attrs)
-{
-       struct scatterlist *s;
-       int i;
-
-       for_each_sg(sg, s, nr_elements, i) {
-               if (s->dma_length)
-                       s390_dma_unmap_pages(dev, s->dma_address, s->dma_length,
-                                            dir, attrs);
-               s->dma_address = 0;
-               s->dma_length = 0;
-       }
-}
-       
-int zpci_dma_init_device(struct zpci_dev *zdev)
-{
-       u8 status;
-       int rc;
-
-       /*
-        * At this point, if the device is part of an IOMMU domain, this would
-        * be a strong hint towards a bug in the IOMMU API (common) code and/or
-        * simultaneous access via IOMMU and DMA API. So let's issue a warning.
-        */
-       WARN_ON(zdev->s390_domain);
-
-       spin_lock_init(&zdev->iommu_bitmap_lock);
-
-       zdev->dma_table = dma_alloc_cpu_table(GFP_KERNEL);
-       if (!zdev->dma_table) {
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       /*
-        * Restrict the iommu bitmap size to the minimum of the following:
-        * - s390_iommu_aperture which defaults to high_memory
-        * - 3-level pagetable address limit minus start_dma offset
-        * - DMA address range allowed by the hardware (clp query pci fn)
-        *
-        * Also set zdev->end_dma to the actual end address of the usable
-        * range, instead of the theoretical maximum as reported by hardware.
-        *
-        * This limits the number of concurrently usable DMA mappings since
-        * for each DMA mapped memory address we need a DMA address including
-        * extra DMA addresses for multiple mappings of the same memory address.
-        */
-       zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
-       zdev->iommu_size = min3(s390_iommu_aperture,
-                               ZPCI_TABLE_SIZE_RT - zdev->start_dma,
-                               zdev->end_dma - zdev->start_dma + 1);
-       zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1;
-       zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT;
-       zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8);
-       if (!zdev->iommu_bitmap) {
-               rc = -ENOMEM;
-               goto free_dma_table;
-       }
-       if (!s390_iommu_strict) {
-               zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8);
-               if (!zdev->lazy_bitmap) {
-                       rc = -ENOMEM;
-                       goto free_bitmap;
-               }
-
-       }
-       if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
-                              virt_to_phys(zdev->dma_table), &status)) {
-               rc = -EIO;
-               goto free_bitmap;
-       }
-
-       return 0;
-free_bitmap:
-       vfree(zdev->iommu_bitmap);
-       zdev->iommu_bitmap = NULL;
-       vfree(zdev->lazy_bitmap);
-       zdev->lazy_bitmap = NULL;
-free_dma_table:
-       dma_free_cpu_table(zdev->dma_table);
-       zdev->dma_table = NULL;
-out:
-       return rc;
-}
-
-int zpci_dma_exit_device(struct zpci_dev *zdev)
-{
-       int cc = 0;
-
-       /*
-        * At this point, if the device is part of an IOMMU domain, this would
-        * be a strong hint towards a bug in the IOMMU API (common) code and/or
-        * simultaneous access via IOMMU and DMA API. So let's issue a warning.
-        */
-       WARN_ON(zdev->s390_domain);
-       if (zdev_enabled(zdev))
-               cc = zpci_unregister_ioat(zdev, 0);
-       /*
-        * cc == 3 indicates the function is gone already. This can happen
-        * if the function was deconfigured/disabled suddenly and we have not
-        * received a new handle yet.
-        */
-       if (cc && cc != 3)
-               return -EIO;
-
-       dma_cleanup_tables(zdev->dma_table);
-       zdev->dma_table = NULL;
-       vfree(zdev->iommu_bitmap);
-       zdev->iommu_bitmap = NULL;
-       vfree(zdev->lazy_bitmap);
-       zdev->lazy_bitmap = NULL;
-       zdev->next_bit = 0;
-       return 0;
-}
-
-static int __init dma_alloc_cpu_table_caches(void)
-{
-       dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables",
-                                       ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN,
-                                       0, NULL);
-       if (!dma_region_table_cache)
-               return -ENOMEM;
-
-       dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables",
-                                       ZPCI_PT_SIZE, ZPCI_PT_ALIGN,
-                                       0, NULL);
-       if (!dma_page_table_cache) {
-               kmem_cache_destroy(dma_region_table_cache);
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-int __init zpci_dma_init(void)
-{
-       s390_iommu_aperture = (u64)virt_to_phys(high_memory);
-       if (!s390_iommu_aperture_factor)
-               s390_iommu_aperture = ULONG_MAX;
-       else
-               s390_iommu_aperture *= s390_iommu_aperture_factor;
-
-       return dma_alloc_cpu_table_caches();
-}
-
-void zpci_dma_exit(void)
-{
-       kmem_cache_destroy(dma_page_table_cache);
-       kmem_cache_destroy(dma_region_table_cache);
-}
-
-const struct dma_map_ops s390_pci_dma_ops = {
-       .alloc          = s390_dma_alloc,
-       .free           = s390_dma_free,
-       .map_sg         = s390_dma_map_sg,
-       .unmap_sg       = s390_dma_unmap_sg,
-       .map_page       = s390_dma_map_pages,
-       .unmap_page     = s390_dma_unmap_pages,
-       .mmap           = dma_common_mmap,
-       .get_sgtable    = dma_common_get_sgtable,
-       .alloc_pages    = dma_common_alloc_pages,
-       .free_pages     = dma_common_free_pages,
-       /* dma_supported is unconditionally true without a callback */
-};
-EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
-
-static int __init s390_iommu_setup(char *str)
-{
-       if (!strcmp(str, "strict"))
-               s390_iommu_strict = 1;
-       return 1;
-}
-
-__setup("s390_iommu=", s390_iommu_setup);
-
-static int __init s390_iommu_aperture_setup(char *str)
-{
-       if (kstrtou32(str, 10, &s390_iommu_aperture_factor))
-               s390_iommu_aperture_factor = 1;
-       return 1;
-}
-
-__setup("s390_iommu_aperture=", s390_iommu_aperture_setup);
index 4ef5a6a1d6187d1261dd70c5bb99fa7eaa629877..4d9773ef9e0a856e8a21b1ca46174e653daa6360 100644 (file)
@@ -313,8 +313,6 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
        /* Even though the device is already gone we still
         * need to free zPCI resources as part of the disable.
         */
-       if (zdev->dma_table)
-               zpci_dma_exit_device(zdev);
        if (zdev_enabled(zdev))
                zpci_disable_device(zdev);
        zdev->state = ZPCI_FN_STATE_STANDBY;
index cae280e5c047d1d5eaa405c86b5e8444350961c1..8a7abac5181645d6635ed95e1f5706942948c642 100644 (file)
@@ -56,6 +56,7 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
        struct pci_dev *pdev = to_pci_dev(dev);
        struct zpci_dev *zdev = to_zpci(pdev);
        int ret = 0;
+       u8 status;
 
        /* Can't use device_remove_self() here as that would lead us to lock
         * the pci_rescan_remove_lock while holding the device' kernfs lock.
@@ -82,12 +83,6 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
        pci_lock_rescan_remove();
        if (pci_dev_is_added(pdev)) {
                pci_stop_and_remove_bus_device(pdev);
-               if (zdev->dma_table) {
-                       ret = zpci_dma_exit_device(zdev);
-                       if (ret)
-                               goto out;
-               }
-
                if (zdev_enabled(zdev)) {
                        ret = zpci_disable_device(zdev);
                        /*
@@ -105,14 +100,16 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
                ret = zpci_enable_device(zdev);
                if (ret)
                        goto out;
-               ret = zpci_dma_init_device(zdev);
-               if (ret) {
-                       zpci_disable_device(zdev);
-                       goto out;
+
+               if (zdev->dma_table) {
+                       ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+                                                virt_to_phys(zdev->dma_table), &status);
+                       if (ret)
+                               zpci_disable_device(zdev);
                }
-               pci_rescan_bus(zdev->zbus->bus);
        }
 out:
+       pci_rescan_bus(zdev->zbus->bus);
        pci_unlock_rescan_remove();
        if (kn)
                sysfs_unbreak_active_protection(kn);
index cd6727898b1175b7e5c8f2f9337d5e90cbec7229..3199fd54b462c9f74da83ddefce63405e225fc3b 100644 (file)
@@ -91,7 +91,7 @@ config IOMMU_DEBUGFS
 choice
        prompt "IOMMU default domain type"
        depends on IOMMU_API
-       default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
+       default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64 || S390
        default IOMMU_DEFAULT_DMA_STRICT
        help
          Choose the type of IOMMU domain used to manage DMA API usage by
@@ -146,7 +146,7 @@ config OF_IOMMU
 
 # IOMMU-agnostic DMA-mapping layer
 config IOMMU_DMA
-       def_bool ARM64 || IA64 || X86
+       def_bool ARM64 || IA64 || X86 || S390
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
index 560d0957f9bef345a2064cde13219c07cfd28230..bb9e48c826444304905fb147d13c153070dde622 100644 (file)
 #include <linux/rcupdate.h>
 #include <asm/pci_dma.h>
 
+#include "dma-iommu.h"
+
 static const struct iommu_ops s390_iommu_ops;
 
+static struct kmem_cache *dma_region_table_cache;
+static struct kmem_cache *dma_page_table_cache;
+
+static u64 s390_iommu_aperture;
+static u32 s390_iommu_aperture_factor = 1;
+
 struct s390_domain {
        struct iommu_domain     domain;
        struct list_head        devices;
+       struct zpci_iommu_ctrs  ctrs;
        unsigned long           *dma_table;
        spinlock_t              list_lock;
        struct rcu_head         rcu;
 };
 
+static inline unsigned int calc_rtx(dma_addr_t ptr)
+{
+       return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
+}
+
+static inline unsigned int calc_sx(dma_addr_t ptr)
+{
+       return ((unsigned long)ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK;
+}
+
+static inline unsigned int calc_px(dma_addr_t ptr)
+{
+       return ((unsigned long)ptr >> PAGE_SHIFT) & ZPCI_PT_MASK;
+}
+
+static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa)
+{
+       *entry &= ZPCI_PTE_FLAG_MASK;
+       *entry |= (pfaa & ZPCI_PTE_ADDR_MASK);
+}
+
+static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto)
+{
+       *entry &= ZPCI_RTE_FLAG_MASK;
+       *entry |= (sto & ZPCI_RTE_ADDR_MASK);
+       *entry |= ZPCI_TABLE_TYPE_RTX;
+}
+
+static inline void set_st_pto(unsigned long *entry, phys_addr_t pto)
+{
+       *entry &= ZPCI_STE_FLAG_MASK;
+       *entry |= (pto & ZPCI_STE_ADDR_MASK);
+       *entry |= ZPCI_TABLE_TYPE_SX;
+}
+
+static inline void validate_rt_entry(unsigned long *entry)
+{
+       *entry &= ~ZPCI_TABLE_VALID_MASK;
+       *entry &= ~ZPCI_TABLE_OFFSET_MASK;
+       *entry |= ZPCI_TABLE_VALID;
+       *entry |= ZPCI_TABLE_LEN_RTX;
+}
+
+static inline void validate_st_entry(unsigned long *entry)
+{
+       *entry &= ~ZPCI_TABLE_VALID_MASK;
+       *entry |= ZPCI_TABLE_VALID;
+}
+
+static inline void invalidate_pt_entry(unsigned long *entry)
+{
+       WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID);
+       *entry &= ~ZPCI_PTE_VALID_MASK;
+       *entry |= ZPCI_PTE_INVALID;
+}
+
+static inline void validate_pt_entry(unsigned long *entry)
+{
+       WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID);
+       *entry &= ~ZPCI_PTE_VALID_MASK;
+       *entry |= ZPCI_PTE_VALID;
+}
+
+static inline void entry_set_protected(unsigned long *entry)
+{
+       *entry &= ~ZPCI_TABLE_PROT_MASK;
+       *entry |= ZPCI_TABLE_PROTECTED;
+}
+
+static inline void entry_clr_protected(unsigned long *entry)
+{
+       *entry &= ~ZPCI_TABLE_PROT_MASK;
+       *entry |= ZPCI_TABLE_UNPROTECTED;
+}
+
+static inline int reg_entry_isvalid(unsigned long entry)
+{
+       return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID;
+}
+
+static inline int pt_entry_isvalid(unsigned long entry)
+{
+       return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
+}
+
+static inline unsigned long *get_rt_sto(unsigned long entry)
+{
+       if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
+               return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
+       else
+               return NULL;
+}
+
+static inline unsigned long *get_st_pto(unsigned long entry)
+{
+       if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX)
+               return phys_to_virt(entry & ZPCI_STE_ADDR_MASK);
+       else
+               return NULL;
+}
+
+static int __init dma_alloc_cpu_table_caches(void)
+{
+       dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables",
+                                                  ZPCI_TABLE_SIZE,
+                                                  ZPCI_TABLE_ALIGN,
+                                                  0, NULL);
+       if (!dma_region_table_cache)
+               return -ENOMEM;
+
+       dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables",
+                                                ZPCI_PT_SIZE,
+                                                ZPCI_PT_ALIGN,
+                                                0, NULL);
+       if (!dma_page_table_cache) {
+               kmem_cache_destroy(dma_region_table_cache);
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+static unsigned long *dma_alloc_cpu_table(gfp_t gfp)
+{
+       unsigned long *table, *entry;
+
+       table = kmem_cache_alloc(dma_region_table_cache, gfp);
+       if (!table)
+               return NULL;
+
+       for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++)
+               *entry = ZPCI_TABLE_INVALID;
+       return table;
+}
+
+static void dma_free_cpu_table(void *table)
+{
+       kmem_cache_free(dma_region_table_cache, table);
+}
+
+static void dma_free_page_table(void *table)
+{
+       kmem_cache_free(dma_page_table_cache, table);
+}
+
+static void dma_free_seg_table(unsigned long entry)
+{
+       unsigned long *sto = get_rt_sto(entry);
+       int sx;
+
+       for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++)
+               if (reg_entry_isvalid(sto[sx]))
+                       dma_free_page_table(get_st_pto(sto[sx]));
+
+       dma_free_cpu_table(sto);
+}
+
+static void dma_cleanup_tables(unsigned long *table)
+{
+       int rtx;
+
+       if (!table)
+               return;
+
+       for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+               if (reg_entry_isvalid(table[rtx]))
+                       dma_free_seg_table(table[rtx]);
+
+       dma_free_cpu_table(table);
+}
+
+static unsigned long *dma_alloc_page_table(gfp_t gfp)
+{
+       unsigned long *table, *entry;
+
+       table = kmem_cache_alloc(dma_page_table_cache, gfp);
+       if (!table)
+               return NULL;
+
+       for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++)
+               *entry = ZPCI_PTE_INVALID;
+       return table;
+}
+
+static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
+{
+       unsigned long old_rte, rte;
+       unsigned long *sto;
+
+       rte = READ_ONCE(*rtep);
+       if (reg_entry_isvalid(rte)) {
+               sto = get_rt_sto(rte);
+       } else {
+               sto = dma_alloc_cpu_table(gfp);
+               if (!sto)
+                       return NULL;
+
+               set_rt_sto(&rte, virt_to_phys(sto));
+               validate_rt_entry(&rte);
+               entry_clr_protected(&rte);
+
+               old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte);
+               if (old_rte != ZPCI_TABLE_INVALID) {
+                       /* Someone else was faster, use theirs */
+                       dma_free_cpu_table(sto);
+                       sto = get_rt_sto(old_rte);
+               }
+       }
+       return sto;
+}
+
+static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
+{
+       unsigned long old_ste, ste;
+       unsigned long *pto;
+
+       ste = READ_ONCE(*step);
+       if (reg_entry_isvalid(ste)) {
+               pto = get_st_pto(ste);
+       } else {
+               pto = dma_alloc_page_table(gfp);
+               if (!pto)
+                       return NULL;
+               set_st_pto(&ste, virt_to_phys(pto));
+               validate_st_entry(&ste);
+               entry_clr_protected(&ste);
+
+               old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste);
+               if (old_ste != ZPCI_TABLE_INVALID) {
+                       /* Someone else was faster, use theirs */
+                       dma_free_page_table(pto);
+                       pto = get_st_pto(old_ste);
+               }
+       }
+       return pto;
+}
+
+static unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, gfp_t gfp)
+{
+       unsigned long *sto, *pto;
+       unsigned int rtx, sx, px;
+
+       rtx = calc_rtx(dma_addr);
+       sto = dma_get_seg_table_origin(&rto[rtx], gfp);
+       if (!sto)
+               return NULL;
+
+       sx = calc_sx(dma_addr);
+       pto = dma_get_page_table_origin(&sto[sx], gfp);
+       if (!pto)
+               return NULL;
+
+       px = calc_px(dma_addr);
+       return &pto[px];
+}
+
+static void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags)
+{
+       unsigned long pte;
+
+       pte = READ_ONCE(*ptep);
+       if (flags & ZPCI_PTE_INVALID) {
+               invalidate_pt_entry(&pte);
+       } else {
+               set_pt_pfaa(&pte, page_addr);
+               validate_pt_entry(&pte);
+       }
+
+       if (flags & ZPCI_TABLE_PROTECTED)
+               entry_set_protected(&pte);
+       else
+               entry_clr_protected(&pte);
+
+       xchg(ptep, pte);
+}
+
 static struct s390_domain *to_s390_domain(struct iommu_domain *dom)
 {
        return container_of(dom, struct s390_domain, domain);
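
The calc_rtx()/calc_sx()/calc_px() helpers moved in above split a DMA
address into the three translation-table indices. A worked example,
assuming the usual s390 layout of 2G region-third entries, 1M segments and
4K pages (i.e. ZPCI_RT_SHIFT == 31, ZPCI_ST_SHIFT == 20):

        /* dma_addr = 0x80305000 */
        calc_rtx(dma_addr);     /* 0x80305000 >> 31          = 0x1 */
        calc_sx(dma_addr);      /* (0x80305000 >> 20) & mask = 0x3 */
        calc_px(dma_addr);      /* (0x80305000 >> 12) & mask = 0x5 */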
@@ -34,6 +318,8 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap)
        switch (cap) {
        case IOMMU_CAP_CACHE_COHERENCY:
                return true;
+       case IOMMU_CAP_DEFERRED_FLUSH:
+               return true;
        default:
                return false;
        }
@@ -81,14 +367,13 @@ static void s390_domain_free(struct iommu_domain *domain)
        call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain);
 }
 
-static void __s390_iommu_detach_device(struct zpci_dev *zdev)
+static void s390_iommu_detach_device(struct iommu_domain *domain,
+                                    struct device *dev)
 {
-       struct s390_domain *s390_domain = zdev->s390_domain;
+       struct s390_domain *s390_domain = to_s390_domain(domain);
+       struct zpci_dev *zdev = to_zpci_dev(dev);
        unsigned long flags;
 
-       if (!s390_domain)
-               return;
-
        spin_lock_irqsave(&s390_domain->list_lock, flags);
        list_del_rcu(&zdev->iommu_list);
        spin_unlock_irqrestore(&s390_domain->list_lock, flags);
@@ -115,9 +400,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
                return -EINVAL;
 
        if (zdev->s390_domain)
-               __s390_iommu_detach_device(zdev);
-       else if (zdev->dma_table)
-               zpci_dma_exit_device(zdev);
+               s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
 
        cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
                                virt_to_phys(s390_domain->dma_table), &status);
@@ -127,7 +410,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
         */
        if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
                return -EIO;
-       zdev->dma_table = s390_domain->dma_table;
 
        zdev->dma_table = s390_domain->dma_table;
        zdev->s390_domain = s390_domain;
@@ -139,31 +421,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
        return 0;
 }
 
-/*
- * Switch control over the IOMMU to S390's internal dma_api ops
- */
-static int s390_iommu_platform_attach(struct iommu_domain *platform_domain,
-                                     struct device *dev)
-{
-       struct zpci_dev *zdev = to_zpci_dev(dev);
-
-       if (!zdev->s390_domain)
-               return 0;
-
-       __s390_iommu_detach_device(zdev);
-       zpci_dma_init_device(zdev);
-       return 0;
-}
-
-static struct iommu_domain_ops s390_iommu_platform_ops = {
-       .attach_dev = s390_iommu_platform_attach,
-};
-
-static struct iommu_domain s390_iommu_platform_domain = {
-       .type = IOMMU_DOMAIN_PLATFORM,
-       .ops = &s390_iommu_platform_ops,
-};
-
 static void s390_iommu_get_resv_regions(struct device *dev,
                                        struct list_head *list)
 {
@@ -216,7 +473,7 @@ static void s390_iommu_release_device(struct device *dev)
         * to the device, but keep it attached to other devices in the group.
         */
        if (zdev)
-               __s390_iommu_detach_device(zdev);
+               s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
 }
 
 static int zpci_refresh_all(struct zpci_dev *zdev)
@@ -232,6 +489,7 @@ static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain)
 
        rcu_read_lock();
        list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) {
+               atomic64_inc(&s390_domain->ctrs.global_rpcits);
                zpci_refresh_all(zdev);
        }
        rcu_read_unlock();
@@ -250,6 +508,7 @@ static void s390_iommu_iotlb_sync(struct iommu_domain *domain,
 
        rcu_read_lock();
        list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) {
+               atomic64_inc(&s390_domain->ctrs.sync_rpcits);
                zpci_refresh_trans((u64)zdev->fh << 32, gather->start,
                                   size);
        }
@@ -267,6 +526,7 @@ static int s390_iommu_iotlb_sync_map(struct iommu_domain *domain,
        list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) {
                if (!zdev->tlb_refresh)
                        continue;
+               atomic64_inc(&s390_domain->ctrs.sync_map_rpcits);
                ret = zpci_refresh_trans((u64)zdev->fh << 32,
                                         iova, size);
                /*
@@ -361,16 +621,15 @@ static int s390_iommu_map_pages(struct iommu_domain *domain,
        if (!IS_ALIGNED(iova | paddr, pgsize))
                return -EINVAL;
 
-       if (!(prot & IOMMU_READ))
-               return -EINVAL;
-
        if (!(prot & IOMMU_WRITE))
                flags |= ZPCI_TABLE_PROTECTED;
 
        rc = s390_iommu_validate_trans(s390_domain, paddr, iova,
-                                      pgcount, flags, gfp);
-       if (!rc)
+                                    pgcount, flags, gfp);
+       if (!rc) {
                *mapped = size;
+               atomic64_add(pgcount, &s390_domain->ctrs.mapped_pages);
+       }
 
        return rc;
 }
@@ -426,12 +685,26 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain,
                return 0;
 
        iommu_iotlb_gather_add_range(gather, iova, size);
+       atomic64_add(pgcount, &s390_domain->ctrs.unmapped_pages);
 
        return size;
 }
 
+static void s390_iommu_probe_finalize(struct device *dev)
+{
+       iommu_setup_dma_ops(dev, 0, U64_MAX);
+}
+
+struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
+{
+       if (!zdev || !zdev->s390_domain)
+               return NULL;
+       return &zdev->s390_domain->ctrs;
+}
+
 int zpci_init_iommu(struct zpci_dev *zdev)
 {
+       u64 aperture_size;
        int rc = 0;
 
        rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL,
@@ -443,6 +716,12 @@ int zpci_init_iommu(struct zpci_dev *zdev)
        if (rc)
                goto out_sysfs;
 
+       zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
+       aperture_size = min3(s390_iommu_aperture,
+                            ZPCI_TABLE_SIZE_RT - zdev->start_dma,
+                            zdev->end_dma - zdev->start_dma + 1);
+       zdev->end_dma = zdev->start_dma + aperture_size - 1;
+
        return 0;
 
 out_sysfs:
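
With example numbers the new clamping reads (illustrative, assuming 2G of
memory so s390_iommu_aperture = 0x80000000, and CLP reporting
start_dma = 0x100000000 and end_dma = 0xffffffffff):

        aperture_size = min3(0x80000000ULL,
                             ZPCI_TABLE_SIZE_RT - 0x100000000,
                             0xffffffffff - 0x100000000 + 1);
        /* -> 0x80000000, hence end_dma = 0x17fffffff */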
@@ -458,11 +737,50 @@ void zpci_destroy_iommu(struct zpci_dev *zdev)
        iommu_device_sysfs_remove(&zdev->iommu_dev);
 }
 
+static int __init s390_iommu_setup(char *str)
+{
+       if (!strcmp(str, "strict")) {
+               pr_warn("s390_iommu=strict deprecated; use iommu.strict=1 instead\n");
+               iommu_set_dma_strict();
+       }
+       return 1;
+}
+
+__setup("s390_iommu=", s390_iommu_setup);
+
+static int __init s390_iommu_aperture_setup(char *str)
+{
+       if (kstrtou32(str, 10, &s390_iommu_aperture_factor))
+               s390_iommu_aperture_factor = 1;
+       return 1;
+}
+
+__setup("s390_iommu_aperture=", s390_iommu_aperture_setup);
+
+static int __init s390_iommu_init(void)
+{
+       int rc;
+
+       iommu_dma_forcedac = true;
+       s390_iommu_aperture = (u64)virt_to_phys(high_memory);
+       if (!s390_iommu_aperture_factor)
+               s390_iommu_aperture = ULONG_MAX;
+       else
+               s390_iommu_aperture *= s390_iommu_aperture_factor;
+
+       rc = dma_alloc_cpu_table_caches();
+       if (rc)
+               return rc;
+
+       return rc;
+}
+subsys_initcall(s390_iommu_init);
+
 static const struct iommu_ops s390_iommu_ops = {
-       .default_domain = &s390_iommu_platform_domain,
        .capable = s390_iommu_capable,
        .domain_alloc_paging = s390_domain_alloc_paging,
        .probe_device = s390_iommu_probe_device,
+       .probe_finalize = s390_iommu_probe_finalize,
        .release_device = s390_iommu_release_device,
        .device_group = generic_device_group,
        .pgsize_bitmap = SZ_4K,