drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
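/* Azalia is Intel's codename for its HD Audio controllers; see IDENTMAP_AZALIA below. */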
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
62
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
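/*
 * AGAW helpers: an adjusted guest address width value of n corresponds to
 * a page-table walk of (n + 2) levels covering (30 + 9 * n) address bits,
 * e.g. AGAW 2 is a 4-level, 48-bit walk and AGAW 3 a 5-level, 57-bit walk.
 */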
69 static inline int agaw_to_level(int agaw)
70 {
71 return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96 return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101 return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must never be larger than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
117 {
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
121 {
122 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
123 }
124 static inline unsigned long page_to_dma_pfn(struct page *pg)
125 {
126 return mm_to_dma_pfn_start(page_to_pfn(pg));
127 }
128 static inline unsigned long virt_to_dma_pfn(void *p)
129 {
130 return page_to_dma_pfn(virt_to_page(p));
131 }
132
133 static void __init check_tylersburg_isoch(void);
134 static int rwbf_quirk;
135
136 /*
137 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
138 * (used when the kernel is launched with TXT).
139 */
140 static int force_on = 0;
141 static int intel_iommu_tboot_noforce;
142 static int no_platform_optin;
143
144 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
145
146 /*
147 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
148 * if marked present.
149 */
150 static phys_addr_t root_entry_lctp(struct root_entry *re)
151 {
152 if (!(re->lo & 1))
153 return 0;
154
155 return re->lo & VTD_PAGE_MASK;
156 }
157
158 /*
159 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
160 * if marked present.
161 */
162 static phys_addr_t root_entry_uctp(struct root_entry *re)
163 {
164 if (!(re->hi & 1))
165 return 0;
166
167 return re->hi & VTD_PAGE_MASK;
168 }
169
170 static inline void context_set_present(struct context_entry *context)
171 {
172 context->lo |= 1;
173 }
174
175 static inline void context_set_fault_enable(struct context_entry *context)
176 {
177 context->lo &= (((u64)-1) << 2) | 1;
178 }
179
180 static inline void context_set_translation_type(struct context_entry *context,
181 unsigned long value)
182 {
183 context->lo &= (((u64)-1) << 4) | 3;
184 context->lo |= (value & 3) << 2;
185 }
186
187 static inline void context_set_address_root(struct context_entry *context,
188 unsigned long value)
189 {
190 context->lo &= ~VTD_PAGE_MASK;
191 context->lo |= value & VTD_PAGE_MASK;
192 }
193
194 static inline void context_set_address_width(struct context_entry *context,
195 unsigned long value)
196 {
197 context->hi |= value & 7;
198 }
199
200 static inline void context_set_domain_id(struct context_entry *context,
201 unsigned long value)
202 {
203 context->hi |= (value & ((1 << 16) - 1)) << 8;
204 }
205
206 static inline void context_set_pasid(struct context_entry *context)
207 {
208 context->lo |= CONTEXT_PASIDE;
209 }
210
211 static inline int context_domain_id(struct context_entry *c)
212 {
213 return((c->hi >> 8) & 0xffff);
214 }
215
216 static inline void context_clear_entry(struct context_entry *context)
217 {
218 context->lo = 0;
219 context->hi = 0;
220 }
221
222 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 {
224 if (!iommu->copied_tables)
225 return false;
226
227 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
228 }
229
230 static inline void
231 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
232 {
233 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
234 }
235
236 static inline void
237 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
238 {
239 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
240 }
241
242 /*
243 * This domain is a static identity mapping domain.
244 * 1. This domain creates a static 1:1 mapping of all usable memory.
245 * 2. It maps to each iommu if successful.
246 * 3. Each iommu maps to this domain if successful.
247 */
248 static struct dmar_domain *si_domain;
249 static int hw_pass_through = 1;
250
251 struct dmar_rmrr_unit {
252 struct list_head list; /* list of rmrr units */
253 struct acpi_dmar_header *hdr; /* ACPI header */
254 u64 base_address; /* reserved base address*/
255 u64 end_address; /* reserved end address */
256 struct dmar_dev_scope *devices; /* target devices */
257 int devices_cnt; /* target device count */
258 };
259
260 struct dmar_atsr_unit {
261 struct list_head list; /* list of ATSR units */
262 struct acpi_dmar_header *hdr; /* ACPI header */
263 struct dmar_dev_scope *devices; /* target devices */
264 int devices_cnt; /* target device count */
265 u8 include_all:1; /* include all ports */
266 };
267
268 struct dmar_satc_unit {
269 struct list_head list; /* list of SATC units */
270 struct acpi_dmar_header *hdr; /* ACPI header */
271 struct dmar_dev_scope *devices; /* target devices */
272 struct intel_iommu *iommu; /* the corresponding iommu */
273 int devices_cnt; /* target device count */
274 u8 atc_required:1; /* ATS is required */
275 };
276
277 static LIST_HEAD(dmar_atsr_units);
278 static LIST_HEAD(dmar_rmrr_units);
279 static LIST_HEAD(dmar_satc_units);
280
281 #define for_each_rmrr_units(rmrr) \
282 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
283
284 static void device_block_translation(struct device *dev);
285 static void intel_iommu_domain_free(struct iommu_domain *domain);
286
287 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
288 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
289
290 int intel_iommu_enabled = 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
292
293 static int dmar_map_gfx = 1;
294 static int intel_iommu_superpage = 1;
295 static int iommu_identity_mapping;
296 static int iommu_skip_te_disable;
297
298 #define IDENTMAP_GFX 2
299 #define IDENTMAP_AZALIA 4
300
301 const struct iommu_ops intel_iommu_ops;
302
303 static bool translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
306 }
307
308 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
309 {
310 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
311 }
312
313 static void init_translation_status(struct intel_iommu *iommu)
314 {
315 u32 gsts;
316
317 gsts = readl(iommu->reg + DMAR_GSTS_REG);
318 if (gsts & DMA_GSTS_TES)
319 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
320 }
321
322 static int __init intel_iommu_setup(char *str)
323 {
324 if (!str)
325 return -EINVAL;
326
327 while (*str) {
328 if (!strncmp(str, "on", 2)) {
329 dmar_disabled = 0;
330 pr_info("IOMMU enabled\n");
331 } else if (!strncmp(str, "off", 3)) {
332 dmar_disabled = 1;
333 no_platform_optin = 1;
334 pr_info("IOMMU disabled\n");
335 } else if (!strncmp(str, "igfx_off", 8)) {
336 dmar_map_gfx = 0;
337 pr_info("Disable GFX device mapping\n");
338 } else if (!strncmp(str, "forcedac", 8)) {
339 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
340 iommu_dma_forcedac = true;
341 } else if (!strncmp(str, "strict", 6)) {
342 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
343 iommu_set_dma_strict();
344 } else if (!strncmp(str, "sp_off", 6)) {
345 pr_info("Disable supported super page\n");
346 intel_iommu_superpage = 0;
347 } else if (!strncmp(str, "sm_on", 5)) {
348 pr_info("Enable scalable mode if hardware supports\n");
349 intel_iommu_sm = 1;
350 } else if (!strncmp(str, "sm_off", 6)) {
351 pr_info("Scalable mode is disallowed\n");
352 intel_iommu_sm = 0;
353 } else if (!strncmp(str, "tboot_noforce", 13)) {
354 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
355 intel_iommu_tboot_noforce = 1;
356 } else {
357 pr_notice("Unknown option - '%s'\n", str);
358 }
359
360 str += strcspn(str, ",");
361 while (*str == ',')
362 str++;
363 }
364
365 return 1;
366 }
367 __setup("intel_iommu=", intel_iommu_setup);
368
369 void *alloc_pgtable_page(int node, gfp_t gfp)
370 {
371 struct page *page;
372 void *vaddr = NULL;
373
374 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
375 if (page)
376 vaddr = page_address(page);
377 return vaddr;
378 }
379
380 void free_pgtable_page(void *vaddr)
381 {
382 free_page((unsigned long)vaddr);
383 }
384
385 static inline int domain_type_is_si(struct dmar_domain *domain)
386 {
387 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
388 }
389
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 unsigned long pfn)
392 {
393 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394
395 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397
398 /*
399 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401 * the returned SAGAW.
402 */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 unsigned long fl_sagaw, sl_sagaw;
406
407 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 sl_sagaw = cap_sagaw(iommu->cap);
409
410 /* Second level only. */
411 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 return sl_sagaw;
413
414 /* First level only. */
415 if (!ecap_slts(iommu->ecap))
416 return fl_sagaw;
417
418 return fl_sagaw & sl_sagaw;
419 }
420
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 unsigned long sagaw;
424 int agaw;
425
426 sagaw = __iommu_calculate_sagaw(iommu);
427 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 if (test_bit(agaw, &sagaw))
429 break;
430 }
431
432 return agaw;
433 }
434
435 /*
436 * Calculate max SAGAW for each iommu.
437 */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442
443 /*
444 * Calculate agaw for each iommu.
445 * "SAGAW" may differ across iommus, so use a default agaw and fall back
446 * to a smaller supported agaw for iommus that don't support the default.
447 */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 return sm_supported(iommu) ?
456 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 struct iommu_domain_info *info;
462 struct dmar_drhd_unit *drhd;
463 struct intel_iommu *iommu;
464 bool found = false;
465 unsigned long i;
466
467 domain->iommu_coherency = true;
468 xa_for_each(&domain->iommu_array, i, info) {
469 found = true;
470 if (!iommu_paging_structure_coherency(info->iommu)) {
471 domain->iommu_coherency = false;
472 break;
473 }
474 }
475 if (found)
476 return;
477
478 /* No hardware attached; use lowest common denominator */
479 rcu_read_lock();
480 for_each_active_iommu(iommu, drhd) {
481 if (!iommu_paging_structure_coherency(iommu)) {
482 domain->iommu_coherency = false;
483 break;
484 }
485 }
486 rcu_read_unlock();
487 }
488
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 struct intel_iommu *skip)
491 {
492 struct dmar_drhd_unit *drhd;
493 struct intel_iommu *iommu;
494 int mask = 0x3;
495
496 if (!intel_iommu_superpage)
497 return 0;
498
499 /* set iommu_superpage to the smallest common denominator */
500 rcu_read_lock();
501 for_each_active_iommu(iommu, drhd) {
502 if (iommu != skip) {
503 if (domain && domain->use_first_level) {
504 if (!cap_fl1gp_support(iommu->cap))
505 mask = 0x1;
506 } else {
507 mask &= cap_super_page_val(iommu->cap);
508 }
509
510 if (!mask)
511 break;
512 }
513 }
514 rcu_read_unlock();
515
516 return fls(mask);
517 }
518
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 struct device_domain_info *info;
522 int nid = NUMA_NO_NODE;
523 unsigned long flags;
524
525 spin_lock_irqsave(&domain->lock, flags);
526 list_for_each_entry(info, &domain->devices, link) {
527 /*
528 * There could possibly be multiple device numa nodes as devices
529 * within the same domain may sit behind different IOMMUs. There
530 * is no perfect answer in such a situation, so we pick the first
531 * node we find (first come, first served).
532 */
533 nid = dev_to_node(info->dev);
534 if (nid != NUMA_NO_NODE)
535 break;
536 }
537 spin_unlock_irqrestore(&domain->lock, flags);
538
539 return nid;
540 }
541
542 static void domain_update_iotlb(struct dmar_domain *domain);
543
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 unsigned long bitmap = 0;
548
549 /*
550 * 1-level super page supports page size of 2MiB, 2-level super page
551 * supports page size of both 2MiB and 1GiB.
552 */
553 if (domain->iommu_superpage == 1)
554 bitmap |= SZ_2M;
555 else if (domain->iommu_superpage == 2)
556 bitmap |= SZ_2M | SZ_1G;
557
558 return bitmap;
559 }
560
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 domain_update_iommu_coherency(domain);
565 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566
567 /*
568 * If RHSA is missing, we should default to the device's NUMA node
569 * as a fallback.
570 */
571 if (domain->nid == NUMA_NO_NODE)
572 domain->nid = domain_update_device_node(domain);
573
574 /*
575 * First-level translation restricts the input-address to a
576 * canonical address (i.e., address bits 63:N have the same
577 * value as address bit [N-1], where N is 48-bits with 4-level
578 * paging and 57-bits with 5-level paging). Hence, skip bit
579 * [N-1].
580 */
581 if (domain->use_first_level)
582 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 else
584 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585
586 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 domain_update_iotlb(domain);
588 }
589
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 u8 devfn, int alloc)
592 {
593 struct root_entry *root = &iommu->root_entry[bus];
594 struct context_entry *context;
595 u64 *entry;
596
597 /*
598 * Unless the caller requested to allocate a new entry,
599 * returning a copied context entry makes no sense.
600 */
601 if (!alloc && context_copied(iommu, bus, devfn))
602 return NULL;
603
604 entry = &root->lo;
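/*
 * In scalable mode a context entry is 256 bits wide, so a 4KB context
 * table holds only 128 of them. The root entry therefore carries two
 * table pointers (lo for devfn 0-127, hi for 128-255), and devfn is
 * doubled to index pairs of 128-bit context_entry slots.
 */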
605 if (sm_supported(iommu)) {
606 if (devfn >= 0x80) {
607 devfn -= 0x80;
608 entry = &root->hi;
609 }
610 devfn *= 2;
611 }
612 if (*entry & 1)
613 context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 else {
615 unsigned long phy_addr;
616 if (!alloc)
617 return NULL;
618
619 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
620 if (!context)
621 return NULL;
622
623 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 phy_addr = virt_to_phys((void *)context);
625 *entry = phy_addr | 1;
626 __iommu_flush_cache(iommu, entry, sizeof(*entry));
627 }
628 return &context[devfn];
629 }
630
631 /**
632 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633 * sub-hierarchy of a candidate PCI-PCI bridge
634 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635 * @bridge: the candidate PCI-PCI bridge
636 *
637 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638 */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 struct pci_dev *pdev, *pbridge;
643
644 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 return false;
646
647 pdev = to_pci_dev(dev);
648 pbridge = to_pci_dev(bridge);
649
650 if (pbridge->subordinate &&
651 pbridge->subordinate->number <= pdev->bus->number &&
652 pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 return true;
654
655 return false;
656 }
657
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 struct dmar_drhd_unit *drhd;
661 u32 vtbar;
662 int rc;
663
664 /* We know that this device on this chipset has its own IOMMU.
665 * If we find it under a different IOMMU, then the BIOS is lying
666 * to us. Hope that the IOMMU for this device is actually
667 * disabled, and it needs no translation...
668 */
669 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 if (rc) {
671 /* "can't" happen */
672 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 return false;
674 }
675 vtbar &= 0xffff0000;
676
677 /* we know that this iommu should be at offset 0xa000 from vtbar */
678 drhd = dmar_find_matched_drhd_unit(pdev);
679 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 return true;
683 }
684
685 return false;
686 }
687
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 if (!iommu || iommu->drhd->ignored)
691 return true;
692
693 if (dev_is_pci(dev)) {
694 struct pci_dev *pdev = to_pci_dev(dev);
695
696 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 quirk_ioat_snb_local_iommu(pdev))
699 return true;
700 }
701
702 return false;
703 }
704
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 struct dmar_drhd_unit *drhd = NULL;
708 struct pci_dev *pdev = NULL;
709 struct intel_iommu *iommu;
710 struct device *tmp;
711 u16 segment = 0;
712 int i;
713
714 if (!dev)
715 return NULL;
716
717 if (dev_is_pci(dev)) {
718 struct pci_dev *pf_pdev;
719
720 pdev = pci_real_dma_dev(to_pci_dev(dev));
721
722 /* VFs aren't listed in scope tables; we need to look up
723 * the PF instead to find the IOMMU. */
724 pf_pdev = pci_physfn(pdev);
725 dev = &pf_pdev->dev;
726 segment = pci_domain_nr(pdev->bus);
727 } else if (has_acpi_companion(dev))
728 dev = &ACPI_COMPANION(dev)->dev;
729
730 rcu_read_lock();
731 for_each_iommu(iommu, drhd) {
732 if (pdev && segment != drhd->segment)
733 continue;
734
735 for_each_active_dev_scope(drhd->devices,
736 drhd->devices_cnt, i, tmp) {
737 if (tmp == dev) {
738 /* For a VF use its original BDF# not that of the PF
739 * which we used for the IOMMU lookup. Strictly speaking
740 * we could do this for all PCI devices; we only need to
741 * get the BDF# from the scope table for ACPI matches. */
742 if (pdev && pdev->is_virtfn)
743 goto got_pdev;
744
745 if (bus && devfn) {
746 *bus = drhd->devices[i].bus;
747 *devfn = drhd->devices[i].devfn;
748 }
749 goto out;
750 }
751
752 if (is_downstream_to_pci_bridge(dev, tmp))
753 goto got_pdev;
754 }
755
756 if (pdev && drhd->include_all) {
757 got_pdev:
758 if (bus && devfn) {
759 *bus = pdev->bus->number;
760 *devfn = pdev->devfn;
761 }
762 goto out;
763 }
764 }
765 iommu = NULL;
766 out:
767 if (iommu_is_dummy(iommu, dev))
768 iommu = NULL;
769
770 rcu_read_unlock();
771
772 return iommu;
773 }
774
775 static void domain_flush_cache(struct dmar_domain *domain,
776 void *addr, int size)
777 {
778 if (!domain->iommu_coherency)
779 clflush_cache_range(addr, size);
780 }
781
782 static void free_context_table(struct intel_iommu *iommu)
783 {
784 struct context_entry *context;
785 int i;
786
787 if (!iommu->root_entry)
788 return;
789
790 for (i = 0; i < ROOT_ENTRY_NR; i++) {
791 context = iommu_context_addr(iommu, i, 0, 0);
792 if (context)
793 free_pgtable_page(context);
794
795 if (!sm_supported(iommu))
796 continue;
797
798 context = iommu_context_addr(iommu, i, 0x80, 0);
799 if (context)
800 free_pgtable_page(context);
801 }
802
803 free_pgtable_page(iommu->root_entry);
804 iommu->root_entry = NULL;
805 }
806
807 #ifdef CONFIG_DMAR_DEBUG
808 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
809 u8 bus, u8 devfn, struct dma_pte *parent, int level)
810 {
811 struct dma_pte *pte;
812 int offset;
813
814 while (1) {
815 offset = pfn_level_offset(pfn, level);
816 pte = &parent[offset];
817 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
818 pr_info("PTE not present at level %d\n", level);
819 break;
820 }
821
822 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
823
824 if (level == 1)
825 break;
826
827 parent = phys_to_virt(dma_pte_addr(pte));
828 level--;
829 }
830 }
831
832 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
833 unsigned long long addr, u32 pasid)
834 {
835 struct pasid_dir_entry *dir, *pde;
836 struct pasid_entry *entries, *pte;
837 struct context_entry *ctx_entry;
838 struct root_entry *rt_entry;
839 int i, dir_index, index, level;
840 u8 devfn = source_id & 0xff;
841 u8 bus = source_id >> 8;
842 struct dma_pte *pgtable;
843
844 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
845
846 /* root entry dump */
847 rt_entry = &iommu->root_entry[bus];
848 if (!rt_entry) {
849 pr_info("root table entry is not present\n");
850 return;
851 }
852
853 if (sm_supported(iommu))
854 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
855 rt_entry->hi, rt_entry->lo);
856 else
857 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
858
859 /* context entry dump */
860 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
861 if (!ctx_entry) {
862 pr_info("context table entry is not present\n");
863 return;
864 }
865
866 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
867 ctx_entry->hi, ctx_entry->lo);
868
869 /* legacy mode does not require PASID entries */
870 if (!sm_supported(iommu)) {
871 level = agaw_to_level(ctx_entry->hi & 7);
872 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
873 goto pgtable_walk;
874 }
875
876 /* get the pointer to pasid directory entry */
877 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
878 if (!dir) {
879 pr_info("pasid directory entry is not present\n");
880 return;
881 }
882 /* For request-without-pasid, get the pasid from context entry */
883 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
884 pasid = IOMMU_NO_PASID;
885
886 dir_index = pasid >> PASID_PDE_SHIFT;
887 pde = &dir[dir_index];
888 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
889
890 /* get the pointer to the pasid table entry */
891 entries = get_pasid_table_from_pde(pde);
892 if (!entries) {
893 pr_info("pasid table entry is not present\n");
894 return;
895 }
896 index = pasid & PASID_PTE_MASK;
897 pte = &entries[index];
898 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
899 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
900
901 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
902 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
903 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
904 } else {
905 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
906 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
907 }
908
909 pgtable_walk:
910 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
911 }
912 #endif
913
914 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
915 unsigned long pfn, int *target_level,
916 gfp_t gfp)
917 {
918 struct dma_pte *parent, *pte;
919 int level = agaw_to_level(domain->agaw);
920 int offset;
921
922 if (!domain_pfn_supported(domain, pfn))
923 /* Address beyond IOMMU's addressing capabilities. */
924 return NULL;
925
926 parent = domain->pgd;
927
928 while (1) {
929 void *tmp_page;
930
931 offset = pfn_level_offset(pfn, level);
932 pte = &parent[offset];
933 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
934 break;
935 if (level == *target_level)
936 break;
937
938 if (!dma_pte_present(pte)) {
939 uint64_t pteval;
940
941 tmp_page = alloc_pgtable_page(domain->nid, gfp);
942
943 if (!tmp_page)
944 return NULL;
945
946 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
947 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
948 if (domain->use_first_level)
949 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
950
951 if (cmpxchg64(&pte->val, 0ULL, pteval))
952 /* Someone else set it while we were thinking; use theirs. */
953 free_pgtable_page(tmp_page);
954 else
955 domain_flush_cache(domain, pte, sizeof(*pte));
956 }
957 if (level == 1)
958 break;
959
960 parent = phys_to_virt(dma_pte_addr(pte));
961 level--;
962 }
963
964 if (!*target_level)
965 *target_level = level;
966
967 return pte;
968 }
969
970 /* return the pte of the address at a specific level */
971 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
972 unsigned long pfn,
973 int level, int *large_page)
974 {
975 struct dma_pte *parent, *pte;
976 int total = agaw_to_level(domain->agaw);
977 int offset;
978
979 parent = domain->pgd;
980 while (level <= total) {
981 offset = pfn_level_offset(pfn, total);
982 pte = &parent[offset];
983 if (level == total)
984 return pte;
985
986 if (!dma_pte_present(pte)) {
987 *large_page = total;
988 break;
989 }
990
991 if (dma_pte_superpage(pte)) {
992 *large_page = total;
993 return pte;
994 }
995
996 parent = phys_to_virt(dma_pte_addr(pte));
997 total--;
998 }
999 return NULL;
1000 }
1001
1002 /* clear last level pte; a tlb flush must follow */
1003 static void dma_pte_clear_range(struct dmar_domain *domain,
1004 unsigned long start_pfn,
1005 unsigned long last_pfn)
1006 {
1007 unsigned int large_page;
1008 struct dma_pte *first_pte, *pte;
1009
1010 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1011 WARN_ON(start_pfn > last_pfn))
1012 return;
1013
1014 /* we don't need lock here; nobody else touches the iova range */
1015 do {
1016 large_page = 1;
1017 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1018 if (!pte) {
1019 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1020 continue;
1021 }
1022 do {
1023 dma_clear_pte(pte);
1024 start_pfn += lvl_to_nr_pages(large_page);
1025 pte++;
1026 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1027
1028 domain_flush_cache(domain, first_pte,
1029 (void *)pte - (void *)first_pte);
1030
1031 } while (start_pfn && start_pfn <= last_pfn);
1032 }
1033
1034 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1035 int retain_level, struct dma_pte *pte,
1036 unsigned long pfn, unsigned long start_pfn,
1037 unsigned long last_pfn)
1038 {
1039 pfn = max(start_pfn, pfn);
1040 pte = &pte[pfn_level_offset(pfn, level)];
1041
1042 do {
1043 unsigned long level_pfn;
1044 struct dma_pte *level_pte;
1045
1046 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1047 goto next;
1048
1049 level_pfn = pfn & level_mask(level);
1050 level_pte = phys_to_virt(dma_pte_addr(pte));
1051
1052 if (level > 2) {
1053 dma_pte_free_level(domain, level - 1, retain_level,
1054 level_pte, level_pfn, start_pfn,
1055 last_pfn);
1056 }
1057
1058 /*
1059 * Free the page table if we're below the level we want to
1060 * retain and the range covers the entire table.
1061 */
1062 if (level < retain_level && !(start_pfn > level_pfn ||
1063 last_pfn < level_pfn + level_size(level) - 1)) {
1064 dma_clear_pte(pte);
1065 domain_flush_cache(domain, pte, sizeof(*pte));
1066 free_pgtable_page(level_pte);
1067 }
1068 next:
1069 pfn += level_size(level);
1070 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1071 }
1072
1073 /*
1074 * clear last level (leaf) ptes and free page table pages below the
1075 * level we wish to keep intact.
1076 */
1077 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1078 unsigned long start_pfn,
1079 unsigned long last_pfn,
1080 int retain_level)
1081 {
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1087
1088 /* free pgd */
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1091 domain->pgd = NULL;
1092 }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100 be freed. */
1101 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct list_head *freelist)
1104 {
1105 struct page *pg;
1106
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 list_add_tail(&pg->lru, freelist);
1109
1110 if (level == 1)
1111 return;
1112
1113 pte = page_address(pg);
1114 do {
1115 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1116 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1117 pte++;
1118 } while (!first_pte_in_page(pte));
1119 }
1120
1121 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1122 struct dma_pte *pte, unsigned long pfn,
1123 unsigned long start_pfn, unsigned long last_pfn,
1124 struct list_head *freelist)
1125 {
1126 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1127
1128 pfn = max(start_pfn, pfn);
1129 pte = &pte[pfn_level_offset(pfn, level)];
1130
1131 do {
1132 unsigned long level_pfn = pfn & level_mask(level);
1133
1134 if (!dma_pte_present(pte))
1135 goto next;
1136
1137 /* If range covers entire pagetable, free it */
1138 if (start_pfn <= level_pfn &&
1139 last_pfn >= level_pfn + level_size(level) - 1) {
1140 /* These subordinate page tables are going away entirely. Don't
1141 bother to clear them; we're just going to *free* them. */
1142 if (level > 1 && !dma_pte_superpage(pte))
1143 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1144
1145 dma_clear_pte(pte);
1146 if (!first_pte)
1147 first_pte = pte;
1148 last_pte = pte;
1149 } else if (level > 1) {
1150 /* Recurse down into a level that isn't *entirely* obsolete */
1151 dma_pte_clear_level(domain, level - 1,
1152 phys_to_virt(dma_pte_addr(pte)),
1153 level_pfn, start_pfn, last_pfn,
1154 freelist);
1155 }
1156 next:
1157 pfn = level_pfn + level_size(level);
1158 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1159
1160 if (first_pte)
1161 domain_flush_cache(domain, first_pte,
1162 (void *)++last_pte - (void *)first_pte);
1163 }
1164
1165 /* We can't just free the pages because the IOMMU may still be walking
1166 the page tables, and may have cached the intermediate levels. The
1167 pages can only be freed after the IOTLB flush has been done. */
1168 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1169 unsigned long last_pfn, struct list_head *freelist)
1170 {
1171 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1172 WARN_ON(start_pfn > last_pfn))
1173 return;
1174
1175 /* we don't need lock here; nobody else touches the iova range */
1176 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1177 domain->pgd, 0, start_pfn, last_pfn, freelist);
1178
1179 /* free pgd */
1180 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1181 struct page *pgd_page = virt_to_page(domain->pgd);
1182 list_add_tail(&pgd_page->lru, freelist);
1183 domain->pgd = NULL;
1184 }
1185 }
1186
1187 /* iommu handling */
1188 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1189 {
1190 struct root_entry *root;
1191
1192 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1193 if (!root) {
1194 pr_err("Allocating root entry for %s failed\n",
1195 iommu->name);
1196 return -ENOMEM;
1197 }
1198
1199 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1200 iommu->root_entry = root;
1201
1202 return 0;
1203 }
1204
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 {
1207 u64 addr;
1208 u32 sts;
1209 unsigned long flag;
1210
1211 addr = virt_to_phys(iommu->root_entry);
1212 if (sm_supported(iommu))
1213 addr |= DMA_RTADDR_SMT;
1214
1215 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1217
1218 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1219
1220 /* Make sure hardware completes it */
1221 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222 readl, (sts & DMA_GSTS_RTPS), sts);
1223
1224 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225
1226 /*
1227 * Hardware invalidates all DMA remapping hardware translation
1228 * caches as part of SRTP flow.
1229 */
1230 if (cap_esrtps(iommu->cap))
1231 return;
1232
1233 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1234 if (sm_supported(iommu))
1235 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1236 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1237 }
1238
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1240 {
1241 u32 val;
1242 unsigned long flag;
1243
1244 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 return;
1246
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1249
1250 /* Make sure hardware completes it */
1251 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252 readl, (!(val & DMA_GSTS_WBFS)), val);
1253
1254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1255 }
1256
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259 u16 did, u16 source_id, u8 function_mask,
1260 u64 type)
1261 {
1262 u64 val = 0;
1263 unsigned long flag;
1264
1265 switch (type) {
1266 case DMA_CCMD_GLOBAL_INVL:
1267 val = DMA_CCMD_GLOBAL_INVL;
1268 break;
1269 case DMA_CCMD_DOMAIN_INVL:
1270 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271 break;
1272 case DMA_CCMD_DEVICE_INVL:
1273 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1275 break;
1276 default:
1277 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1278 iommu->name, type);
1279 return;
1280 }
1281 val |= DMA_CCMD_ICC;
1282
1283 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1284 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1285
1286 /* Make sure hardware completes it */
1287 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1288 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1289
1290 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 }
1292
1293 /* return value determines if we need a write buffer flush */
1294 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1295 u64 addr, unsigned int size_order, u64 type)
1296 {
1297 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1298 u64 val = 0, val_iva = 0;
1299 unsigned long flag;
1300
1301 switch (type) {
1302 case DMA_TLB_GLOBAL_FLUSH:
1303 /* a global flush doesn't need to set IVA_REG */
1304 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1305 break;
1306 case DMA_TLB_DSI_FLUSH:
1307 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1308 break;
1309 case DMA_TLB_PSI_FLUSH:
1310 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1311 /* IH bit is passed in as part of address */
1312 val_iva = size_order | addr;
1313 break;
1314 default:
1315 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1316 iommu->name, type);
1317 return;
1318 }
1319
1320 if (cap_write_drain(iommu->cap))
1321 val |= DMA_TLB_WRITE_DRAIN;
1322
1323 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1324 /* Note: Only uses first TLB reg currently */
1325 if (val_iva)
1326 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1327 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1328
1329 /* Make sure hardware completes it */
1330 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1331 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1332
1333 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1334
1335 /* check IOTLB invalidation granularity */
1336 if (DMA_TLB_IAIG(val) == 0)
1337 pr_err("Flush IOTLB failed\n");
1338 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1339 pr_debug("TLB flush request %Lx, actual %Lx\n",
1340 (unsigned long long)DMA_TLB_IIRG(type),
1341 (unsigned long long)DMA_TLB_IAIG(val));
1342 }
1343
1344 static struct device_domain_info *
1345 domain_lookup_dev_info(struct dmar_domain *domain,
1346 struct intel_iommu *iommu, u8 bus, u8 devfn)
1347 {
1348 struct device_domain_info *info;
1349 unsigned long flags;
1350
1351 spin_lock_irqsave(&domain->lock, flags);
1352 list_for_each_entry(info, &domain->devices, link) {
1353 if (info->iommu == iommu && info->bus == bus &&
1354 info->devfn == devfn) {
1355 spin_unlock_irqrestore(&domain->lock, flags);
1356 return info;
1357 }
1358 }
1359 spin_unlock_irqrestore(&domain->lock, flags);
1360
1361 return NULL;
1362 }
1363
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1365 {
1366 struct dev_pasid_info *dev_pasid;
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378
1379 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1380 info = dev_iommu_priv_get(dev_pasid->dev);
1381 if (info->ats_enabled) {
1382 has_iotlb_device = true;
1383 break;
1384 }
1385 }
1386 domain->has_iotlb_device = has_iotlb_device;
1387 spin_unlock_irqrestore(&domain->lock, flags);
1388 }
1389
1390 /*
1391 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1392 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1393 * check because it applies only to the built-in QAT devices and it doesn't
1394 * grant additional privileges.
1395 */
1396 #define BUGGY_QAT_DEVID_MASK 0x4940
1397 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1398 {
1399 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1400 return false;
1401
1402 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1403 return false;
1404
1405 return true;
1406 }
1407
1408 static void iommu_enable_pci_caps(struct device_domain_info *info)
1409 {
1410 struct pci_dev *pdev;
1411
1412 if (!dev_is_pci(info->dev))
1413 return;
1414
1415 pdev = to_pci_dev(info->dev);
1416
1417 /* The PCIe spec, in its wisdom, declares that the behaviour of
1418 the device if you enable PASID support after ATS support is
1419 undefined. So always enable PASID support on devices which
1420 have it, even if we can't yet know if we're ever going to
1421 use it. */
1422 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1423 info->pasid_enabled = 1;
1424
1425 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1426 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1427 info->ats_enabled = 1;
1428 domain_update_iotlb(info->domain);
1429 }
1430 }
1431
1432 static void iommu_disable_pci_caps(struct device_domain_info *info)
1433 {
1434 struct pci_dev *pdev;
1435
1436 if (!dev_is_pci(info->dev))
1437 return;
1438
1439 pdev = to_pci_dev(info->dev);
1440
1441 if (info->ats_enabled) {
1442 pci_disable_ats(pdev);
1443 info->ats_enabled = 0;
1444 domain_update_iotlb(info->domain);
1445 }
1446
1447 if (info->pasid_enabled) {
1448 pci_disable_pasid(pdev);
1449 info->pasid_enabled = 0;
1450 }
1451 }
1452
1453 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1454 u64 addr, unsigned int mask)
1455 {
1456 u16 sid, qdep;
1457
1458 if (!info || !info->ats_enabled)
1459 return;
1460
1461 sid = info->bus << 8 | info->devfn;
1462 qdep = info->ats_qdep;
1463 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1464 qdep, addr, mask);
1465 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1466 }
1467
1468 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1469 u64 addr, unsigned mask)
1470 {
1471 struct dev_pasid_info *dev_pasid;
1472 struct device_domain_info *info;
1473 unsigned long flags;
1474
1475 if (!domain->has_iotlb_device)
1476 return;
1477
1478 spin_lock_irqsave(&domain->lock, flags);
1479 list_for_each_entry(info, &domain->devices, link)
1480 __iommu_flush_dev_iotlb(info, addr, mask);
1481
1482 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1483 info = dev_iommu_priv_get(dev_pasid->dev);
1484
1485 if (!info->ats_enabled)
1486 continue;
1487
1488 qi_flush_dev_iotlb_pasid(info->iommu,
1489 PCI_DEVID(info->bus, info->devfn),
1490 info->pfsid, dev_pasid->pasid,
1491 info->ats_qdep, addr,
1492 mask);
1493 }
1494 spin_unlock_irqrestore(&domain->lock, flags);
1495 }
1496
1497 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1498 struct dmar_domain *domain, u64 addr,
1499 unsigned long npages, bool ih)
1500 {
1501 u16 did = domain_id_iommu(domain, iommu);
1502 struct dev_pasid_info *dev_pasid;
1503 unsigned long flags;
1504
1505 spin_lock_irqsave(&domain->lock, flags);
1506 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1507 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1508
1509 if (!list_empty(&domain->devices))
1510 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1511 spin_unlock_irqrestore(&domain->lock, flags);
1512 }
1513
1514 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1515 struct dmar_domain *domain,
1516 unsigned long pfn, unsigned int pages,
1517 int ih, int map)
1518 {
1519 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1520 unsigned int mask = ilog2(aligned_pages);
1521 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1522 u16 did = domain_id_iommu(domain, iommu);
1523
1524 if (WARN_ON(!pages))
1525 return;
1526
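/* The Invalidation Hint (IH) is carried in bit 6 of the address handed to the flush helpers. */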
1527 if (ih)
1528 ih = 1 << 6;
1529
1530 if (domain->use_first_level) {
1531 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1532 } else {
1533 unsigned long bitmask = aligned_pages - 1;
1534
1535 /*
1536 * PSI masks the low order bits of the base address. If the
1537 * address isn't aligned to the mask, then compute a mask value
1538 * needed to ensure the target range is flushed.
1539 */
1540 if (unlikely(bitmask & pfn)) {
1541 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1542
1543 /*
1544 * Since end_pfn <= pfn + bitmask, the only way bits
1545 * higher than bitmask can differ in pfn and end_pfn is
1546 * by carrying. This means after masking out bitmask,
1547 * high bits starting with the first set bit in
1548 * shared_bits are all equal in both pfn and end_pfn.
1549 */
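/*
 * Example: pfn = 9 and pages = 2 give bitmask = 1 and end_pfn = 10;
 * shared_bits = ~(9 ^ 10) & ~1 has its lowest set bit at position 2,
 * so mask becomes 2 and the flush covers pfn 8-11, a superset of 9-10.
 */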
1550 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1551 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1552 }
1553
1554 /*
1555 * Fall back to domain selective flush if no PSI support or
1556 * the size is too big.
1557 */
1558 if (!cap_pgsel_inv(iommu->cap) ||
1559 mask > cap_max_amask_val(iommu->cap))
1560 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1561 DMA_TLB_DSI_FLUSH);
1562 else
1563 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1564 DMA_TLB_PSI_FLUSH);
1565 }
1566
1567 /*
1568 * In caching mode, changes of pages from non-present to present require
1569 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1570 */
1571 if (!cap_caching_mode(iommu->cap) || !map)
1572 iommu_flush_dev_iotlb(domain, addr, mask);
1573 }
1574
1575 /* Notification for newly created mappings */
1576 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1577 struct dmar_domain *domain,
1578 unsigned long pfn, unsigned int pages)
1579 {
1580 /*
1581 * It's a non-present to present mapping. Only flush if caching mode
1582 * and second level.
1583 */
1584 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1585 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1586 else
1587 iommu_flush_write_buffer(iommu);
1588 }
1589
1590 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1591 {
1592 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1593 struct iommu_domain_info *info;
1594 unsigned long idx;
1595
1596 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1597 struct intel_iommu *iommu = info->iommu;
1598 u16 did = domain_id_iommu(dmar_domain, iommu);
1599
1600 if (dmar_domain->use_first_level)
1601 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1602 else
1603 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1604 DMA_TLB_DSI_FLUSH);
1605
1606 if (!cap_caching_mode(iommu->cap))
1607 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1608 }
1609 }
1610
1611 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1612 {
1613 u32 pmen;
1614 unsigned long flags;
1615
1616 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1617 return;
1618
1619 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1621 pmen &= ~DMA_PMEN_EPM;
1622 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1623
1624 /* wait for the protected region status bit to clear */
1625 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1626 readl, !(pmen & DMA_PMEN_PRS), pmen);
1627
1628 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1629 }
1630
1631 static void iommu_enable_translation(struct intel_iommu *iommu)
1632 {
1633 u32 sts;
1634 unsigned long flags;
1635
1636 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1637 iommu->gcmd |= DMA_GCMD_TE;
1638 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1639
1640 /* Make sure hardware completes it */
1641 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1642 readl, (sts & DMA_GSTS_TES), sts);
1643
1644 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1645 }
1646
1647 static void iommu_disable_translation(struct intel_iommu *iommu)
1648 {
1649 u32 sts;
1650 unsigned long flag;
1651
1652 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1653 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1654 return;
1655
1656 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1657 iommu->gcmd &= ~DMA_GCMD_TE;
1658 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1659
1660 /* Make sure hardware completes it */
1661 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1662 readl, (!(sts & DMA_GSTS_TES)), sts);
1663
1664 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1665 }
1666
1667 static int iommu_init_domains(struct intel_iommu *iommu)
1668 {
1669 u32 ndomains;
1670
1671 ndomains = cap_ndoms(iommu->cap);
1672 pr_debug("%s: Number of Domains supported <%d>\n",
1673 iommu->name, ndomains);
1674
1675 spin_lock_init(&iommu->lock);
1676
1677 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1678 if (!iommu->domain_ids)
1679 return -ENOMEM;
1680
1681 /*
1682 * If Caching mode is set, then invalid translations are tagged
1683 * with domain-id 0, hence we need to pre-allocate it. We also
1684 * use domain-id 0 as a marker for non-allocated domain-id, so
1685 * make sure it is not used for a real domain.
1686 */
1687 set_bit(0, iommu->domain_ids);
1688
1689 /*
1690 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1691 * entry for first-level or pass-through translation modes should
1692 * be programmed with a domain id different from those used for
1693 * second-level or nested translation. We reserve a domain id for
1694 * this purpose.
1695 */
1696 if (sm_supported(iommu))
1697 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1698
1699 return 0;
1700 }
1701
1702 static void disable_dmar_iommu(struct intel_iommu *iommu)
1703 {
1704 if (!iommu->domain_ids)
1705 return;
1706
1707 /*
1708 * All iommu domains must have been detached from the devices,
1709 * hence there should be no domain IDs in use.
1710 */
1711 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1712 > NUM_RESERVED_DID))
1713 return;
1714
1715 if (iommu->gcmd & DMA_GCMD_TE)
1716 iommu_disable_translation(iommu);
1717 }
1718
1719 static void free_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721 if (iommu->domain_ids) {
1722 bitmap_free(iommu->domain_ids);
1723 iommu->domain_ids = NULL;
1724 }
1725
1726 if (iommu->copied_tables) {
1727 bitmap_free(iommu->copied_tables);
1728 iommu->copied_tables = NULL;
1729 }
1730
1731 /* free context mapping */
1732 free_context_table(iommu);
1733
1734 #ifdef CONFIG_INTEL_IOMMU_SVM
1735 if (pasid_supported(iommu)) {
1736 if (ecap_prs(iommu->ecap))
1737 intel_svm_finish_prq(iommu);
1738 }
1739 #endif
1740 }
1741
1742 /*
1743 * Check and return whether first level is used by default for
1744 * DMA translation.
1745 */
1746 static bool first_level_by_default(unsigned int type)
1747 {
1748 /* Only SL is available in legacy mode */
1749 if (!scalable_mode_support())
1750 return false;
1751
1752 /* Only one level (either FL or SL) is available, just use it */
1753 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1754 return intel_cap_flts_sanity();
1755
1756 /* Both levels are available, decide it based on domain type */
1757 return type != IOMMU_DOMAIN_UNMANAGED;
1758 }
1759
1760 static struct dmar_domain *alloc_domain(unsigned int type)
1761 {
1762 struct dmar_domain *domain;
1763
1764 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1765 if (!domain)
1766 return NULL;
1767
1768 domain->nid = NUMA_NO_NODE;
1769 if (first_level_by_default(type))
1770 domain->use_first_level = true;
1771 domain->has_iotlb_device = false;
1772 INIT_LIST_HEAD(&domain->devices);
1773 INIT_LIST_HEAD(&domain->dev_pasids);
1774 spin_lock_init(&domain->lock);
1775 xa_init(&domain->iommu_array);
1776
1777 return domain;
1778 }
1779
1780 static int domain_attach_iommu(struct dmar_domain *domain,
1781 struct intel_iommu *iommu)
1782 {
1783 struct iommu_domain_info *info, *curr;
1784 unsigned long ndomains;
1785 int num, ret = -ENOSPC;
1786
1787 info = kzalloc(sizeof(*info), GFP_KERNEL);
1788 if (!info)
1789 return -ENOMEM;
1790
1791 spin_lock(&iommu->lock);
1792 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793 if (curr) {
1794 curr->refcnt++;
1795 spin_unlock(&iommu->lock);
1796 kfree(info);
1797 return 0;
1798 }
1799
1800 ndomains = cap_ndoms(iommu->cap);
1801 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802 if (num >= ndomains) {
1803 pr_err("%s: No free domain ids\n", iommu->name);
1804 goto err_unlock;
1805 }
1806
1807 set_bit(num, iommu->domain_ids);
1808 info->refcnt = 1;
1809 info->did = num;
1810 info->iommu = iommu;
1811 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812 NULL, info, GFP_ATOMIC);
1813 if (curr) {
1814 ret = xa_err(curr) ? : -EBUSY;
1815 goto err_clear;
1816 }
1817 domain_update_iommu_cap(domain);
1818
1819 spin_unlock(&iommu->lock);
1820 return 0;
1821
1822 err_clear:
1823 clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825 spin_unlock(&iommu->lock);
1826 kfree(info);
1827 return ret;
1828 }
1829
1830 static void domain_detach_iommu(struct dmar_domain *domain,
1831 struct intel_iommu *iommu)
1832 {
1833 struct iommu_domain_info *info;
1834
1835 spin_lock(&iommu->lock);
1836 info = xa_load(&domain->iommu_array, iommu->seq_id);
1837 if (--info->refcnt == 0) {
1838 clear_bit(info->did, iommu->domain_ids);
1839 xa_erase(&domain->iommu_array, iommu->seq_id);
1840 domain->nid = NUMA_NO_NODE;
1841 domain_update_iommu_cap(domain);
1842 kfree(info);
1843 }
1844 spin_unlock(&iommu->lock);
1845 }
1846
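/*
 * Round a guest address width up to the next width a page-table walk can
 * express (12 + 9 * n bits), capped at 64; e.g. a 40-bit gaw rounds up to 48.
 */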
1847 static inline int guestwidth_to_adjustwidth(int gaw)
1848 {
1849 int agaw;
1850 int r = (gaw - 12) % 9;
1851
1852 if (r == 0)
1853 agaw = gaw;
1854 else
1855 agaw = gaw + 9 - r;
1856 if (agaw > 64)
1857 agaw = 64;
1858 return agaw;
1859 }
1860
1861 static void domain_exit(struct dmar_domain *domain)
1862 {
1863 if (domain->pgd) {
1864 LIST_HEAD(freelist);
1865
1866 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1867 put_pages_list(&freelist);
1868 }
1869
1870 if (WARN_ON(!list_empty(&domain->devices)))
1871 return;
1872
1873 kfree(domain);
1874 }
1875
1876 /*
1877 * Get the PASID directory size for scalable mode context entry.
1878 * Value of X in the PDTS field of a scalable mode context entry
1879 * indicates PASID directory with 2^(X + 7) entries.
1880 */
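/*
 * For example, assuming a 2^20 PASID space with 64 PASIDs per directory
 * entry (PASID_PDE_SHIFT == 6), max_pde is 2^14; find_first_bit() yields
 * 14 and the function returns 14 - 7 = 7, i.e. 2^14 directory entries.
 */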
1881 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1882 {
1883 unsigned long pds, max_pde;
1884
1885 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1886 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1887 if (pds < 7)
1888 return 0;
1889
1890 return pds - 7;
1891 }
1892
1893 /*
1894 * Set the RID_PASID field of a scalable mode context entry. The
1895 * IOMMU hardware will use the PASID value set in this field for
1896 * DMA translations of DMA requests without PASID.
1897 */
1898 static inline void
1899 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1900 {
1901 context->hi |= pasid & ((1 << 20) - 1);
1902 }
1903
1904 /*
1905 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1906 * entry.
1907 */
1908 static inline void context_set_sm_dte(struct context_entry *context)
1909 {
1910 context->lo |= BIT_ULL(2);
1911 }
1912
1913 /*
1914 * Set the PRE(Page Request Enable) field of a scalable mode context
1915 * entry.
1916 */
1917 static inline void context_set_sm_pre(struct context_entry *context)
1918 {
1919 context->lo |= BIT_ULL(4);
1920 }
1921
1922 /* Convert value to context PASID directory size field coding. */
1923 #define context_pdts(pds) (((pds) & 0x7) << 9)
1924
1925 static int domain_context_mapping_one(struct dmar_domain *domain,
1926 struct intel_iommu *iommu,
1927 struct pasid_table *table,
1928 u8 bus, u8 devfn)
1929 {
1930 struct device_domain_info *info =
1931 domain_lookup_dev_info(domain, iommu, bus, devfn);
1932 u16 did = domain_id_iommu(domain, iommu);
1933 int translation = CONTEXT_TT_MULTI_LEVEL;
1934 struct context_entry *context;
1935 int ret;
1936
1937 if (hw_pass_through && domain_type_is_si(domain))
1938 translation = CONTEXT_TT_PASS_THROUGH;
1939
1940 pr_debug("Set context mapping for %02x:%02x.%d\n",
1941 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1942
1943 spin_lock(&iommu->lock);
1944 ret = -ENOMEM;
1945 context = iommu_context_addr(iommu, bus, devfn, 1);
1946 if (!context)
1947 goto out_unlock;
1948
1949 ret = 0;
1950 if (context_present(context) && !context_copied(iommu, bus, devfn))
1951 goto out_unlock;
1952
1953 /*
1954 * For kdump cases, old valid entries may be cached due to the
1955 * in-flight DMA and copied pgtable, but there is no unmapping
1956 * behaviour for them, thus we need an explicit cache flush for
1957 * the newly-mapped device. For kdump, at this point, the device
1958 * is expected to have finished its reset during driver probe, so no
1959 * in-flight DMA will exist and we don't need to worry about it
1960 * hereafter.
1961 */
1962 if (context_copied(iommu, bus, devfn)) {
1963 u16 did_old = context_domain_id(context);
1964
1965 if (did_old < cap_ndoms(iommu->cap)) {
1966 iommu->flush.flush_context(iommu, did_old,
1967 (((u16)bus) << 8) | devfn,
1968 DMA_CCMD_MASK_NOBIT,
1969 DMA_CCMD_DEVICE_INVL);
1970 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1971 DMA_TLB_DSI_FLUSH);
1972 }
1973
1974 clear_context_copied(iommu, bus, devfn);
1975 }
1976
1977 context_clear_entry(context);
1978
1979 if (sm_supported(iommu)) {
1980 unsigned long pds;
1981
1982 /* Setup the PASID DIR pointer: */
1983 pds = context_get_sm_pds(table);
1984 context->lo = (u64)virt_to_phys(table->table) |
1985 context_pdts(pds);
1986
1987 /* Setup the RID_PASID field: */
1988 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1989
1990 /*
1991 * Setup the Device-TLB enable bit and Page request
1992 * Enable bit:
1993 */
1994 if (info && info->ats_supported)
1995 context_set_sm_dte(context);
1996 if (info && info->pri_supported)
1997 context_set_sm_pre(context);
1998 if (info && info->pasid_supported)
1999 context_set_pasid(context);
2000 } else {
2001 struct dma_pte *pgd = domain->pgd;
2002 int agaw;
2003
2004 context_set_domain_id(context, did);
2005
2006 if (translation != CONTEXT_TT_PASS_THROUGH) {
2007 /*
2008 * Skip top levels of the page tables for an iommu which has
2009 * a smaller agaw than the default. Unnecessary for PT mode.
2010 */
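/*
 * For example, a domain built with a 4-level page table that is
 * attached to an IOMMU supporting only 3-level tables walks down one
 * level here and programs the context entry with the IOMMU's
 * (smaller) agaw.
 */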
2011 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2012 ret = -ENOMEM;
2013 pgd = phys_to_virt(dma_pte_addr(pgd));
2014 if (!dma_pte_present(pgd))
2015 goto out_unlock;
2016 }
2017
2018 if (info && info->ats_supported)
2019 translation = CONTEXT_TT_DEV_IOTLB;
2020 else
2021 translation = CONTEXT_TT_MULTI_LEVEL;
2022
2023 context_set_address_root(context, virt_to_phys(pgd));
2024 context_set_address_width(context, agaw);
2025 } else {
2026 /*
2027 * In pass through mode, AW must be programmed to
2028 * indicate the largest AGAW value supported by
2029 * hardware. And ASR is ignored by hardware.
2030 */
2031 context_set_address_width(context, iommu->msagaw);
2032 }
2033
2034 context_set_translation_type(context, translation);
2035 }
2036
2037 context_set_fault_enable(context);
2038 context_set_present(context);
2039 if (!ecap_coherent(iommu->ecap))
2040 clflush_cache_range(context, sizeof(*context));
2041
2042 /*
2043 * It's a non-present to present mapping. If hardware doesn't cache
2044 * non-present entries we only need to flush the write-buffer. If it
2045 * _does_ cache non-present entries, then it does so in the special
2046 * domain #0, which we have to flush:
2047 */
2048 if (cap_caching_mode(iommu->cap)) {
2049 iommu->flush.flush_context(iommu, 0,
2050 (((u16)bus) << 8) | devfn,
2051 DMA_CCMD_MASK_NOBIT,
2052 DMA_CCMD_DEVICE_INVL);
2053 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2054 } else {
2055 iommu_flush_write_buffer(iommu);
2056 }
2057
2058 ret = 0;
2059
2060 out_unlock:
2061 spin_unlock(&iommu->lock);
2062
2063 return ret;
2064 }
2065
2066 struct domain_context_mapping_data {
2067 struct dmar_domain *domain;
2068 struct intel_iommu *iommu;
2069 struct pasid_table *table;
2070 };
2071
2072 static int domain_context_mapping_cb(struct pci_dev *pdev,
2073 u16 alias, void *opaque)
2074 {
2075 struct domain_context_mapping_data *data = opaque;
2076
2077 return domain_context_mapping_one(data->domain, data->iommu,
2078 data->table, PCI_BUS_NUM(alias),
2079 alias & 0xff);
2080 }
2081
2082 static int
2083 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2084 {
2085 struct domain_context_mapping_data data;
2086 struct pasid_table *table;
2087 struct intel_iommu *iommu;
2088 u8 bus, devfn;
2089
2090 iommu = device_to_iommu(dev, &bus, &devfn);
2091 if (!iommu)
2092 return -ENODEV;
2093
2094 table = intel_pasid_get_table(dev);
2095
2096 if (!dev_is_pci(dev))
2097 return domain_context_mapping_one(domain, iommu, table,
2098 bus, devfn);
2099
2100 data.domain = domain;
2101 data.iommu = iommu;
2102 data.table = table;
2103
2104 return pci_for_each_dma_alias(to_pci_dev(dev),
2105 &domain_context_mapping_cb, &data);
2106 }
2107
2108 /* Returns a number of VTD pages, but aligned to MM page size */
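/*
 * A small worked example (assuming 4KiB MM pages and 4KiB VT-d pages):
 * host_addr = 0x1234 and size = 0x2000 touch the host range
 * [0x1000, 0x4000), so PAGE_ALIGN(0x234 + 0x2000) >> VTD_PAGE_SHIFT
 * yields 3 pages.
 */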
2109 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2110 size_t size)
2111 {
2112 host_addr &= ~PAGE_MASK;
2113 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2114 }
2115
2116 /* Return largest possible superpage level for a given mapping */
2117 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2118 unsigned long iov_pfn,
2119 unsigned long phy_pfn,
2120 unsigned long pages)
2121 {
2122 int support, level = 1;
2123 unsigned long pfnmerge;
2124
2125 support = domain->iommu_superpage;
2126
2127 /* To use a large page, the virtual *and* physical addresses
2128 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2129 of them will mean we have to use smaller pages. So just
2130 merge them and check both at once. */
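/*
 * For instance (assuming a 9-bit VTD stride): if both pfns are
 * aligned to a 512-page (2MiB) boundary, at least 512 pages are
 * being mapped and the domain supports one level of superpages,
 * the loop below returns level 2, i.e. a 2MiB superpage.
 */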
2131 pfnmerge = iov_pfn | phy_pfn;
2132
2133 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2134 pages >>= VTD_STRIDE_SHIFT;
2135 if (!pages)
2136 break;
2137 pfnmerge >>= VTD_STRIDE_SHIFT;
2138 level++;
2139 support--;
2140 }
2141 return level;
2142 }
2143
2144 /*
2145 * Ensure that old small page tables are removed to make room for superpage(s).
2146 * We're going to add new large pages, so make sure we don't remove their parent
2147 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2148 */
2149 static void switch_to_super_page(struct dmar_domain *domain,
2150 unsigned long start_pfn,
2151 unsigned long end_pfn, int level)
2152 {
2153 unsigned long lvl_pages = lvl_to_nr_pages(level);
2154 struct iommu_domain_info *info;
2155 struct dma_pte *pte = NULL;
2156 unsigned long i;
2157
2158 while (start_pfn <= end_pfn) {
2159 if (!pte)
2160 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2161 GFP_ATOMIC);
2162
2163 if (dma_pte_present(pte)) {
2164 dma_pte_free_pagetable(domain, start_pfn,
2165 start_pfn + lvl_pages - 1,
2166 level + 1);
2167
2168 xa_for_each(&domain->iommu_array, i, info)
2169 iommu_flush_iotlb_psi(info->iommu, domain,
2170 start_pfn, lvl_pages,
2171 0, 0);
2172 }
2173
2174 pte++;
2175 start_pfn += lvl_pages;
2176 if (first_pte_in_page(pte))
2177 pte = NULL;
2178 }
2179 }
2180
2181 static int
2182 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2183 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2184 gfp_t gfp)
2185 {
2186 struct dma_pte *first_pte = NULL, *pte = NULL;
2187 unsigned int largepage_lvl = 0;
2188 unsigned long lvl_pages = 0;
2189 phys_addr_t pteval;
2190 u64 attr;
2191
2192 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2193 return -EINVAL;
2194
2195 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2196 return -EINVAL;
2197
2198 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2199 attr |= DMA_FL_PTE_PRESENT;
2200 if (domain->use_first_level) {
2201 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2202 if (prot & DMA_PTE_WRITE)
2203 attr |= DMA_FL_PTE_DIRTY;
2204 }
2205
2206 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2207
2208 while (nr_pages > 0) {
2209 uint64_t tmp;
2210
2211 if (!pte) {
2212 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2213 phys_pfn, nr_pages);
2214
2215 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2216 gfp);
2217 if (!pte)
2218 return -ENOMEM;
2219 first_pte = pte;
2220
2221 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2222
2223 /* It is a large page */
2224 if (largepage_lvl > 1) {
2225 unsigned long end_pfn;
2226 unsigned long pages_to_remove;
2227
2228 pteval |= DMA_PTE_LARGE_PAGE;
2229 pages_to_remove = min_t(unsigned long, nr_pages,
2230 nr_pte_to_next_page(pte) * lvl_pages);
2231 end_pfn = iov_pfn + pages_to_remove - 1;
2232 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2233 } else {
2234 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2235 }
2236
2237 }
2238 /* We don't need a lock here; nobody else
2239 * touches the iova range.
2240 */
2241 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2242 if (tmp) {
2243 static int dumps = 5;
2244 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2245 iov_pfn, tmp, (unsigned long long)pteval);
2246 if (dumps) {
2247 dumps--;
2248 debug_dma_dump_mappings(NULL);
2249 }
2250 WARN_ON(1);
2251 }
2252
2253 nr_pages -= lvl_pages;
2254 iov_pfn += lvl_pages;
2255 phys_pfn += lvl_pages;
2256 pteval += lvl_pages * VTD_PAGE_SIZE;
2257
2258 /* If the next PTE would be the first in a new page, then we
2259 * need to flush the cache on the entries we've just written.
2260 * And then we'll need to recalculate 'pte', so clear it and
2261 * let it get set again in the if (!pte) block above.
2262 *
2263 * If we're done (!nr_pages) we need to flush the cache too.
2264 *
2265 * Also if we've been setting superpages, we may need to
2266 * recalculate 'pte' and switch back to smaller pages for the
2267 * end of the mapping, if the trailing size is not enough to
2268 * use another superpage (i.e. nr_pages < lvl_pages).
2269 */
2270 pte++;
2271 if (!nr_pages || first_pte_in_page(pte) ||
2272 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2273 domain_flush_cache(domain, first_pte,
2274 (void *)pte - (void *)first_pte);
2275 pte = NULL;
2276 }
2277 }
2278
2279 return 0;
2280 }
2281
2282 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2283 {
2284 struct intel_iommu *iommu = info->iommu;
2285 struct context_entry *context;
2286 u16 did_old;
2287
2288 if (!iommu)
2289 return;
2290
2291 spin_lock(&iommu->lock);
2292 context = iommu_context_addr(iommu, bus, devfn, 0);
2293 if (!context) {
2294 spin_unlock(&iommu->lock);
2295 return;
2296 }
2297
2298 if (sm_supported(iommu)) {
2299 if (hw_pass_through && domain_type_is_si(info->domain))
2300 did_old = FLPT_DEFAULT_DID;
2301 else
2302 did_old = domain_id_iommu(info->domain, iommu);
2303 } else {
2304 did_old = context_domain_id(context);
2305 }
2306
2307 context_clear_entry(context);
2308 __iommu_flush_cache(iommu, context, sizeof(*context));
2309 spin_unlock(&iommu->lock);
2310 iommu->flush.flush_context(iommu,
2311 did_old,
2312 (((u16)bus) << 8) | devfn,
2313 DMA_CCMD_MASK_NOBIT,
2314 DMA_CCMD_DEVICE_INVL);
2315
2316 if (sm_supported(iommu))
2317 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2318
2319 iommu->flush.flush_iotlb(iommu,
2320 did_old,
2321 0,
2322 0,
2323 DMA_TLB_DSI_FLUSH);
2324
2325 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2326 }
2327
2328 static int domain_setup_first_level(struct intel_iommu *iommu,
2329 struct dmar_domain *domain,
2330 struct device *dev,
2331 u32 pasid)
2332 {
2333 struct dma_pte *pgd = domain->pgd;
2334 int agaw, level;
2335 int flags = 0;
2336
2337 /*
2338 * Skip top levels of the page tables for an iommu which has
2339 * a smaller agaw than the default. Unnecessary for PT mode.
2340 */
2341 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2342 pgd = phys_to_virt(dma_pte_addr(pgd));
2343 if (!dma_pte_present(pgd))
2344 return -ENOMEM;
2345 }
2346
2347 level = agaw_to_level(agaw);
2348 if (level != 4 && level != 5)
2349 return -EINVAL;
2350
2351 if (level == 5)
2352 flags |= PASID_FLAG_FL5LP;
2353
2354 if (domain->force_snooping)
2355 flags |= PASID_FLAG_PAGE_SNOOP;
2356
2357 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2358 domain_id_iommu(domain, iommu),
2359 flags);
2360 }
2361
2362 static bool dev_is_real_dma_subdevice(struct device *dev)
2363 {
2364 return dev && dev_is_pci(dev) &&
2365 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2366 }
2367
2368 static int iommu_domain_identity_map(struct dmar_domain *domain,
2369 unsigned long first_vpfn,
2370 unsigned long last_vpfn)
2371 {
2372 /*
2373 * RMRR range might have overlap with physical memory range,
2374 * clear it first
2375 */
2376 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2377
2378 return __domain_mapping(domain, first_vpfn,
2379 first_vpfn, last_vpfn - first_vpfn + 1,
2380 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2381 }
2382
2383 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2384
2385 static int __init si_domain_init(int hw)
2386 {
2387 struct dmar_rmrr_unit *rmrr;
2388 struct device *dev;
2389 int i, nid, ret;
2390
2391 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2392 if (!si_domain)
2393 return -EFAULT;
2394
2395 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2396 domain_exit(si_domain);
2397 si_domain = NULL;
2398 return -EFAULT;
2399 }
2400
2401 if (hw)
2402 return 0;
2403
2404 for_each_online_node(nid) {
2405 unsigned long start_pfn, end_pfn;
2406 int i;
2407
2408 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2409 ret = iommu_domain_identity_map(si_domain,
2410 mm_to_dma_pfn_start(start_pfn),
2411 mm_to_dma_pfn_end(end_pfn));
2412 if (ret)
2413 return ret;
2414 }
2415 }
2416
2417 /*
2418 * Identity map the RMRRs so that devices with RMRRs could also use
2419 * the si_domain.
2420 */
2421 for_each_rmrr_units(rmrr) {
2422 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2423 i, dev) {
2424 unsigned long long start = rmrr->base_address;
2425 unsigned long long end = rmrr->end_address;
2426
2427 if (WARN_ON(end < start ||
2428 end >> agaw_to_width(si_domain->agaw)))
2429 continue;
2430
2431 ret = iommu_domain_identity_map(si_domain,
2432 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2433 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2434 if (ret)
2435 return ret;
2436 }
2437 }
2438
2439 return 0;
2440 }
2441
2442 static int dmar_domain_attach_device(struct dmar_domain *domain,
2443 struct device *dev)
2444 {
2445 struct device_domain_info *info = dev_iommu_priv_get(dev);
2446 struct intel_iommu *iommu;
2447 unsigned long flags;
2448 u8 bus, devfn;
2449 int ret;
2450
2451 iommu = device_to_iommu(dev, &bus, &devfn);
2452 if (!iommu)
2453 return -ENODEV;
2454
2455 ret = domain_attach_iommu(domain, iommu);
2456 if (ret)
2457 return ret;
2458 info->domain = domain;
2459 spin_lock_irqsave(&domain->lock, flags);
2460 list_add(&info->link, &domain->devices);
2461 spin_unlock_irqrestore(&domain->lock, flags);
2462
2463 /* PASID table is mandatory for a PCI device in scalable mode. */
2464 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2465 /* Setup the PASID entry for requests without PASID: */
2466 if (hw_pass_through && domain_type_is_si(domain))
2467 ret = intel_pasid_setup_pass_through(iommu, domain,
2468 dev, IOMMU_NO_PASID);
2469 else if (domain->use_first_level)
2470 ret = domain_setup_first_level(iommu, domain, dev,
2471 IOMMU_NO_PASID);
2472 else
2473 ret = intel_pasid_setup_second_level(iommu, domain,
2474 dev, IOMMU_NO_PASID);
2475 if (ret) {
2476 dev_err(dev, "Setup RID2PASID failed\n");
2477 device_block_translation(dev);
2478 return ret;
2479 }
2480 }
2481
2482 ret = domain_context_mapping(domain, dev);
2483 if (ret) {
2484 dev_err(dev, "Domain context map failed\n");
2485 device_block_translation(dev);
2486 return ret;
2487 }
2488
2489 iommu_enable_pci_caps(info);
2490
2491 return 0;
2492 }
2493
2494 /**
2495 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2496 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2497 * @dev: device handle
2498 *
2499 * We assume that PCI USB devices with RMRRs have them largely
2500 * for historical reasons and that the RMRR space is not actively used post
2501 * boot. This exclusion may change if vendors begin to abuse it.
2502 *
2503 * The same exception is made for graphics devices, with the requirement that
2504 * any use of the RMRR regions will be torn down before assigning the device
2505 * to a guest.
2506 *
2507 * Return: true if the RMRR is relaxable, false otherwise
2508 */
2509 static bool device_rmrr_is_relaxable(struct device *dev)
2510 {
2511 struct pci_dev *pdev;
2512
2513 if (!dev_is_pci(dev))
2514 return false;
2515
2516 pdev = to_pci_dev(dev);
2517 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2518 return true;
2519 else
2520 return false;
2521 }
2522
2523 /*
2524 * Return the required default domain type for a specific device.
2525 *
2526 * @dev: the device in question
2528 *
2529 * Returns:
2530 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2531 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2532 * - 0: both identity and dynamic domains work for this device
2533 */
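/*
 * For example, when graphics mapping is disabled, IDENTMAP_GFX is set
 * during init_dmars() and every graphics device is given an identity
 * default domain here.
 */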
2534 static int device_def_domain_type(struct device *dev)
2535 {
2536 if (dev_is_pci(dev)) {
2537 struct pci_dev *pdev = to_pci_dev(dev);
2538
2539 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2540 return IOMMU_DOMAIN_IDENTITY;
2541
2542 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2543 return IOMMU_DOMAIN_IDENTITY;
2544 }
2545
2546 return 0;
2547 }
2548
2549 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2550 {
2551 /*
2552 * Start from a sane iommu hardware state.
2553 * If queued invalidation was already initialized by us
2554 * (for example, while enabling interrupt remapping) then
2555 * things are already rolling from a sane state.
2556 */
2557 if (!iommu->qi) {
2558 /*
2559 * Clear any previous faults.
2560 */
2561 dmar_fault(-1, iommu);
2562 /*
2563 * Disable queued invalidation if supported and already enabled
2564 * before OS handover.
2565 */
2566 dmar_disable_qi(iommu);
2567 }
2568
2569 if (dmar_enable_qi(iommu)) {
2570 /*
2571 * Queued Invalidate not enabled, use Register Based Invalidate
2572 */
2573 iommu->flush.flush_context = __iommu_flush_context;
2574 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2575 pr_info("%s: Using Register based invalidation\n",
2576 iommu->name);
2577 } else {
2578 iommu->flush.flush_context = qi_flush_context;
2579 iommu->flush.flush_iotlb = qi_flush_iotlb;
2580 pr_info("%s: Using Queued invalidation\n", iommu->name);
2581 }
2582 }
2583
2584 static int copy_context_table(struct intel_iommu *iommu,
2585 struct root_entry *old_re,
2586 struct context_entry **tbl,
2587 int bus, bool ext)
2588 {
2589 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2590 struct context_entry *new_ce = NULL, ce;
2591 struct context_entry *old_ce = NULL;
2592 struct root_entry re;
2593 phys_addr_t old_ce_phys;
2594
2595 tbl_idx = ext ? bus * 2 : bus;
2596 memcpy(&re, old_re, sizeof(re));
2597
2598 for (devfn = 0; devfn < 256; devfn++) {
2599 /* First calculate the correct index */
2600 idx = (ext ? devfn * 2 : devfn) % 256;
2601
2602 if (idx == 0) {
2603 /* First save what we may have and clean up */
2604 if (new_ce) {
2605 tbl[tbl_idx] = new_ce;
2606 __iommu_flush_cache(iommu, new_ce,
2607 VTD_PAGE_SIZE);
2608 pos = 1;
2609 }
2610
2611 if (old_ce)
2612 memunmap(old_ce);
2613
2614 ret = 0;
2615 if (devfn < 0x80)
2616 old_ce_phys = root_entry_lctp(&re);
2617 else
2618 old_ce_phys = root_entry_uctp(&re);
2619
2620 if (!old_ce_phys) {
2621 if (ext && devfn == 0) {
2622 /* No LCTP, try UCTP */
2623 devfn = 0x7f;
2624 continue;
2625 } else {
2626 goto out;
2627 }
2628 }
2629
2630 ret = -ENOMEM;
2631 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2632 MEMREMAP_WB);
2633 if (!old_ce)
2634 goto out;
2635
2636 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2637 if (!new_ce)
2638 goto out_unmap;
2639
2640 ret = 0;
2641 }
2642
2643 /* Now copy the context entry */
2644 memcpy(&ce, old_ce + idx, sizeof(ce));
2645
2646 if (!context_present(&ce))
2647 continue;
2648
2649 did = context_domain_id(&ce);
2650 if (did >= 0 && did < cap_ndoms(iommu->cap))
2651 set_bit(did, iommu->domain_ids);
2652
2653 set_context_copied(iommu, bus, devfn);
2654 new_ce[idx] = ce;
2655 }
2656
2657 tbl[tbl_idx + pos] = new_ce;
2658
2659 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2660
2661 out_unmap:
2662 memunmap(old_ce);
2663
2664 out:
2665 return ret;
2666 }
2667
2668 static int copy_translation_tables(struct intel_iommu *iommu)
2669 {
2670 struct context_entry **ctxt_tbls;
2671 struct root_entry *old_rt;
2672 phys_addr_t old_rt_phys;
2673 int ctxt_table_entries;
2674 u64 rtaddr_reg;
2675 int bus, ret;
2676 bool new_ext, ext;
2677
2678 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2679 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2680 new_ext = !!sm_supported(iommu);
2681
2682 /*
2683 * The RTT bit can only be changed when translation is disabled,
2684 * but disabling translation would open a window for data
2685 * corruption. So bail out and don't copy anything if we would
2686 * have to change the bit.
2687 */
2688 if (new_ext != ext)
2689 return -EINVAL;
2690
2691 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2692 if (!iommu->copied_tables)
2693 return -ENOMEM;
2694
2695 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2696 if (!old_rt_phys)
2697 return -EINVAL;
2698
2699 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2700 if (!old_rt)
2701 return -ENOMEM;
2702
2703 /* This is too big for the stack - allocate it from slab */
2704 ctxt_table_entries = ext ? 512 : 256;
2705 ret = -ENOMEM;
2706 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2707 if (!ctxt_tbls)
2708 goto out_unmap;
2709
2710 for (bus = 0; bus < 256; bus++) {
2711 ret = copy_context_table(iommu, &old_rt[bus],
2712 ctxt_tbls, bus, ext);
2713 if (ret) {
2714 pr_err("%s: Failed to copy context table for bus %d\n",
2715 iommu->name, bus);
2716 continue;
2717 }
2718 }
2719
2720 spin_lock(&iommu->lock);
2721
2722 /* Context tables are copied, now write them to the root_entry table */
2723 for (bus = 0; bus < 256; bus++) {
2724 int idx = ext ? bus * 2 : bus;
2725 u64 val;
2726
2727 if (ctxt_tbls[idx]) {
2728 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2729 iommu->root_entry[bus].lo = val;
2730 }
2731
2732 if (!ext || !ctxt_tbls[idx + 1])
2733 continue;
2734
2735 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2736 iommu->root_entry[bus].hi = val;
2737 }
2738
2739 spin_unlock(&iommu->lock);
2740
2741 kfree(ctxt_tbls);
2742
2743 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2744
2745 ret = 0;
2746
2747 out_unmap:
2748 memunmap(old_rt);
2749
2750 return ret;
2751 }
2752
2753 static int __init init_dmars(void)
2754 {
2755 struct dmar_drhd_unit *drhd;
2756 struct intel_iommu *iommu;
2757 int ret;
2758
2759 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2760 if (ret)
2761 goto free_iommu;
2762
2763 for_each_iommu(iommu, drhd) {
2764 if (drhd->ignored) {
2765 iommu_disable_translation(iommu);
2766 continue;
2767 }
2768
2769 /*
2770 * Find the max pasid size of all IOMMUs in the system.
2771 * We need to ensure the system pasid table is no bigger
2772 * than the smallest supported.
2773 */
2774 if (pasid_supported(iommu)) {
2775 u32 temp = 2 << ecap_pss(iommu->ecap);
2776
2777 intel_pasid_max_id = min_t(u32, temp,
2778 intel_pasid_max_id);
2779 }
2780
2781 intel_iommu_init_qi(iommu);
2782
2783 ret = iommu_init_domains(iommu);
2784 if (ret)
2785 goto free_iommu;
2786
2787 init_translation_status(iommu);
2788
2789 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2790 iommu_disable_translation(iommu);
2791 clear_translation_pre_enabled(iommu);
2792 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2793 iommu->name);
2794 }
2795
2796 /*
2797 * TBD:
2798 * we could share the same root & context tables
2799 * among all IOMMUs. Need to split it later.
2800 */
2801 ret = iommu_alloc_root_entry(iommu);
2802 if (ret)
2803 goto free_iommu;
2804
2805 if (translation_pre_enabled(iommu)) {
2806 pr_info("Translation already enabled - trying to copy translation structures\n");
2807
2808 ret = copy_translation_tables(iommu);
2809 if (ret) {
2810 /*
2811 * We found the IOMMU with translation
2812 * enabled - but failed to copy over the
2813 * old root-entry table. Try to proceed
2814 * by disabling translation now and
2815 * allocating a clean root-entry table.
2816 * This might cause DMAR faults, but
2817 * probably the dump will still succeed.
2818 */
2819 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2820 iommu->name);
2821 iommu_disable_translation(iommu);
2822 clear_translation_pre_enabled(iommu);
2823 } else {
2824 pr_info("Copied translation tables from previous kernel for %s\n",
2825 iommu->name);
2826 }
2827 }
2828
2829 if (!ecap_pass_through(iommu->ecap))
2830 hw_pass_through = 0;
2831 intel_svm_check(iommu);
2832 }
2833
2834 /*
2835 * Now that qi is enabled on all iommus, set the root entry and flush
2836 * caches. This is required on some Intel X58 chipsets, otherwise the
2837 * flush_context function will loop forever and the boot hangs.
2838 */
2839 for_each_active_iommu(iommu, drhd) {
2840 iommu_flush_write_buffer(iommu);
2841 iommu_set_root_entry(iommu);
2842 }
2843
2844 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2845 dmar_map_gfx = 0;
2846 #endif
2847
2848 if (!dmar_map_gfx)
2849 iommu_identity_mapping |= IDENTMAP_GFX;
2850
2851 check_tylersburg_isoch();
2852
2853 ret = si_domain_init(hw_pass_through);
2854 if (ret)
2855 goto free_iommu;
2856
2857 /*
2858 * for each drhd
2859 * enable fault log
2860 * global invalidate context cache
2861 * global invalidate iotlb
2862 * enable translation
2863 */
2864 for_each_iommu(iommu, drhd) {
2865 if (drhd->ignored) {
2866 /*
2867 * we always have to disable PMRs or DMA may fail on
2868 * this device
2869 */
2870 if (force_on)
2871 iommu_disable_protect_mem_regions(iommu);
2872 continue;
2873 }
2874
2875 iommu_flush_write_buffer(iommu);
2876
2877 #ifdef CONFIG_INTEL_IOMMU_SVM
2878 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2879 /*
2880 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2881 * could cause a lock race, so drop the lock around the call.
2882 */
2883 up_write(&dmar_global_lock);
2884 ret = intel_svm_enable_prq(iommu);
2885 down_write(&dmar_global_lock);
2886 if (ret)
2887 goto free_iommu;
2888 }
2889 #endif
2890 ret = dmar_set_interrupt(iommu);
2891 if (ret)
2892 goto free_iommu;
2893 }
2894
2895 return 0;
2896
2897 free_iommu:
2898 for_each_active_iommu(iommu, drhd) {
2899 disable_dmar_iommu(iommu);
2900 free_dmar_iommu(iommu);
2901 }
2902 if (si_domain) {
2903 domain_exit(si_domain);
2904 si_domain = NULL;
2905 }
2906
2907 return ret;
2908 }
2909
2910 static void __init init_no_remapping_devices(void)
2911 {
2912 struct dmar_drhd_unit *drhd;
2913 struct device *dev;
2914 int i;
2915
2916 for_each_drhd_unit(drhd) {
2917 if (!drhd->include_all) {
2918 for_each_active_dev_scope(drhd->devices,
2919 drhd->devices_cnt, i, dev)
2920 break;
2921 /* ignore DMAR unit if no devices exist */
2922 if (i == drhd->devices_cnt)
2923 drhd->ignored = 1;
2924 }
2925 }
2926
2927 for_each_active_drhd_unit(drhd) {
2928 if (drhd->include_all)
2929 continue;
2930
2931 for_each_active_dev_scope(drhd->devices,
2932 drhd->devices_cnt, i, dev)
2933 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2934 break;
2935 if (i < drhd->devices_cnt)
2936 continue;
2937
2938 /* This IOMMU has *only* gfx devices. Either bypass it or
2939 set the gfx_dedicated flag, as appropriate. */
2940 drhd->gfx_dedicated = 1;
2941 if (!dmar_map_gfx)
2942 drhd->ignored = 1;
2943 }
2944 }
2945
2946 #ifdef CONFIG_SUSPEND
2947 static int init_iommu_hw(void)
2948 {
2949 struct dmar_drhd_unit *drhd;
2950 struct intel_iommu *iommu = NULL;
2951 int ret;
2952
2953 for_each_active_iommu(iommu, drhd) {
2954 if (iommu->qi) {
2955 ret = dmar_reenable_qi(iommu);
2956 if (ret)
2957 return ret;
2958 }
2959 }
2960
2961 for_each_iommu(iommu, drhd) {
2962 if (drhd->ignored) {
2963 /*
2964 * we always have to disable PMRs or DMA may fail on
2965 * this device
2966 */
2967 if (force_on)
2968 iommu_disable_protect_mem_regions(iommu);
2969 continue;
2970 }
2971
2972 iommu_flush_write_buffer(iommu);
2973 iommu_set_root_entry(iommu);
2974 iommu_enable_translation(iommu);
2975 iommu_disable_protect_mem_regions(iommu);
2976 }
2977
2978 return 0;
2979 }
2980
2981 static void iommu_flush_all(void)
2982 {
2983 struct dmar_drhd_unit *drhd;
2984 struct intel_iommu *iommu;
2985
2986 for_each_active_iommu(iommu, drhd) {
2987 iommu->flush.flush_context(iommu, 0, 0, 0,
2988 DMA_CCMD_GLOBAL_INVL);
2989 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2990 DMA_TLB_GLOBAL_FLUSH);
2991 }
2992 }
2993
2994 static int iommu_suspend(void)
2995 {
2996 struct dmar_drhd_unit *drhd;
2997 struct intel_iommu *iommu = NULL;
2998 unsigned long flag;
2999
3000 for_each_active_iommu(iommu, drhd) {
3001 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3002 GFP_KERNEL);
3003 if (!iommu->iommu_state)
3004 goto nomem;
3005 }
3006
3007 iommu_flush_all();
3008
3009 for_each_active_iommu(iommu, drhd) {
3010 iommu_disable_translation(iommu);
3011
3012 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3013
3014 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3015 readl(iommu->reg + DMAR_FECTL_REG);
3016 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3017 readl(iommu->reg + DMAR_FEDATA_REG);
3018 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3019 readl(iommu->reg + DMAR_FEADDR_REG);
3020 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3021 readl(iommu->reg + DMAR_FEUADDR_REG);
3022
3023 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3024 }
3025 return 0;
3026
3027 nomem:
3028 for_each_active_iommu(iommu, drhd)
3029 kfree(iommu->iommu_state);
3030
3031 return -ENOMEM;
3032 }
3033
3034 static void iommu_resume(void)
3035 {
3036 struct dmar_drhd_unit *drhd;
3037 struct intel_iommu *iommu = NULL;
3038 unsigned long flag;
3039
3040 if (init_iommu_hw()) {
3041 if (force_on)
3042 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3043 else
3044 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3045 return;
3046 }
3047
3048 for_each_active_iommu(iommu, drhd) {
3049
3050 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3051
3052 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3053 iommu->reg + DMAR_FECTL_REG);
3054 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3055 iommu->reg + DMAR_FEDATA_REG);
3056 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3057 iommu->reg + DMAR_FEADDR_REG);
3058 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3059 iommu->reg + DMAR_FEUADDR_REG);
3060
3061 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3062 }
3063
3064 for_each_active_iommu(iommu, drhd)
3065 kfree(iommu->iommu_state);
3066 }
3067
3068 static struct syscore_ops iommu_syscore_ops = {
3069 .resume = iommu_resume,
3070 .suspend = iommu_suspend,
3071 };
3072
3073 static void __init init_iommu_pm_ops(void)
3074 {
3075 register_syscore_ops(&iommu_syscore_ops);
3076 }
3077
3078 #else
3079 static inline void init_iommu_pm_ops(void) {}
3080 #endif /* CONFIG_SUSPEND */
3081
3082 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3083 {
3084 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3085 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3086 rmrr->end_address <= rmrr->base_address ||
3087 arch_rmrr_sanity_check(rmrr))
3088 return -EINVAL;
3089
3090 return 0;
3091 }
3092
3093 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3094 {
3095 struct acpi_dmar_reserved_memory *rmrr;
3096 struct dmar_rmrr_unit *rmrru;
3097
3098 rmrr = (struct acpi_dmar_reserved_memory *)header;
3099 if (rmrr_sanity_check(rmrr)) {
3100 pr_warn(FW_BUG
3101 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3103 rmrr->base_address, rmrr->end_address,
3104 dmi_get_system_info(DMI_BIOS_VENDOR),
3105 dmi_get_system_info(DMI_BIOS_VERSION),
3106 dmi_get_system_info(DMI_PRODUCT_VERSION));
3107 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3108 }
3109
3110 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3111 if (!rmrru)
3112 goto out;
3113
3114 rmrru->hdr = header;
3115
3116 rmrru->base_address = rmrr->base_address;
3117 rmrru->end_address = rmrr->end_address;
3118
3119 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3120 ((void *)rmrr) + rmrr->header.length,
3121 &rmrru->devices_cnt);
3122 if (rmrru->devices_cnt && rmrru->devices == NULL)
3123 goto free_rmrru;
3124
3125 list_add(&rmrru->list, &dmar_rmrr_units);
3126
3127 return 0;
3128 free_rmrru:
3129 kfree(rmrru);
3130 out:
3131 return -ENOMEM;
3132 }
3133
3134 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3135 {
3136 struct dmar_atsr_unit *atsru;
3137 struct acpi_dmar_atsr *tmp;
3138
3139 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3140 dmar_rcu_check()) {
3141 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3142 if (atsr->segment != tmp->segment)
3143 continue;
3144 if (atsr->header.length != tmp->header.length)
3145 continue;
3146 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3147 return atsru;
3148 }
3149
3150 return NULL;
3151 }
3152
3153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3154 {
3155 struct acpi_dmar_atsr *atsr;
3156 struct dmar_atsr_unit *atsru;
3157
3158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3159 return 0;
3160
3161 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3162 atsru = dmar_find_atsr(atsr);
3163 if (atsru)
3164 return 0;
3165
3166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3167 if (!atsru)
3168 return -ENOMEM;
3169
3170 /*
3171 * If memory is allocated from slab by ACPI _DSM method, we need to
3172 * copy the memory content because the memory buffer will be freed
3173 * on return.
3174 */
3175 atsru->hdr = (void *)(atsru + 1);
3176 memcpy(atsru->hdr, hdr, hdr->length);
3177 atsru->include_all = atsr->flags & 0x1;
3178 if (!atsru->include_all) {
3179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3180 (void *)atsr + atsr->header.length,
3181 &atsru->devices_cnt);
3182 if (atsru->devices_cnt && atsru->devices == NULL) {
3183 kfree(atsru);
3184 return -ENOMEM;
3185 }
3186 }
3187
3188 list_add_rcu(&atsru->list, &dmar_atsr_units);
3189
3190 return 0;
3191 }
3192
3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3194 {
3195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3196 kfree(atsru);
3197 }
3198
3199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3200 {
3201 struct acpi_dmar_atsr *atsr;
3202 struct dmar_atsr_unit *atsru;
3203
3204 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3205 atsru = dmar_find_atsr(atsr);
3206 if (atsru) {
3207 list_del_rcu(&atsru->list);
3208 synchronize_rcu();
3209 intel_iommu_free_atsr(atsru);
3210 }
3211
3212 return 0;
3213 }
3214
3215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3216 {
3217 int i;
3218 struct device *dev;
3219 struct acpi_dmar_atsr *atsr;
3220 struct dmar_atsr_unit *atsru;
3221
3222 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3223 atsru = dmar_find_atsr(atsr);
3224 if (!atsru)
3225 return 0;
3226
3227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3229 i, dev)
3230 return -EBUSY;
3231 }
3232
3233 return 0;
3234 }
3235
3236 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3237 {
3238 struct dmar_satc_unit *satcu;
3239 struct acpi_dmar_satc *tmp;
3240
3241 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3242 dmar_rcu_check()) {
3243 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3244 if (satc->segment != tmp->segment)
3245 continue;
3246 if (satc->header.length != tmp->header.length)
3247 continue;
3248 if (memcmp(satc, tmp, satc->header.length) == 0)
3249 return satcu;
3250 }
3251
3252 return NULL;
3253 }
3254
3255 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3256 {
3257 struct acpi_dmar_satc *satc;
3258 struct dmar_satc_unit *satcu;
3259
3260 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3261 return 0;
3262
3263 satc = container_of(hdr, struct acpi_dmar_satc, header);
3264 satcu = dmar_find_satc(satc);
3265 if (satcu)
3266 return 0;
3267
3268 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3269 if (!satcu)
3270 return -ENOMEM;
3271
3272 satcu->hdr = (void *)(satcu + 1);
3273 memcpy(satcu->hdr, hdr, hdr->length);
3274 satcu->atc_required = satc->flags & 0x1;
3275 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3276 (void *)satc + satc->header.length,
3277 &satcu->devices_cnt);
3278 if (satcu->devices_cnt && !satcu->devices) {
3279 kfree(satcu);
3280 return -ENOMEM;
3281 }
3282 list_add_rcu(&satcu->list, &dmar_satc_units);
3283
3284 return 0;
3285 }
3286
3287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3288 {
3289 int sp, ret;
3290 struct intel_iommu *iommu = dmaru->iommu;
3291
3292 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3293 if (ret)
3294 goto out;
3295
3296 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3297 pr_warn("%s: Doesn't support hardware pass through.\n",
3298 iommu->name);
3299 return -ENXIO;
3300 }
3301
3302 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3303 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3304 pr_warn("%s: Doesn't support large page.\n",
3305 iommu->name);
3306 return -ENXIO;
3307 }
3308
3309 /*
3310 * Disable translation if already enabled prior to OS handover.
3311 */
3312 if (iommu->gcmd & DMA_GCMD_TE)
3313 iommu_disable_translation(iommu);
3314
3315 ret = iommu_init_domains(iommu);
3316 if (ret == 0)
3317 ret = iommu_alloc_root_entry(iommu);
3318 if (ret)
3319 goto out;
3320
3321 intel_svm_check(iommu);
3322
3323 if (dmaru->ignored) {
3324 /*
3325 * we always have to disable PMRs or DMA may fail on this device
3326 */
3327 if (force_on)
3328 iommu_disable_protect_mem_regions(iommu);
3329 return 0;
3330 }
3331
3332 intel_iommu_init_qi(iommu);
3333 iommu_flush_write_buffer(iommu);
3334
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337 ret = intel_svm_enable_prq(iommu);
3338 if (ret)
3339 goto disable_iommu;
3340 }
3341 #endif
3342 ret = dmar_set_interrupt(iommu);
3343 if (ret)
3344 goto disable_iommu;
3345
3346 iommu_set_root_entry(iommu);
3347 iommu_enable_translation(iommu);
3348
3349 iommu_disable_protect_mem_regions(iommu);
3350 return 0;
3351
3352 disable_iommu:
3353 disable_dmar_iommu(iommu);
3354 out:
3355 free_dmar_iommu(iommu);
3356 return ret;
3357 }
3358
3359 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3360 {
3361 int ret = 0;
3362 struct intel_iommu *iommu = dmaru->iommu;
3363
3364 if (!intel_iommu_enabled)
3365 return 0;
3366 if (iommu == NULL)
3367 return -EINVAL;
3368
3369 if (insert) {
3370 ret = intel_iommu_add(dmaru);
3371 } else {
3372 disable_dmar_iommu(iommu);
3373 free_dmar_iommu(iommu);
3374 }
3375
3376 return ret;
3377 }
3378
3379 static void intel_iommu_free_dmars(void)
3380 {
3381 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3382 struct dmar_atsr_unit *atsru, *atsr_n;
3383 struct dmar_satc_unit *satcu, *satc_n;
3384
3385 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3386 list_del(&rmrru->list);
3387 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3388 kfree(rmrru);
3389 }
3390
3391 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3392 list_del(&atsru->list);
3393 intel_iommu_free_atsr(atsru);
3394 }
3395 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3396 list_del(&satcu->list);
3397 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3398 kfree(satcu);
3399 }
3400 }
3401
3402 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3403 {
3404 struct dmar_satc_unit *satcu;
3405 struct acpi_dmar_satc *satc;
3406 struct device *tmp;
3407 int i;
3408
3409 dev = pci_physfn(dev);
3410 rcu_read_lock();
3411
3412 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3413 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3414 if (satc->segment != pci_domain_nr(dev->bus))
3415 continue;
3416 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3417 if (to_pci_dev(tmp) == dev)
3418 goto out;
3419 }
3420 satcu = NULL;
3421 out:
3422 rcu_read_unlock();
3423 return satcu;
3424 }
3425
3426 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3427 {
3428 int i, ret = 1;
3429 struct pci_bus *bus;
3430 struct pci_dev *bridge = NULL;
3431 struct device *tmp;
3432 struct acpi_dmar_atsr *atsr;
3433 struct dmar_atsr_unit *atsru;
3434 struct dmar_satc_unit *satcu;
3435
3436 dev = pci_physfn(dev);
3437 satcu = dmar_find_matched_satc_unit(dev);
3438 if (satcu)
3439 /*
3440 * This device supports ATS as it is in the SATC table.
3441 * When the IOMMU is in legacy mode, enabling ATS is done
3442 * automatically by HW for the device that requires
3443 * ATS, hence the OS should not enable ATS on this device
3444 * to avoid duplicated TLB invalidation.
3445 */
3446 return !(satcu->atc_required && !sm_supported(iommu));
3447
3448 for (bus = dev->bus; bus; bus = bus->parent) {
3449 bridge = bus->self;
3450 /* If it's an integrated device, allow ATS */
3451 if (!bridge)
3452 return 1;
3453 /* Connected via non-PCIe: no ATS */
3454 if (!pci_is_pcie(bridge) ||
3455 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3456 return 0;
3457 /* If we found the root port, look it up in the ATSR */
3458 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3459 break;
3460 }
3461
3462 rcu_read_lock();
3463 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3464 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3465 if (atsr->segment != pci_domain_nr(dev->bus))
3466 continue;
3467
3468 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3469 if (tmp == &bridge->dev)
3470 goto out;
3471
3472 if (atsru->include_all)
3473 goto out;
3474 }
3475 ret = 0;
3476 out:
3477 rcu_read_unlock();
3478
3479 return ret;
3480 }
3481
3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3483 {
3484 int ret;
3485 struct dmar_rmrr_unit *rmrru;
3486 struct dmar_atsr_unit *atsru;
3487 struct dmar_satc_unit *satcu;
3488 struct acpi_dmar_atsr *atsr;
3489 struct acpi_dmar_reserved_memory *rmrr;
3490 struct acpi_dmar_satc *satc;
3491
3492 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3493 return 0;
3494
3495 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3496 rmrr = container_of(rmrru->hdr,
3497 struct acpi_dmar_reserved_memory, header);
3498 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3499 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3500 ((void *)rmrr) + rmrr->header.length,
3501 rmrr->segment, rmrru->devices,
3502 rmrru->devices_cnt);
3503 if (ret < 0)
3504 return ret;
3505 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3506 dmar_remove_dev_scope(info, rmrr->segment,
3507 rmrru->devices, rmrru->devices_cnt);
3508 }
3509 }
3510
3511 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3512 if (atsru->include_all)
3513 continue;
3514
3515 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3516 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3517 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3518 (void *)atsr + atsr->header.length,
3519 atsr->segment, atsru->devices,
3520 atsru->devices_cnt);
3521 if (ret > 0)
3522 break;
3523 else if (ret < 0)
3524 return ret;
3525 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3526 if (dmar_remove_dev_scope(info, atsr->segment,
3527 atsru->devices, atsru->devices_cnt))
3528 break;
3529 }
3530 }
3531 list_for_each_entry(satcu, &dmar_satc_units, list) {
3532 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3533 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3534 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3535 (void *)satc + satc->header.length,
3536 satc->segment, satcu->devices,
3537 satcu->devices_cnt);
3538 if (ret > 0)
3539 break;
3540 else if (ret < 0)
3541 return ret;
3542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3543 if (dmar_remove_dev_scope(info, satc->segment,
3544 satcu->devices, satcu->devices_cnt))
3545 break;
3546 }
3547 }
3548
3549 return 0;
3550 }
3551
3552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3553 unsigned long val, void *v)
3554 {
3555 struct memory_notify *mhp = v;
3556 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3557 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3558 mhp->nr_pages - 1);
3559
3560 switch (val) {
3561 case MEM_GOING_ONLINE:
3562 if (iommu_domain_identity_map(si_domain,
3563 start_vpfn, last_vpfn)) {
3564 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3565 start_vpfn, last_vpfn);
3566 return NOTIFY_BAD;
3567 }
3568 break;
3569
3570 case MEM_OFFLINE:
3571 case MEM_CANCEL_ONLINE:
3572 {
3573 struct dmar_drhd_unit *drhd;
3574 struct intel_iommu *iommu;
3575 LIST_HEAD(freelist);
3576
3577 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3578
3579 rcu_read_lock();
3580 for_each_active_iommu(iommu, drhd)
3581 iommu_flush_iotlb_psi(iommu, si_domain,
3582 start_vpfn, mhp->nr_pages,
3583 list_empty(&freelist), 0);
3584 rcu_read_unlock();
3585 put_pages_list(&freelist);
3586 }
3587 break;
3588 }
3589
3590 return NOTIFY_OK;
3591 }
3592
3593 static struct notifier_block intel_iommu_memory_nb = {
3594 .notifier_call = intel_iommu_memory_notifier,
3595 .priority = 0
3596 };
3597
3598 static void intel_disable_iommus(void)
3599 {
3600 struct intel_iommu *iommu = NULL;
3601 struct dmar_drhd_unit *drhd;
3602
3603 for_each_iommu(iommu, drhd)
3604 iommu_disable_translation(iommu);
3605 }
3606
3607 void intel_iommu_shutdown(void)
3608 {
3609 struct dmar_drhd_unit *drhd;
3610 struct intel_iommu *iommu = NULL;
3611
3612 if (no_iommu || dmar_disabled)
3613 return;
3614
3615 down_write(&dmar_global_lock);
3616
3617 /* Disable PMRs explicitly here. */
3618 for_each_iommu(iommu, drhd)
3619 iommu_disable_protect_mem_regions(iommu);
3620
3621 /* Make sure the IOMMUs are switched off */
3622 intel_disable_iommus();
3623
3624 up_write(&dmar_global_lock);
3625 }
3626
3627 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3628 {
3629 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3630
3631 return container_of(iommu_dev, struct intel_iommu, iommu);
3632 }
3633
3634 static ssize_t version_show(struct device *dev,
3635 struct device_attribute *attr, char *buf)
3636 {
3637 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3638 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3639 return sysfs_emit(buf, "%d:%d\n",
3640 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3641 }
3642 static DEVICE_ATTR_RO(version);
3643
3644 static ssize_t address_show(struct device *dev,
3645 struct device_attribute *attr, char *buf)
3646 {
3647 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3648 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3649 }
3650 static DEVICE_ATTR_RO(address);
3651
3652 static ssize_t cap_show(struct device *dev,
3653 struct device_attribute *attr, char *buf)
3654 {
3655 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3656 return sysfs_emit(buf, "%llx\n", iommu->cap);
3657 }
3658 static DEVICE_ATTR_RO(cap);
3659
3660 static ssize_t ecap_show(struct device *dev,
3661 struct device_attribute *attr, char *buf)
3662 {
3663 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3664 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3665 }
3666 static DEVICE_ATTR_RO(ecap);
3667
3668 static ssize_t domains_supported_show(struct device *dev,
3669 struct device_attribute *attr, char *buf)
3670 {
3671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3672 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3673 }
3674 static DEVICE_ATTR_RO(domains_supported);
3675
3676 static ssize_t domains_used_show(struct device *dev,
3677 struct device_attribute *attr, char *buf)
3678 {
3679 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3680 return sysfs_emit(buf, "%d\n",
3681 bitmap_weight(iommu->domain_ids,
3682 cap_ndoms(iommu->cap)));
3683 }
3684 static DEVICE_ATTR_RO(domains_used);
3685
3686 static struct attribute *intel_iommu_attrs[] = {
3687 &dev_attr_version.attr,
3688 &dev_attr_address.attr,
3689 &dev_attr_cap.attr,
3690 &dev_attr_ecap.attr,
3691 &dev_attr_domains_supported.attr,
3692 &dev_attr_domains_used.attr,
3693 NULL,
3694 };
3695
3696 static struct attribute_group intel_iommu_group = {
3697 .name = "intel-iommu",
3698 .attrs = intel_iommu_attrs,
3699 };
3700
3701 const struct attribute_group *intel_iommu_groups[] = {
3702 &intel_iommu_group,
3703 NULL,
3704 };
3705
3706 static inline bool has_external_pci(void)
3707 {
3708 struct pci_dev *pdev = NULL;
3709
3710 for_each_pci_dev(pdev)
3711 if (pdev->external_facing) {
3712 pci_dev_put(pdev);
3713 return true;
3714 }
3715
3716 return false;
3717 }
3718
3719 static int __init platform_optin_force_iommu(void)
3720 {
3721 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3722 return 0;
3723
3724 if (no_iommu || dmar_disabled)
3725 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3726
3727 /*
3728 * If Intel-IOMMU is disabled by default, we will apply identity
3729 * map for all devices except those marked as being untrusted.
3730 */
3731 if (dmar_disabled)
3732 iommu_set_default_passthrough(false);
3733
3734 dmar_disabled = 0;
3735 no_iommu = 0;
3736
3737 return 1;
3738 }
3739
3740 static int __init probe_acpi_namespace_devices(void)
3741 {
3742 struct dmar_drhd_unit *drhd;
3743 /* To avoid a -Wunused-but-set-variable warning. */
3744 struct intel_iommu *iommu __maybe_unused;
3745 struct device *dev;
3746 int i, ret = 0;
3747
3748 for_each_active_iommu(iommu, drhd) {
3749 for_each_active_dev_scope(drhd->devices,
3750 drhd->devices_cnt, i, dev) {
3751 struct acpi_device_physical_node *pn;
3752 struct acpi_device *adev;
3753
3754 if (dev->bus != &acpi_bus_type)
3755 continue;
3756
3757 adev = to_acpi_device(dev);
3758 mutex_lock(&adev->physical_node_lock);
3759 list_for_each_entry(pn,
3760 &adev->physical_node_list, node) {
3761 ret = iommu_probe_device(pn->dev);
3762 if (ret)
3763 break;
3764 }
3765 mutex_unlock(&adev->physical_node_lock);
3766
3767 if (ret)
3768 return ret;
3769 }
3770 }
3771
3772 return 0;
3773 }
3774
3775 static __init int tboot_force_iommu(void)
3776 {
3777 if (!tboot_enabled())
3778 return 0;
3779
3780 if (no_iommu || dmar_disabled)
3781 pr_warn("Forcing Intel-IOMMU to enabled\n");
3782
3783 dmar_disabled = 0;
3784 no_iommu = 0;
3785
3786 return 1;
3787 }
3788
3789 int __init intel_iommu_init(void)
3790 {
3791 int ret = -ENODEV;
3792 struct dmar_drhd_unit *drhd;
3793 struct intel_iommu *iommu;
3794
3795 /*
3796 * Intel IOMMU is required for a TXT/tboot launch or platform
3797 * opt in, so enforce that.
3798 */
3799 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3800 platform_optin_force_iommu();
3801
3802 down_write(&dmar_global_lock);
3803 if (dmar_table_init()) {
3804 if (force_on)
3805 panic("tboot: Failed to initialize DMAR table\n");
3806 goto out_free_dmar;
3807 }
3808
3809 if (dmar_dev_scope_init() < 0) {
3810 if (force_on)
3811 panic("tboot: Failed to initialize DMAR device scope\n");
3812 goto out_free_dmar;
3813 }
3814
3815 up_write(&dmar_global_lock);
3816
3817 /*
3818 * The bus notifier takes the dmar_global_lock, so lockdep will
3819 * complain later when we register it under the lock.
3820 */
3821 dmar_register_bus_notifier();
3822
3823 down_write(&dmar_global_lock);
3824
3825 if (!no_iommu)
3826 intel_iommu_debugfs_init();
3827
3828 if (no_iommu || dmar_disabled) {
3829 /*
3830 * We exit the function here to ensure the IOMMU's remapping and
3831 * mempool aren't set up, which means that the IOMMU's PMRs
3832 * won't be disabled via the call to init_dmars(). So disable
3833 * it explicitly here. The PMRs were setup by tboot prior to
3834 * calling SENTER, but the kernel is expected to reset/tear
3835 * down the PMRs.
3836 */
3837 if (intel_iommu_tboot_noforce) {
3838 for_each_iommu(iommu, drhd)
3839 iommu_disable_protect_mem_regions(iommu);
3840 }
3841
3842 /*
3843 * Make sure the IOMMUs are switched off, even when we
3844 * boot into a kexec kernel and the previous kernel left
3845 * them enabled
3846 */
3847 intel_disable_iommus();
3848 goto out_free_dmar;
3849 }
3850
3851 if (list_empty(&dmar_rmrr_units))
3852 pr_info("No RMRR found\n");
3853
3854 if (list_empty(&dmar_atsr_units))
3855 pr_info("No ATSR found\n");
3856
3857 if (list_empty(&dmar_satc_units))
3858 pr_info("No SATC found\n");
3859
3860 init_no_remapping_devices();
3861
3862 ret = init_dmars();
3863 if (ret) {
3864 if (force_on)
3865 panic("tboot: Failed to initialize DMARs\n");
3866 pr_err("Initialization failed\n");
3867 goto out_free_dmar;
3868 }
3869 up_write(&dmar_global_lock);
3870
3871 init_iommu_pm_ops();
3872
3873 down_read(&dmar_global_lock);
3874 for_each_active_iommu(iommu, drhd) {
3875 /*
3876 * The flush queue implementation does not perform
3877 * page-selective invalidations that are required for efficient
3878 * TLB flushes in virtual environments. The benefit of batching
3879 * is likely to be much lower than the overhead of synchronizing
3880 * the virtual and physical IOMMU page-tables.
3881 */
3882 if (cap_caching_mode(iommu->cap) &&
3883 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3884 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3885 iommu_set_dma_strict();
3886 }
3887 iommu_device_sysfs_add(&iommu->iommu, NULL,
3888 intel_iommu_groups,
3889 "%s", iommu->name);
3890 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3891
3892 iommu_pmu_register(iommu);
3893 }
3894 up_read(&dmar_global_lock);
3895
3896 if (si_domain && !hw_pass_through)
3897 register_memory_notifier(&intel_iommu_memory_nb);
3898
3899 down_read(&dmar_global_lock);
3900 if (probe_acpi_namespace_devices())
3901 pr_warn("ACPI name space devices didn't probe correctly\n");
3902
3903 /* Finally, we enable the DMA remapping hardware. */
3904 for_each_iommu(iommu, drhd) {
3905 if (!drhd->ignored && !translation_pre_enabled(iommu))
3906 iommu_enable_translation(iommu);
3907
3908 iommu_disable_protect_mem_regions(iommu);
3909 }
3910 up_read(&dmar_global_lock);
3911
3912 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3913
3914 intel_iommu_enabled = 1;
3915
3916 return 0;
3917
3918 out_free_dmar:
3919 intel_iommu_free_dmars();
3920 up_write(&dmar_global_lock);
3921 return ret;
3922 }
3923
3924 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3925 {
3926 struct device_domain_info *info = opaque;
3927
3928 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3929 return 0;
3930 }
3931
3932 /*
3933 * NB - intel-iommu lacks any sort of reference counting for the users of
3934 * dependent devices. If multiple endpoints have intersecting dependent
3935 * devices, unbinding the driver from any one of them will possibly leave
3936 * the others unable to operate.
3937 */
3938 static void domain_context_clear(struct device_domain_info *info)
3939 {
3940 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3941 return;
3942
3943 pci_for_each_dma_alias(to_pci_dev(info->dev),
3944 &domain_context_clear_one_cb, info);
3945 }
3946
3947 static void dmar_remove_one_dev_info(struct device *dev)
3948 {
3949 struct device_domain_info *info = dev_iommu_priv_get(dev);
3950 struct dmar_domain *domain = info->domain;
3951 struct intel_iommu *iommu = info->iommu;
3952 unsigned long flags;
3953
3954 if (!dev_is_real_dma_subdevice(info->dev)) {
3955 if (dev_is_pci(info->dev) && sm_supported(iommu))
3956 intel_pasid_tear_down_entry(iommu, info->dev,
3957 IOMMU_NO_PASID, false);
3958
3959 iommu_disable_pci_caps(info);
3960 domain_context_clear(info);
3961 }
3962
3963 spin_lock_irqsave(&domain->lock, flags);
3964 list_del(&info->link);
3965 spin_unlock_irqrestore(&domain->lock, flags);
3966
3967 domain_detach_iommu(domain, iommu);
3968 info->domain = NULL;
3969 }
3970
3971 /*
3972 * Clear the page table pointer in context or pasid table entries so that
3973 * all DMA requests without PASID from the device are blocked. If the page
3974 * table has been set, clean up the data structures.
3975 */
3976 static void device_block_translation(struct device *dev)
3977 {
3978 struct device_domain_info *info = dev_iommu_priv_get(dev);
3979 struct intel_iommu *iommu = info->iommu;
3980 unsigned long flags;
3981
3982 iommu_disable_pci_caps(info);
3983 if (!dev_is_real_dma_subdevice(dev)) {
3984 if (sm_supported(iommu))
3985 intel_pasid_tear_down_entry(iommu, dev,
3986 IOMMU_NO_PASID, false);
3987 else
3988 domain_context_clear(info);
3989 }
3990
3991 if (!info->domain)
3992 return;
3993
3994 spin_lock_irqsave(&info->domain->lock, flags);
3995 list_del(&info->link);
3996 spin_unlock_irqrestore(&info->domain->lock, flags);
3997
3998 domain_detach_iommu(info->domain, iommu);
3999 info->domain = NULL;
4000 }
4001
4002 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4003 {
4004 int adjust_width;
4005
4006 /* calculate AGAW */
4007 domain->gaw = guest_width;
4008 adjust_width = guestwidth_to_adjustwidth(guest_width);
4009 domain->agaw = width_to_agaw(adjust_width);
4010
4011 domain->iommu_coherency = false;
4012 domain->iommu_superpage = 0;
4013 domain->max_addr = 0;
4014
4015 /* always allocate the top pgd */
4016 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4017 if (!domain->pgd)
4018 return -ENOMEM;
4019 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4020 return 0;
4021 }
4022
4023 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4024 struct device *dev)
4025 {
4026 device_block_translation(dev);
4027 return 0;
4028 }
4029
4030 static struct iommu_domain blocking_domain = {
4031 .ops = &(const struct iommu_domain_ops) {
4032 .attach_dev = blocking_domain_attach_dev,
4033 .free = intel_iommu_domain_free
4034 }
4035 };
4036
4037 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4038 {
4039 struct dmar_domain *dmar_domain;
4040 struct iommu_domain *domain;
4041
4042 switch (type) {
4043 case IOMMU_DOMAIN_BLOCKED:
4044 return &blocking_domain;
4045 case IOMMU_DOMAIN_DMA:
4046 case IOMMU_DOMAIN_UNMANAGED:
4047 dmar_domain = alloc_domain(type);
4048 if (!dmar_domain) {
4049 pr_err("Can't allocate dmar_domain\n");
4050 return NULL;
4051 }
4052 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4053 pr_err("Domain initialization failed\n");
4054 domain_exit(dmar_domain);
4055 return NULL;
4056 }
4057
4058 domain = &dmar_domain->domain;
4059 domain->geometry.aperture_start = 0;
4060 domain->geometry.aperture_end =
4061 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4062 domain->geometry.force_aperture = true;
4063
4064 return domain;
4065 case IOMMU_DOMAIN_IDENTITY:
4066 return &si_domain->domain;
4067 case IOMMU_DOMAIN_SVA:
4068 return intel_svm_domain_alloc();
4069 default:
4070 return NULL;
4071 }
4072
4073 return NULL;
4074 }
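/*
 * Worked example (illustrative only, not driver code): for an
 * IOMMU_DOMAIN_UNMANAGED request, md_domain_init() runs with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH (57), so the returned domain reports:
 *
 *	domain->geometry.aperture_start == 0
 *	domain->geometry.aperture_end   == (1ULL << 57) - 1
 *	domain->geometry.force_aperture == true
 *
 * Callers normally reach this op through iommu_domain_alloc() rather
 * than invoking it directly; IOMMU_DOMAIN_BLOCKED and
 * IOMMU_DOMAIN_IDENTITY return shared singletons that are never freed
 * by intel_iommu_domain_free().
 */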
4075
4076 static void intel_iommu_domain_free(struct iommu_domain *domain)
4077 {
4078 if (domain != &si_domain->domain && domain != &blocking_domain)
4079 domain_exit(to_dmar_domain(domain));
4080 }
4081
4082 static int prepare_domain_attach_device(struct iommu_domain *domain,
4083 struct device *dev)
4084 {
4085 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4086 struct intel_iommu *iommu;
4087 int addr_width;
4088
4089 iommu = device_to_iommu(dev, NULL, NULL);
4090 if (!iommu)
4091 return -ENODEV;
4092
4093 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4094 return -EINVAL;
4095
4096 /* check if this iommu agaw is sufficient for max mapped address */
4097 addr_width = agaw_to_width(iommu->agaw);
4098 if (addr_width > cap_mgaw(iommu->cap))
4099 addr_width = cap_mgaw(iommu->cap);
4100
4101 if (dmar_domain->max_addr > (1LL << addr_width))
4102 return -EINVAL;
4103 dmar_domain->gaw = addr_width;
4104
4105 /*
4106 * Knock out extra levels of page tables if necessary
4107 */
4108 while (iommu->agaw < dmar_domain->agaw) {
4109 struct dma_pte *pte;
4110
4111 pte = dmar_domain->pgd;
4112 if (dma_pte_present(pte)) {
4113 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4114 free_pgtable_page(pte);
4115 }
4116 dmar_domain->agaw--;
4117 }
4118
4119 return 0;
4120 }
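/*
 * Worked example of the agaw "knock down" above (illustrative, values
 * assumed): a domain created with the default 57-bit width uses agaw 3
 * (five page-table levels). Attaching it to an IOMMU whose own agaw is
 * 2 (48-bit, four levels) with MGAW 48 clamps addr_width to 48; the
 * attach fails with -EINVAL if the domain already has mappings
 * extending beyond 1ULL << 48, otherwise the loop pops the unused top
 * table level until dmar_domain->agaw matches iommu->agaw.
 */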
4121
4122 static int intel_iommu_attach_device(struct iommu_domain *domain,
4123 struct device *dev)
4124 {
4125 struct device_domain_info *info = dev_iommu_priv_get(dev);
4126 int ret;
4127
4128 if (info->domain)
4129 device_block_translation(dev);
4130
4131 ret = prepare_domain_attach_device(domain, dev);
4132 if (ret)
4133 return ret;
4134
4135 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4136 }
4137
4138 static int intel_iommu_map(struct iommu_domain *domain,
4139 unsigned long iova, phys_addr_t hpa,
4140 size_t size, int iommu_prot, gfp_t gfp)
4141 {
4142 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4143 u64 max_addr;
4144 int prot = 0;
4145
4146 if (iommu_prot & IOMMU_READ)
4147 prot |= DMA_PTE_READ;
4148 if (iommu_prot & IOMMU_WRITE)
4149 prot |= DMA_PTE_WRITE;
4150 if (dmar_domain->set_pte_snp)
4151 prot |= DMA_PTE_SNP;
4152
4153 max_addr = iova + size;
4154 if (dmar_domain->max_addr < max_addr) {
4155 u64 end;
4156
4157 /* check if minimum agaw is sufficient for mapped address */
4158 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4159 if (end < max_addr) {
4160 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4161 __func__, dmar_domain->gaw, max_addr);
4163 return -EFAULT;
4164 }
4165 dmar_domain->max_addr = max_addr;
4166 }
4167 /* Round up size to next multiple of PAGE_SIZE, if it and
4168 the low bits of hpa would take us onto the next page */
4169 size = aligned_nrpages(hpa, size);
4170 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4171 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4172 }
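/*
 * Hedged sketch of the protection-bit translation above (not driver
 * code): a read/write mapping on a domain with set_pte_snp ends up with
 *
 *	prot == DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP
 *
 * and the size is rounded to whole pages by aligned_nrpages() before
 * __domain_mapping() installs the PTEs at VTD_PAGE_SHIFT granularity.
 */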
4173
4174 static int intel_iommu_map_pages(struct iommu_domain *domain,
4175 unsigned long iova, phys_addr_t paddr,
4176 size_t pgsize, size_t pgcount,
4177 int prot, gfp_t gfp, size_t *mapped)
4178 {
4179 unsigned long pgshift = __ffs(pgsize);
4180 size_t size = pgcount << pgshift;
4181 int ret;
4182
4183 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4184 return -EINVAL;
4185
4186 if (!IS_ALIGNED(iova | paddr, pgsize))
4187 return -EINVAL;
4188
4189 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4190 if (!ret && mapped)
4191 *mapped = size;
4192
4193 return ret;
4194 }
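/*
 * Usage sketch (illustrative only; values are assumptions): the core
 * hands this op a uniform page size and count, so mapping 64 KiB at
 * 4 KiB granularity looks roughly like
 *
 *	size_t mapped = 0;
 *	int ret = intel_iommu_map_pages(domain, iova, paddr, SZ_4K, 16,
 *					IOMMU_READ | IOMMU_WRITE,
 *					GFP_KERNEL, &mapped);
 *
 * with iova and paddr 4 KiB aligned and, on success, mapped == SZ_64K.
 * The op also accepts SZ_2M and SZ_1G, although the advertised
 * pgsize_bitmap below is SZ_4K; superpages can still be created
 * internally by __domain_mapping() when a range allows it.
 */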
4195
4196 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4197 unsigned long iova, size_t size,
4198 struct iommu_iotlb_gather *gather)
4199 {
4200 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4201 unsigned long start_pfn, last_pfn;
4202 int level = 0;
4203
4204 /* Cope with the horrid API, which requires us to unmap more than
4205 * the size argument if it happens to be a large-page mapping. */
4206 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4207 &level, GFP_ATOMIC)))
4208 return 0;
4209
4210 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4211 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4212
4213 start_pfn = iova >> VTD_PAGE_SHIFT;
4214 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4215
4216 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4217
4218 if (dmar_domain->max_addr == iova + size)
4219 dmar_domain->max_addr = iova;
4220
4221 /*
4222 * We do not use page-selective IOTLB invalidation in the flush
4223 * queue, so there is no need to track pages and sync the iotlb.
4224 */
4225 if (!iommu_iotlb_gather_queued(gather))
4226 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4227
4228 return size;
4229 }
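/*
 * Example of the large-page behaviour noted above (illustrative): if a
 * 2 MiB superpage is mapped at iova and the caller asks to unmap only
 * 4 KiB of it, pfn_to_dma_pte() reports level 2, so size is bumped to
 * VTD_PAGE_SIZE << 9 (2 MiB) and the whole superpage is torn down; the
 * returned size tells the core how much was actually unmapped.
 */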
4230
4231 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4232 unsigned long iova,
4233 size_t pgsize, size_t pgcount,
4234 struct iommu_iotlb_gather *gather)
4235 {
4236 unsigned long pgshift = __ffs(pgsize);
4237 size_t size = pgcount << pgshift;
4238
4239 return intel_iommu_unmap(domain, iova, size, gather);
4240 }
4241
4242 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4243 struct iommu_iotlb_gather *gather)
4244 {
4245 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4246 unsigned long iova_pfn = IOVA_PFN(gather->start);
4247 size_t size = gather->end - gather->start;
4248 struct iommu_domain_info *info;
4249 unsigned long start_pfn;
4250 unsigned long nrpages;
4251 unsigned long i;
4252
4253 nrpages = aligned_nrpages(gather->start, size);
4254 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4255
4256 xa_for_each(&dmar_domain->iommu_array, i, info)
4257 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4258 start_pfn, nrpages,
4259 list_empty(&gather->freelist), 0);
4260
4261 put_pages_list(&gather->freelist);
4262 }
4263
4264 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4265 dma_addr_t iova)
4266 {
4267 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4268 struct dma_pte *pte;
4269 int level = 0;
4270 u64 phys = 0;
4271
4272 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4273 GFP_ATOMIC);
4274 if (pte && dma_pte_present(pte))
4275 phys = dma_pte_addr(pte) +
4276 (iova & (BIT_MASK(level_to_offset_bits(level) +
4277 VTD_PAGE_SHIFT) - 1));
4278
4279 return phys;
4280 }
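/*
 * Worked example (illustrative): a translation that resolves to a
 * 4 KiB leaf returns level 1, so the offset mask is
 * BIT_MASK(0 + VTD_PAGE_SHIFT) - 1 == 0xfff and
 * phys == dma_pte_addr(pte) + (iova & 0xfff). For a 2 MiB superpage
 * (level 2) the mask grows to 0x1fffff, the offset within the
 * superpage.
 */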
4281
4282 static bool domain_support_force_snooping(struct dmar_domain *domain)
4283 {
4284 struct device_domain_info *info;
4285 bool support = true;
4286
4287 assert_spin_locked(&domain->lock);
4288 list_for_each_entry(info, &domain->devices, link) {
4289 if (!ecap_sc_support(info->iommu->ecap)) {
4290 support = false;
4291 break;
4292 }
4293 }
4294
4295 return support;
4296 }
4297
4298 static void domain_set_force_snooping(struct dmar_domain *domain)
4299 {
4300 struct device_domain_info *info;
4301
4302 assert_spin_locked(&domain->lock);
4303 /*
4304 * Second level page table supports per-PTE snoop control. The
4305 * iommu_map() interface will handle this by setting SNP bit.
4306 */
4307 if (!domain->use_first_level) {
4308 domain->set_pte_snp = true;
4309 return;
4310 }
4311
4312 list_for_each_entry(info, &domain->devices, link)
4313 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4314 IOMMU_NO_PASID);
4315 }
4316
4317 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4318 {
4319 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4320 unsigned long flags;
4321
4322 if (dmar_domain->force_snooping)
4323 return true;
4324
4325 spin_lock_irqsave(&dmar_domain->lock, flags);
4326 if (!domain_support_force_snooping(dmar_domain)) {
4327 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4328 return false;
4329 }
4330
4331 domain_set_force_snooping(dmar_domain);
4332 dmar_domain->force_snooping = true;
4333 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4334
4335 return true;
4336 }
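/*
 * Hedged note (illustration only): callers such as VFIO or iommufd
 * reach this through the iommu core's enforce_cache_coherency op
 * before relying on coherent DMA. For second-level domains the effect
 * is deferred: set_pte_snp makes later intel_iommu_map() calls set
 * DMA_PTE_SNP. First-level domains instead update the PASID entries of
 * already-attached devices immediately via
 * intel_pasid_setup_page_snoop_control().
 */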
4337
4338 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4339 {
4340 struct device_domain_info *info = dev_iommu_priv_get(dev);
4341
4342 switch (cap) {
4343 case IOMMU_CAP_CACHE_COHERENCY:
4344 case IOMMU_CAP_DEFERRED_FLUSH:
4345 return true;
4346 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4347 return dmar_platform_optin();
4348 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4349 return ecap_sc_support(info->iommu->ecap);
4350 default:
4351 return false;
4352 }
4353 }
4354
4355 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4356 {
4357 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4358 struct device_domain_info *info;
4359 struct intel_iommu *iommu;
4360 u8 bus, devfn;
4361 int ret;
4362
4363 iommu = device_to_iommu(dev, &bus, &devfn);
4364 if (!iommu || !iommu->iommu.ops)
4365 return ERR_PTR(-ENODEV);
4366
4367 info = kzalloc(sizeof(*info), GFP_KERNEL);
4368 if (!info)
4369 return ERR_PTR(-ENOMEM);
4370
4371 if (dev_is_real_dma_subdevice(dev)) {
4372 info->bus = pdev->bus->number;
4373 info->devfn = pdev->devfn;
4374 info->segment = pci_domain_nr(pdev->bus);
4375 } else {
4376 info->bus = bus;
4377 info->devfn = devfn;
4378 info->segment = iommu->segment;
4379 }
4380
4381 info->dev = dev;
4382 info->iommu = iommu;
4383 if (dev_is_pci(dev)) {
4384 if (ecap_dev_iotlb_support(iommu->ecap) &&
4385 pci_ats_supported(pdev) &&
4386 dmar_ats_supported(pdev, iommu)) {
4387 info->ats_supported = 1;
4388 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4389
4390 /*
4391 * For an IOMMU that supports device IOTLB throttling
4392 * (DIT), we assign the PFSID to the invalidation desc
4393 * of a VF so that the IOMMU HW can gauge queue depth
4394 * at the PF level. If DIT is not supported, the PFSID
4395 * field is treated as reserved and must be set to 0.
4396 */
4397 if (ecap_dit(iommu->ecap))
4398 info->pfsid = pci_dev_id(pci_physfn(pdev));
4399 info->ats_qdep = pci_ats_queue_depth(pdev);
4400 }
4401 if (sm_supported(iommu)) {
4402 if (pasid_supported(iommu)) {
4403 int features = pci_pasid_features(pdev);
4404
4405 if (features >= 0)
4406 info->pasid_supported = features | 1;
4407 }
4408
4409 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4410 pci_pri_supported(pdev))
4411 info->pri_supported = 1;
4412 }
4413 }
4414
4415 dev_iommu_priv_set(dev, info);
4416
4417 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4418 ret = intel_pasid_alloc_table(dev);
4419 if (ret) {
4420 dev_err(dev, "PASID table allocation failed\n");
4421 dev_iommu_priv_set(dev, NULL);
4422 kfree(info);
4423 return ERR_PTR(ret);
4424 }
4425 }
4426
4427 return &iommu->iommu;
4428 }
4429
4430 static void intel_iommu_release_device(struct device *dev)
4431 {
4432 struct device_domain_info *info = dev_iommu_priv_get(dev);
4433
4434 dmar_remove_one_dev_info(dev);
4435 intel_pasid_free_table(dev);
4436 dev_iommu_priv_set(dev, NULL);
4437 kfree(info);
4438 set_dma_ops(dev, NULL);
4439 }
4440
4441 static void intel_iommu_probe_finalize(struct device *dev)
4442 {
4443 set_dma_ops(dev, NULL);
4444 iommu_setup_dma_ops(dev, 0, U64_MAX);
4445 }
4446
4447 static void intel_iommu_get_resv_regions(struct device *device,
4448 struct list_head *head)
4449 {
4450 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4451 struct iommu_resv_region *reg;
4452 struct dmar_rmrr_unit *rmrr;
4453 struct device *i_dev;
4454 int i;
4455
4456 rcu_read_lock();
4457 for_each_rmrr_units(rmrr) {
4458 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4459 i, i_dev) {
4460 struct iommu_resv_region *resv;
4461 enum iommu_resv_type type;
4462 size_t length;
4463
4464 if (i_dev != device &&
4465 !is_downstream_to_pci_bridge(device, i_dev))
4466 continue;
4467
4468 length = rmrr->end_address - rmrr->base_address + 1;
4469
4470 type = device_rmrr_is_relaxable(device) ?
4471 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4472
4473 resv = iommu_alloc_resv_region(rmrr->base_address,
4474 length, prot, type,
4475 GFP_ATOMIC);
4476 if (!resv)
4477 break;
4478
4479 list_add_tail(&resv->list, head);
4480 }
4481 }
4482 rcu_read_unlock();
4483
4484 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4485 if (dev_is_pci(device)) {
4486 struct pci_dev *pdev = to_pci_dev(device);
4487
4488 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4489 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4490 IOMMU_RESV_DIRECT_RELAXABLE,
4491 GFP_KERNEL);
4492 if (reg)
4493 list_add_tail(&reg->list, head);
4494 }
4495 }
4496 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4497
4498 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4499 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4500 0, IOMMU_RESV_MSI, GFP_KERNEL);
4501 if (!reg)
4502 return;
4503 list_add_tail(&reg->list, head);
4504 }
4505
4506 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4507 {
4508 if (dev_is_pci(dev))
4509 return pci_device_group(dev);
4510 return generic_device_group(dev);
4511 }
4512
4513 static int intel_iommu_enable_sva(struct device *dev)
4514 {
4515 struct device_domain_info *info = dev_iommu_priv_get(dev);
4516 struct intel_iommu *iommu;
4517
4518 if (!info || dmar_disabled)
4519 return -EINVAL;
4520
4521 iommu = info->iommu;
4522 if (!iommu)
4523 return -EINVAL;
4524
4525 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4526 return -ENODEV;
4527
4528 if (!info->pasid_enabled || !info->ats_enabled)
4529 return -EINVAL;
4530
4531 /*
4532 * Devices with device-specific I/O fault handling should not
4533 * support PCI/PRI. The IOMMU side has no means to check the
4534 * capability of device-specific IOPF, so it can only assume
4535 * that if the device driver enables SVA on a non-PRI device,
4536 * the driver will handle IOPF in its own way.
4537 */
4538 if (!info->pri_supported)
4539 return 0;
4540
4541 /* Devices supporting PRI should have it enabled. */
4542 if (!info->pri_enabled)
4543 return -EINVAL;
4544
4545 return 0;
4546 }
4547
4548 static int intel_iommu_enable_iopf(struct device *dev)
4549 {
4550 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4551 struct device_domain_info *info = dev_iommu_priv_get(dev);
4552 struct intel_iommu *iommu;
4553 int ret;
4554
4555 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4556 return -ENODEV;
4557
4558 if (info->pri_enabled)
4559 return -EBUSY;
4560
4561 iommu = info->iommu;
4562 if (!iommu)
4563 return -EINVAL;
4564
4565 /* PASID is required in PRG Response Message. */
4566 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4567 return -EINVAL;
4568
4569 ret = pci_reset_pri(pdev);
4570 if (ret)
4571 return ret;
4572
4573 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4574 if (ret)
4575 return ret;
4576
4577 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4578 if (ret)
4579 goto iopf_remove_device;
4580
4581 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4582 if (ret)
4583 goto iopf_unregister_handler;
4584 info->pri_enabled = 1;
4585
4586 return 0;
4587
4588 iopf_unregister_handler:
4589 iommu_unregister_device_fault_handler(dev);
4590 iopf_remove_device:
4591 iopf_queue_remove_device(iommu->iopf_queue, dev);
4592
4593 return ret;
4594 }
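/*
 * Descriptive note on the sequence above: PRI is reset first, the
 * device is added to the IOMMU's IOPF queue, a fault handler is
 * registered, and only then is PRI enabled with PRQ_DEPTH credits.
 * On failure the steps are unwound in reverse order so the device is
 * never left with PRI enabled but no registered handler.
 */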
4595
4596 static int intel_iommu_disable_iopf(struct device *dev)
4597 {
4598 struct device_domain_info *info = dev_iommu_priv_get(dev);
4599 struct intel_iommu *iommu = info->iommu;
4600
4601 if (!info->pri_enabled)
4602 return -EINVAL;
4603
4604 /*
4605 * The PCIe spec states that once the PRI enable bit is cleared,
4606 * the Page Request Interface issues no new page requests, but
4607 * requests already transmitted or queued for transmission may
4608 * still be outstanding. This is supposed to be called after
4609 * the device driver has stopped DMA, all PASIDs have been
4610 * unbound and the outstanding PRQs have been drained.
4611 */
4612 pci_disable_pri(to_pci_dev(dev));
4613 info->pri_enabled = 0;
4614
4615 /*
4616 * With PRI disabled and outstanding PRQs drained, unregistering
4617 * fault handler and removing device from iopf queue should never
4618 * fail.
4619 */
4620 WARN_ON(iommu_unregister_device_fault_handler(dev));
4621 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4622
4623 return 0;
4624 }
4625
4626 static int
4627 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4628 {
4629 switch (feat) {
4630 case IOMMU_DEV_FEAT_IOPF:
4631 return intel_iommu_enable_iopf(dev);
4632
4633 case IOMMU_DEV_FEAT_SVA:
4634 return intel_iommu_enable_sva(dev);
4635
4636 default:
4637 return -ENODEV;
4638 }
4639 }
4640
4641 static int
4642 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4643 {
4644 switch (feat) {
4645 case IOMMU_DEV_FEAT_IOPF:
4646 return intel_iommu_disable_iopf(dev);
4647
4648 case IOMMU_DEV_FEAT_SVA:
4649 return 0;
4650
4651 default:
4652 return -ENODEV;
4653 }
4654 }
4655
4656 static bool intel_iommu_is_attach_deferred(struct device *dev)
4657 {
4658 struct device_domain_info *info = dev_iommu_priv_get(dev);
4659
4660 return translation_pre_enabled(info->iommu) && !info->domain;
4661 }
4662
4663 /*
4664 * Check that the device does not live on an external-facing PCI port that is
4665 * marked as untrusted. Such devices should not be allowed to apply quirks and
4666 * thus bypass the IOMMU restrictions.
4667 */
4668 static bool risky_device(struct pci_dev *pdev)
4669 {
4670 if (pdev->untrusted) {
4671 pci_info(pdev,
4672 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4673 pdev->vendor, pdev->device);
4674 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4675 return true;
4676 }
4677 return false;
4678 }
4679
4680 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4681 unsigned long iova, size_t size)
4682 {
4683 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4684 unsigned long pages = aligned_nrpages(iova, size);
4685 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4686 struct iommu_domain_info *info;
4687 unsigned long i;
4688
4689 xa_for_each(&dmar_domain->iommu_array, i, info)
4690 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4691 }
4692
4693 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4694 {
4695 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4696 struct dev_pasid_info *curr, *dev_pasid = NULL;
4697 struct dmar_domain *dmar_domain;
4698 struct iommu_domain *domain;
4699 unsigned long flags;
4700
4701 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4702 if (WARN_ON_ONCE(!domain))
4703 goto out_tear_down;
4704
4705 /*
4706 * The SVA implementation needs to handle its own details, such as
4707 * the mm notification. Until that code is consolidated into the
4708 * iommu core, let the intel sva code handle it.
4709 */
4710 if (domain->type == IOMMU_DOMAIN_SVA) {
4711 intel_svm_remove_dev_pasid(dev, pasid);
4712 goto out_tear_down;
4713 }
4714
4715 dmar_domain = to_dmar_domain(domain);
4716 spin_lock_irqsave(&dmar_domain->lock, flags);
4717 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4718 if (curr->dev == dev && curr->pasid == pasid) {
4719 list_del(&curr->link_domain);
4720 dev_pasid = curr;
4721 break;
4722 }
4723 }
4724 WARN_ON_ONCE(!dev_pasid);
4725 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4726
4727 domain_detach_iommu(dmar_domain, iommu);
4728 kfree(dev_pasid);
4729 out_tear_down:
4730 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4731 intel_drain_pasid_prq(dev, pasid);
4732 }
4733
4734 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4735 struct device *dev, ioasid_t pasid)
4736 {
4737 struct device_domain_info *info = dev_iommu_priv_get(dev);
4738 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4739 struct intel_iommu *iommu = info->iommu;
4740 struct dev_pasid_info *dev_pasid;
4741 unsigned long flags;
4742 int ret;
4743
4744 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4745 return -EOPNOTSUPP;
4746
4747 if (context_copied(iommu, info->bus, info->devfn))
4748 return -EBUSY;
4749
4750 ret = prepare_domain_attach_device(domain, dev);
4751 if (ret)
4752 return ret;
4753
4754 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4755 if (!dev_pasid)
4756 return -ENOMEM;
4757
4758 ret = domain_attach_iommu(dmar_domain, iommu);
4759 if (ret)
4760 goto out_free;
4761
4762 if (domain_type_is_si(dmar_domain))
4763 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4764 dev, pasid);
4765 else if (dmar_domain->use_first_level)
4766 ret = domain_setup_first_level(iommu, dmar_domain,
4767 dev, pasid);
4768 else
4769 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4770 dev, pasid);
4771 if (ret)
4772 goto out_detach_iommu;
4773
4774 dev_pasid->dev = dev;
4775 dev_pasid->pasid = pasid;
4776 spin_lock_irqsave(&dmar_domain->lock, flags);
4777 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4778 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4779
4780 return 0;
4781 out_detach_iommu:
4782 domain_detach_iommu(dmar_domain, iommu);
4783 out_free:
4784 kfree(dev_pasid);
4785 return ret;
4786 }
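/*
 * Descriptive note on the three setup paths above: an identity
 * (si_domain) attachment programs the PASID entry in pass-through
 * mode, a first-level domain points the entry at the first-stage page
 * table, and any other domain uses second-stage translation. In every
 * case the dev_pasid entry added to dmar_domain->dev_pasids is what
 * later lets intel_iommu_remove_dev_pasid() find and tear the mapping
 * down.
 */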
4787
4788 const struct iommu_ops intel_iommu_ops = {
4789 .capable = intel_iommu_capable,
4790 .domain_alloc = intel_iommu_domain_alloc,
4791 .probe_device = intel_iommu_probe_device,
4792 .probe_finalize = intel_iommu_probe_finalize,
4793 .release_device = intel_iommu_release_device,
4794 .get_resv_regions = intel_iommu_get_resv_regions,
4795 .device_group = intel_iommu_device_group,
4796 .dev_enable_feat = intel_iommu_dev_enable_feat,
4797 .dev_disable_feat = intel_iommu_dev_disable_feat,
4798 .is_attach_deferred = intel_iommu_is_attach_deferred,
4799 .def_domain_type = device_def_domain_type,
4800 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4801 .pgsize_bitmap = SZ_4K,
4802 #ifdef CONFIG_INTEL_IOMMU_SVM
4803 .page_response = intel_svm_page_response,
4804 #endif
4805 .default_domain_ops = &(const struct iommu_domain_ops) {
4806 .attach_dev = intel_iommu_attach_device,
4807 .set_dev_pasid = intel_iommu_set_dev_pasid,
4808 .map_pages = intel_iommu_map_pages,
4809 .unmap_pages = intel_iommu_unmap_pages,
4810 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4811 .flush_iotlb_all = intel_flush_iotlb_all,
4812 .iotlb_sync = intel_iommu_tlb_sync,
4813 .iova_to_phys = intel_iommu_iova_to_phys,
4814 .free = intel_iommu_domain_free,
4815 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4816 }
4817 };
4818
4819 static void quirk_iommu_igfx(struct pci_dev *dev)
4820 {
4821 if (risky_device(dev))
4822 return;
4823
4824 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4825 dmar_map_gfx = 0;
4826 }
4827
4828 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4836
4837 /* Broadwell igfx malfunctions with dmar */
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4862
4863 static void quirk_iommu_rwbf(struct pci_dev *dev)
4864 {
4865 if (risky_device(dev))
4866 return;
4867
4868 /*
4869 * Mobile 4 Series Chipset neglects to set RWBF capability,
4870 * but needs it. Same seems to hold for the desktop versions.
4871 */
4872 pci_info(dev, "Forcing write-buffer flush capability\n");
4873 rwbf_quirk = 1;
4874 }
4875
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4883
4884 #define GGC 0x52
4885 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4886 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4887 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4888 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4889 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4890 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4891 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4892 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4893
4894 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4895 {
4896 unsigned short ggc;
4897
4898 if (risky_device(dev))
4899 return;
4900
4901 if (pci_read_config_word(dev, GGC, &ggc))
4902 return;
4903
4904 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4905 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4906 dmar_map_gfx = 0;
4907 } else if (dmar_map_gfx) {
4908 /* we have to ensure the gfx device is idle before we flush */
4909 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4910 iommu_set_dma_strict();
4911 }
4912 }
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
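/*
 * Worked example of the GGC check above (values are illustrative):
 * reading GGC (offset 0x52) as 0x0b00 decodes to GGC_MEMORY_SIZE_4M_VT,
 * which has GGC_MEMORY_VT_ENABLED set, so graphics translation stays on
 * and only strict IOTLB flushing is forced. A value of 0x0100
 * (GGC_MEMORY_SIZE_1M, VT bit clear) would instead disable the IOMMU
 * for graphics by clearing dmar_map_gfx.
 */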
4917
4918 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4919 {
4920 unsigned short ver;
4921
4922 if (!IS_GFX_DEVICE(dev))
4923 return;
4924
4925 ver = (dev->device >> 8) & 0xff;
4926 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4927 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4928 ver != 0x9a && ver != 0xa7)
4929 return;
4930
4931 if (risky_device(dev))
4932 return;
4933
4934 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4935 iommu_skip_te_disable = 1;
4936 }
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4938
4939 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4940 * ISOCH DMAR unit for the Azalia sound device, but not give it any
4941 * TLB entries, which causes it to deadlock. Check for that. We do
4942 * this in a function called from init_dmars(), instead of in a PCI
4943 * quirk, because we don't want to print the obnoxious "BIOS broken"
4944 * message if VT-d is actually disabled.
4945 */
4946 static void __init check_tylersburg_isoch(void)
4947 {
4948 struct pci_dev *pdev;
4949 uint32_t vtisochctrl;
4950
4951 /* If there's no Azalia in the system anyway, forget it. */
4952 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4953 if (!pdev)
4954 return;
4955
4956 if (risky_device(pdev)) {
4957 pci_dev_put(pdev);
4958 return;
4959 }
4960
4961 pci_dev_put(pdev);
4962
4963 /* System Management Registers. Might be hidden, in which case
4964 * we can't do the sanity check. But that's OK, because the
4965 * known-broken BIOSes _don't_ actually hide it, so far. */
4966 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4967 if (!pdev)
4968 return;
4969
4970 if (risky_device(pdev)) {
4971 pci_dev_put(pdev);
4972 return;
4973 }
4974
4975 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4976 pci_dev_put(pdev);
4977 return;
4978 }
4979
4980 pci_dev_put(pdev);
4981
4982 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4983 if (vtisochctrl & 1)
4984 return;
4985
4986 /* Drop all bits other than the number of TLB entries */
4987 vtisochctrl &= 0x1c;
4988
4989 /* If we have the recommended number of TLB entries (16), fine. */
4990 if (vtisochctrl == 0x10)
4991 return;
4992
4993 /* Zero TLB entries? That is a hopelessly broken BIOS configuration. */
4994 if (!vtisochctrl) {
4995 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4996 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4997 dmi_get_system_info(DMI_BIOS_VENDOR),
4998 dmi_get_system_info(DMI_BIOS_VERSION),
4999 dmi_get_system_info(DMI_PRODUCT_VERSION));
5000 iommu_identity_mapping |= IDENTMAP_AZALIA;
5001 return;
5002 }
5003
5004 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5005 vtisochctrl);
5006 }
5007
5008 /*
5009 * Here we deal with a device TLB defect where the device may inadvertently
5010 * issue an ATS invalidation completion before posted writes that were
5011 * initiated with a translated address and that used translations matching
5012 * the invalidation address range, violating the invalidation completion
5013 * ordering. Therefore, any use case that cannot guarantee DMA is stopped
5014 * before unmap is vulnerable to this defect. In other words, any dTLB
5015 * invalidation not initiated under the control of the trusted/privileged
5016 * host device driver must use this quirk.
5017 * Device TLBs are invalidated under the following six conditions:
5018 * 1. Device driver does DMA API unmap IOVA
5019 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5020 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5021 * exit_mmap() due to crash
5022 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5023 * VM has to free pages that were unmapped
5024 * 5. Userspace driver unmaps a DMA buffer
5025 * 6. Cache invalidation in vSVA usage (upcoming)
5026 *
5027 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5028 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
5029 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5030 * The dTLB invalidation after PASID cache flush does not need this quirk.
5031 *
5032 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5033 */
5034 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5035 unsigned long address, unsigned long mask,
5036 u32 pasid, u16 qdep)
5037 {
5038 u16 sid;
5039
5040 if (likely(!info->dtlb_extra_inval))
5041 return;
5042
5043 sid = PCI_DEVID(info->bus, info->devfn);
5044 if (pasid == IOMMU_NO_PASID) {
5045 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5046 qdep, address, mask);
5047 } else {
5048 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5049 pasid, qdep, address, mask);
5050 }
5051 }
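/*
 * Descriptive note: this is a no-op for unaffected devices, since
 * info->dtlb_extra_inval is only set at probe time when
 * dev_needs_extra_dtlb_flush() matches. For affected devices it issues
 * one additional device-IOTLB invalidation, PASID-qualified or not, on
 * top of the normal one, as the workaround for the ordering defect
 * described above.
 */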
5052
5053 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5054
5055 /*
5056 * Function to submit a command to the enhanced command interface. The
5057 * valid enhanced command descriptions are defined in Table 47 of the
5058 * VT-d spec. The VT-d hardware implementation may support some but not
5059 * all commands, which can be determined by checking the Enhanced
5060 * Command Capability Register.
5061 *
5062 * Return values:
5063 * - 0: Command successful without any error;
5064 * - Negative: software error value;
5065 * - Nonzero positive: failure status code defined in Table 48.
5066 */
5067 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5068 {
5069 unsigned long flags;
5070 u64 res;
5071 int ret;
5072
5073 if (!cap_ecmds(iommu->cap))
5074 return -ENODEV;
5075
5076 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5077
5078 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5079 if (res & DMA_ECMD_ECRSP_IP) {
5080 ret = -EBUSY;
5081 goto err;
5082 }
5083
5084 /*
5085 * Unconditionally write the operand B, because
5086 * - There is no side effect if an ecmd doesn't require an
5087 * operand B, but we set the register to some value.
5088 * - It's not invoked in any critical path. The extra MMIO
5089 * write doesn't bring any performance concerns.
5090 */
5091 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5092 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5093
5094 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5095 !(res & DMA_ECMD_ECRSP_IP), res);
5096
5097 if (res & DMA_ECMD_ECRSP_IP) {
5098 ret = -ETIMEDOUT;
5099 goto err;
5100 }
5101
5102 ret = ecmd_get_status_code(res);
5103 err:
5104 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5105
5106 return ret;
5107 }
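/*
 * Usage sketch (illustrative only; the ecmd encoding is an assumption,
 * see the VT-d spec tables referenced above for real values):
 *
 *	int ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *
 *	if (ret < 0)
 *		return ret;	// software error, e.g. -ENODEV or -EBUSY
 *	else if (ret > 0)
 *		return -EIO;	// hardware failure status code (Table 48)
 *	// ret == 0: command completed successfully
 */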