drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/crash_dump.h>
  49 #include <asm/irq_remapping.h>
  50 #include <asm/cacheflush.h>
  51 #include <asm/iommu.h>
  52
  53 #include "irq_remapping.h"
  54
  55 #define ROOT_SIZE               VTD_PAGE_SIZE
  56 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
  63 #define IOAPIC_RANGE_START      (0xfee00000)
  64 #define IOAPIC_RANGE_END        (0xfeefffff)
  65 #define IOVA_START_ADDR         (0x1000)
  66
  67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  68
  69 #define MAX_AGAW_WIDTH 64
  70 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  80
  81 /* IO virtual address start page frame number */
  82 #define IOVA_START_PFN          (1)
  83
  84 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  85 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  86 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  87
  88 /* page table handling */
  89 #define LEVEL_STRIDE            (9)
  90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
  92 /*
  93  * This bitmap is used to advertise the page sizes our hardware support
  94  * to the IOMMU core, which will then use this information to split
  95  * physically contiguous memory regions it is mapping into page sizes
  96  * that we support.
  97  *
  98  * Traditionally the IOMMU core just handed us the mappings directly,
  99  * after making sure the size is an order of a 4KiB page and that the
 100  * mapping has natural alignment.
 101  *
 102  * To retain this behavior, we currently advertise that we support
 103  * all page sizes that are an order of 4KiB.
 104  *
 105  * If at some point we'd like to utilize the IOMMU core's new behavior,
 106  * we could change this to advertise the real page sizes we support.
 107  */
 108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
 175 /* global iommu list, set NULL for ignored DMAR units */
 176 static struct intel_iommu **g_iommus;
 177
 178 static void __init check_tylersburg_isoch(void);
 179 static int rwbf_quirk;
 180
 181 /*
 182  * set to 1 to panic kernel if can't successfully enable VT-d
 183  * (used when kernel is launched w/ TXT)
 184  */
 185 static int force_on = 0;
 186
 187 /*
 188  * 0: Present
 189  * 1-11: Reserved
 190  * 12-63: Context Ptr (12 - (haw-1))
 191  * 64-127: Reserved
 192  */
 193 struct root_entry {
 194         u64     lo;
 195         u64     hi;
 196 };
 197 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 198
 199 /*
 200  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 201  * if marked present.
 202  */
 203 static phys_addr_t root_entry_lctp(struct root_entry *re)
 204 {
 205         if (!(re->lo & 1))
 206                 return 0;
 207
 208         return re->lo & VTD_PAGE_MASK;
 209 }
 210
 211 /*
 212  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 213  * if marked present.
 214  */
 215 static phys_addr_t root_entry_uctp(struct root_entry *re)
 216 {
 217         if (!(re->hi & 1))
 218                 return 0;
 219
 220         return re->hi & VTD_PAGE_MASK;
 221 }
 222 /*
 223  * low 64 bits:
 224  * 0: present
 225  * 1: fault processing disable
 226  * 2-3: translation type
 227  * 12-63: address space root
 228  * high 64 bits:
 229  * 0-2: address width
 230  * 3-6: aval
 231  * 8-23: domain id
 232  */
 233 struct context_entry {
 234         u64 lo;
 235         u64 hi;
 236 };
 237
 238 static inline void context_clear_pasid_enable(struct context_entry *context)
 239 {
 240         context->lo &= ~(1ULL << 11);
 241 }
 242
 243 static inline bool context_pasid_enabled(struct context_entry *context)
 244 {
 245         return !!(context->lo & (1ULL << 11));
 246 }
 247
 248 static inline void context_set_copied(struct context_entry *context)
 249 {
 250         context->hi |= (1ull << 3);
 251 }
 252
 253 static inline bool context_copied(struct context_entry *context)
 254 {
 255         return !!(context->hi & (1ULL << 3));
 256 }
 257
 258 static inline bool __context_present(struct context_entry *context)
 259 {
 260         return (context->lo & 1);
 261 }
 262
 263 static inline bool context_present(struct context_entry *context)
 264 {
 265         return context_pasid_enabled(context) ?
 266              __context_present(context) :
 267              __context_present(context) && !context_copied(context);
 268 }
 269
 270 static inline void context_set_present(struct context_entry *context)
 271 {
 272         context->lo |= 1;
 273 }
 274
 275 static inline void context_set_fault_enable(struct context_entry *context)
 276 {
 277         context->lo &= (((u64)-1) << 2) | 1;
 278 }
 279
 280 static inline void context_set_translation_type(struct context_entry *context,
 281                                                 unsigned long value)
 282 {
 283         context->lo &= (((u64)-1) << 4) | 3;
 284         context->lo |= (value & 3) << 2;
 285 }
 286
 287 static inline void context_set_address_root(struct context_entry *context,
 288                                             unsigned long value)
 289 {
 290         context->lo &= ~VTD_PAGE_MASK;
 291         context->lo |= value & VTD_PAGE_MASK;
 292 }
 293
 294 static inline void context_set_address_width(struct context_entry *context,
 295                                              unsigned long value)
 296 {
 297         context->hi |= value & 7;
 298 }
 299
 300 static inline void context_set_domain_id(struct context_entry *context,
 301                                          unsigned long value)
 302 {
 303         context->hi |= (value & ((1 << 16) - 1)) << 8;
 304 }
 305
 306 static inline int context_domain_id(struct context_entry *c)
 307 {
 308         return((c->hi >> 8) & 0xffff);
 309 }
 310
 311 static inline void context_clear_entry(struct context_entry *context)
 312 {
 313         context->lo = 0;
 314         context->hi = 0;
 315 }
 316
 317 /*
 318  * 0: readable
 319  * 1: writable
 320  * 2-6: reserved
 321  * 7: super page
 322  * 8-10: available
 323  * 11: snoop behavior
 324  * 12-63: Host physcial address
 325  */
 326 struct dma_pte {
 327         u64 val;
 328 };
 329
 330 static inline void dma_clear_pte(struct dma_pte *pte)
 331 {
 332         pte->val = 0;
 333 }
 334
 335 static inline u64 dma_pte_addr(struct dma_pte *pte)
 336 {
 337 #ifdef CONFIG_64BIT
 338         return pte->val & VTD_PAGE_MASK;
 339 #else
 340         /* Must have a full atomic 64-bit read */
 341         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 342 #endif
 343 }
 344
 345 static inline bool dma_pte_present(struct dma_pte *pte)
 346 {
 347         return (pte->val & 3) != 0;
 348 }
 349
 350 static inline bool dma_pte_superpage(struct dma_pte *pte)
 351 {
 352         return (pte->val & DMA_PTE_LARGE_PAGE);
 353 }
 354
 355 static inline int first_pte_in_page(struct dma_pte *pte)
 356 {
 357         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 358 }
 359
 360 /*
 361  * This domain is a statically identity mapping domain.
 362  *      1. This domain creats a static 1:1 mapping to all usable memory.
 363  *      2. It maps to each iommu if successful.
 364  *      3. Each iommu mapps to this domain if successful.
 365  */
 366 static struct dmar_domain *si_domain;
 367 static int hw_pass_through = 1;
 368
 369 /*
 370  * Domain represents a virtual machine, more than one devices
 371  * across iommus may be owned in one domain, e.g. kvm guest.
 372  */
 373 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 374
 375 /* si_domain contains mulitple devices */
 376 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 377
 378 #define for_each_domain_iommu(idx, domain)                      \
 379         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 380                 if (domain->iommu_refcnt[idx])
 381
 382 struct dmar_domain {
 383         int     nid;                    /* node id */
 384
 385         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
 386                                         /* Refcount of devices per iommu */
 387
 388
 389         u16             iommu_did[DMAR_UNITS_SUPPORTED];
 390                                         /* Domain ids per IOMMU. Use u16 since
 391                                          * domain ids are 16 bit wide according
 392                                          * to VT-d spec, section 9.3 */
 393
 394         bool has_iotlb_device;
 395         struct list_head devices;       /* all devices' list */
 396         struct iova_domain iovad;       /* iova's that belong to this domain */
 397
 398         struct dma_pte  *pgd;           /* virtual address */
 399         int             gaw;            /* max guest address width */
 400
 401         /* adjusted guest address width, 0 is level 2 30-bit */
 402         int             agaw;
 403
 404         int             flags;          /* flags to find out type of domain */
 405
 406         int             iommu_coherency;/* indicate coherency of iommu access */
 407         int             iommu_snooping; /* indicate snooping control feature*/
 408         int             iommu_count;    /* reference count of iommu */
 409         int             iommu_superpage;/* Level of superpages supported:
 410                                            0 == 4KiB (no superpages), 1 == 2MiB,
 411                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 412         u64             max_addr;       /* maximum mapped address */
 413
 414         struct iommu_domain domain;     /* generic domain data structure for
 415                                            iommu core */
 416 };
 417
 418 /* PCI domain-device relationship */
 419 struct device_domain_info {
 420         struct list_head link;  /* link to domain siblings */
 421         struct list_head global; /* link to global list */
 422         u8 bus;                 /* PCI bus number */
 423         u8 devfn;               /* PCI devfn number */
 424         u16 pfsid;              /* SRIOV physical function source ID */
 425         u8 pasid_supported:3;
 426         u8 pasid_enabled:1;
 427         u8 pri_supported:1;
 428         u8 pri_enabled:1;
 429         u8 ats_supported:1;
 430         u8 ats_enabled:1;
 431         u8 ats_qdep;
 432         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 433         struct intel_iommu *iommu; /* IOMMU used by this device */
 434         struct dmar_domain *domain; /* pointer to domain */
 435 };
 436
 437 struct dmar_rmrr_unit {
 438         struct list_head list;          /* list of rmrr units   */
 439         struct acpi_dmar_header *hdr;   /* ACPI header          */
 440         u64     base_address;           /* reserved base address*/
 441         u64     end_address;            /* reserved end address */
 442         struct dmar_dev_scope *devices; /* target devices */
 443         int     devices_cnt;            /* target device count */
 444 };
 445
 446 struct dmar_atsr_unit {
 447         struct list_head list;          /* list of ATSR units */
 448         struct acpi_dmar_header *hdr;   /* ACPI header */
 449         struct dmar_dev_scope *devices; /* target devices */
 450         int devices_cnt;                /* target device count */
 451         u8 include_all:1;               /* include all ports */
 452 };
 453
 454 static LIST_HEAD(dmar_atsr_units);
 455 static LIST_HEAD(dmar_rmrr_units);
 456
 457 #define for_each_rmrr_units(rmrr) \
 458         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 459
 460 static void flush_unmaps_timeout(unsigned long data);
 461
 462 struct deferred_flush_entry {
 463         unsigned long iova_pfn;
 464         unsigned long nrpages;
 465         struct dmar_domain *domain;
 466         struct page *freelist;
 467 };
 468
 469 #define HIGH_WATER_MARK 250
 470 struct deferred_flush_table {
 471         int next;
 472         struct deferred_flush_entry entries[HIGH_WATER_MARK];
 473 };
 474
 475 struct deferred_flush_data {
 476         spinlock_t lock;
 477         int timer_on;
 478         struct timer_list timer;
 479         long size;
 480         struct deferred_flush_table *tables;
 481 };
 482
 483 DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 484
 485 /* bitmap for indexing intel_iommus */
 486 static int g_num_of_iommus;
 487
 488 static void domain_exit(struct dmar_domain *domain);
 489 static void domain_remove_dev_info(struct dmar_domain *domain);
 490 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 491                                      struct device *dev);
 492 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 493 static void domain_context_clear(struct intel_iommu *iommu,
 494                                  struct device *dev);
 495 static int domain_detach_iommu(struct dmar_domain *domain,
 496                                struct intel_iommu *iommu);
 497
 498 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 499 int dmar_disabled = 0;
 500 #else
 501 int dmar_disabled = 1;
 502 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 503
 504 int intel_iommu_enabled = 0;
 505 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 506
 507 static int dmar_map_gfx = 1;
 508 static int dmar_forcedac;
 509 static int intel_iommu_strict;
 510 static int intel_iommu_superpage = 1;
 511 static int intel_iommu_ecs = 1;
 512 static int intel_iommu_pasid28;
 513 static int iommu_identity_mapping;
 514
 515 #define IDENTMAP_ALL            1
 516 #define IDENTMAP_GFX            2
 517 #define IDENTMAP_AZALIA         4
 518
 519 /* Broadwell and Skylake have broken ECS support — normal so-called "second
 520  * level" translation of DMA requests-without-PASID doesn't actually happen
 521  * unless you also set the NESTE bit in an extended context-entry. Which of
 522  * course means that SVM doesn't work because it's trying to do nested
 523  * translation of the physical addresses it finds in the process page tables,
 524  * through the IOVA->phys mapping found in the "second level" page tables.
 525  *
 526  * The VT-d specification was retroactively changed to change the definition
 527  * of the capability bits and pretend that Broadwell/Skylake never happened...
 528  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 529  * for some reason it was the PASID capability bit which was redefined (from
 530  * bit 28 on BDW/SKL to bit 40 in future).
 531  *
 532  * So our test for ECS needs to eschew those implementations which set the old
 533  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
 534  * Unless we are working around the 'pasid28' limitations, that is, by putting
 535  * the device into passthrough mode for normal DMA and thus masking the bug.
 536  */
 537 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 538                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 539 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 540  * or new capability bits are set. */
 541 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 542                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 543
 544 int intel_iommu_gfx_mapped;
 545 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 546
 547 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 548 static DEFINE_SPINLOCK(device_domain_lock);
 549 static LIST_HEAD(device_domain_list);
 550
 551 static const struct iommu_ops intel_iommu_ops;
 552
 553 static bool translation_pre_enabled(struct intel_iommu *iommu)
 554 {
 555         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 556 }
 557
 558 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 559 {
 560         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 561 }
 562
 563 static void init_translation_status(struct intel_iommu *iommu)
 564 {
 565         u32 gsts;
 566
 567         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 568         if (gsts & DMA_GSTS_TES)
 569                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 570 }
 571
 572 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 573 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 574 {
 575         return container_of(dom, struct dmar_domain, domain);
 576 }
 577
 578 static int __init intel_iommu_setup(char *str)
 579 {
 580         if (!str)
 581                 return -EINVAL;
 582         while (*str) {
 583                 if (!strncmp(str, "on", 2)) {
 584                         dmar_disabled = 0;
 585                         pr_info("IOMMU enabled\n");
 586                 } else if (!strncmp(str, "off", 3)) {
 587                         dmar_disabled = 1;
 588                         pr_info("IOMMU disabled\n");
 589                 } else if (!strncmp(str, "igfx_off", 8)) {
 590                         dmar_map_gfx = 0;
 591                         pr_info("Disable GFX device mapping\n");
 592                 } else if (!strncmp(str, "forcedac", 8)) {
 593                         pr_info("Forcing DAC for PCI devices\n");
 594                         dmar_forcedac = 1;
 595                 } else if (!strncmp(str, "strict", 6)) {
 596                         pr_info("Disable batched IOTLB flush\n");
 597                         intel_iommu_strict = 1;
 598                 } else if (!strncmp(str, "sp_off", 6)) {
 599                         pr_info("Disable supported super page\n");
 600                         intel_iommu_superpage = 0;
 601                 } else if (!strncmp(str, "ecs_off", 7)) {
 602                         printk(KERN_INFO
 603                                 "Intel-IOMMU: disable extended context table support\n");
 604                         intel_iommu_ecs = 0;
 605                 } else if (!strncmp(str, "pasid28", 7)) {
 606                         printk(KERN_INFO
 607                                 "Intel-IOMMU: enable pre-production PASID support\n");
 608                         intel_iommu_pasid28 = 1;
 609                         iommu_identity_mapping |= IDENTMAP_GFX;
 610                 }
 611
 612                 str += strcspn(str, ",");
 613                 while (*str == ',')
 614                         str++;
 615         }
 616         return 0;
 617 }
 618 __setup("intel_iommu=", intel_iommu_setup);
 619
 620 static struct kmem_cache *iommu_domain_cache;
 621 static struct kmem_cache *iommu_devinfo_cache;
 622
 623 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 624 {
 625         struct dmar_domain **domains;
 626         int idx = did >> 8;
 627
 628         domains = iommu->domains[idx];
 629         if (!domains)
 630                 return NULL;
 631
 632         return domains[did & 0xff];
 633 }
 634
 635 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 636                              struct dmar_domain *domain)
 637 {
 638         struct dmar_domain **domains;
 639         int idx = did >> 8;
 640
 641         if (!iommu->domains[idx]) {
 642                 size_t size = 256 * sizeof(struct dmar_domain *);
 643                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 644         }
 645
 646         domains = iommu->domains[idx];
 647         if (WARN_ON(!domains))
 648                 return;
 649         else
 650                 domains[did & 0xff] = domain;
 651 }
 652
 653 static inline void *alloc_pgtable_page(int node)
 654 {
 655         struct page *page;
 656         void *vaddr = NULL;
 657
 658         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 659         if (page)
 660                 vaddr = page_address(page);
 661         return vaddr;
 662 }
 663
 664 static inline void free_pgtable_page(void *vaddr)
 665 {
 666         free_page((unsigned long)vaddr);
 667 }
 668
 669 static inline void *alloc_domain_mem(void)
 670 {
 671         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 672 }
 673
 674 static void free_domain_mem(void *vaddr)
 675 {
 676         kmem_cache_free(iommu_domain_cache, vaddr);
 677 }
 678
 679 static inline void * alloc_devinfo_mem(void)
 680 {
 681         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 682 }
 683
 684 static inline void free_devinfo_mem(void *vaddr)
 685 {
 686         kmem_cache_free(iommu_devinfo_cache, vaddr);
 687 }
 688
 689 static inline int domain_type_is_vm(struct dmar_domain *domain)
 690 {
 691         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 692 }
 693
 694 static inline int domain_type_is_si(struct dmar_domain *domain)
 695 {
 696         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 697 }
 698
 699 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 700 {
 701         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 702                                 DOMAIN_FLAG_STATIC_IDENTITY);
 703 }
 704
 705 static inline int domain_pfn_supported(struct dmar_domain *domain,
 706                                        unsigned long pfn)
 707 {
 708         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 709
 710         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 711 }
 712
 713 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 714 {
 715         unsigned long sagaw;
 716         int agaw = -1;
 717
 718         sagaw = cap_sagaw(iommu->cap);
 719         for (agaw = width_to_agaw(max_gaw);
 720              agaw >= 0; agaw--) {
 721                 if (test_bit(agaw, &sagaw))
 722                         break;
 723         }
 724
 725         return agaw;
 726 }
 727
 728 /*
 729  * Calculate max SAGAW for each iommu.
 730  */
 731 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 732 {
 733         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 734 }
 735
 736 /*
 737  * calculate agaw for each iommu.
 738  * "SAGAW" may be different across iommus, use a default agaw, and
 739  * get a supported less agaw for iommus that don't support the default agaw.
 740  */
 741 int iommu_calculate_agaw(struct intel_iommu *iommu)
 742 {
 743         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 744 }
 745
 746 /* This functionin only returns single iommu in a domain */
 747 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 748 {
 749         int iommu_id;
 750
 751         /* si_domain and vm domain should not get here. */
 752         BUG_ON(domain_type_is_vm_or_si(domain));
 753         for_each_domain_iommu(iommu_id, domain)
 754                 break;
 755
 756         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 757                 return NULL;
 758
 759         return g_iommus[iommu_id];
 760 }
 761
 762 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 763 {
 764         struct dmar_drhd_unit *drhd;
 765         struct intel_iommu *iommu;
 766         bool found = false;
 767         int i;
 768
 769         domain->iommu_coherency = 1;
 770
 771         for_each_domain_iommu(i, domain) {
 772                 found = true;
 773                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 774                         domain->iommu_coherency = 0;
 775                         break;
 776                 }
 777         }
 778         if (found)
 779                 return;
 780
 781         /* No hardware attached; use lowest common denominator */
 782         rcu_read_lock();
 783         for_each_active_iommu(iommu, drhd) {
 784                 if (!ecap_coherent(iommu->ecap)) {
 785                         domain->iommu_coherency = 0;
 786                         break;
 787                 }
 788         }
 789         rcu_read_unlock();
 790 }
 791
 792 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 793 {
 794         struct dmar_drhd_unit *drhd;
 795         struct intel_iommu *iommu;
 796         int ret = 1;
 797
 798         rcu_read_lock();
 799         for_each_active_iommu(iommu, drhd) {
 800                 if (iommu != skip) {
 801                         if (!ecap_sc_support(iommu->ecap)) {
 802                                 ret = 0;
 803                                 break;
 804                         }
 805                 }
 806         }
 807         rcu_read_unlock();
 808
 809         return ret;
 810 }
 811
 812 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 813 {
 814         struct dmar_drhd_unit *drhd;
 815         struct intel_iommu *iommu;
 816         int mask = 0xf;
 817
 818         if (!intel_iommu_superpage) {
 819                 return 0;
 820         }
 821
 822         /* set iommu_superpage to the smallest common denominator */
 823         rcu_read_lock();
 824         for_each_active_iommu(iommu, drhd) {
 825                 if (iommu != skip) {
 826                         mask &= cap_super_page_val(iommu->cap);
 827                         if (!mask)
 828                                 break;
 829                 }
 830         }
 831         rcu_read_unlock();
 832
 833         return fls(mask);
 834 }
 835
 836 /* Some capabilities may be different across iommus */
 837 static void domain_update_iommu_cap(struct dmar_domain *domain)
 838 {
 839         domain_update_iommu_coherency(domain);
 840         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 841         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 842 }
 843
 844 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 845                                                        u8 bus, u8 devfn, int alloc)
 846 {
 847         struct root_entry *root = &iommu->root_entry[bus];
 848         struct context_entry *context;
 849         u64 *entry;
 850
 851         entry = &root->lo;
 852         if (ecs_enabled(iommu)) {
 853                 if (devfn >= 0x80) {
 854                         devfn -= 0x80;
 855                         entry = &root->hi;
 856                 }
 857                 devfn *= 2;
 858         }
 859         if (*entry & 1)
 860                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 861         else {
 862                 unsigned long phy_addr;
 863                 if (!alloc)
 864                         return NULL;
 865
 866                 context = alloc_pgtable_page(iommu->node);
 867                 if (!context)
 868                         return NULL;
 869
 870                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 871                 phy_addr = virt_to_phys((void *)context);
 872                 *entry = phy_addr | 1;
 873                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 874         }
 875         return &context[devfn];
 876 }
 877
 878 static int iommu_dummy(struct device *dev)
 879 {
 880         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 881 }
 882
 /*
  * device_to_iommu - find the active IOMMU whose DRHD scope covers @dev
  * @dev:   device to look up
  * @bus:   out: bus number to use for @dev (written only on success)
  * @devfn: out: device/function number to use for @dev (written only on success)
  *
  * Returns the matching IOMMU, or NULL if @dev is a dummy device (see
  * iommu_dummy()) or no active DRHD unit matches. The scope tables are walked
  * under rcu_read_lock().
  */
 883 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 884 {
 885         struct dmar_drhd_unit *drhd = NULL;
 886         struct intel_iommu *iommu;
 887         struct device *tmp;
 888         struct pci_dev *ptmp, *pdev = NULL;
 889         u16 segment = 0;
 890         int i;
 891
 892         if (iommu_dummy(dev))
 893                 return NULL;
 894
 895         if (dev_is_pci(dev)) {
 896                 struct pci_dev *pf_pdev;
 897
 898                 pdev = to_pci_dev(dev);
 899                 /* VFs aren't listed in scope tables; we need to look up
 900                  * the PF instead to find the IOMMU. */
 901                 pf_pdev = pci_physfn(pdev);
 902                 dev = &pf_pdev->dev;
 903                 segment = pci_domain_nr(pdev->bus);
 904         } else if (has_acpi_companion(dev))
                     /* Non-PCI device: match on its ACPI companion device instead. */
 905                 dev = &ACPI_COMPANION(dev)->dev;
 906
 907         rcu_read_lock();
 908         for_each_active_iommu(iommu, drhd) {
                     /* For PCI devices, only units on the same PCI segment can match. */
 909                 if (pdev && segment != drhd->segment)
 910                         continue;
 911
 912                 for_each_active_dev_scope(drhd->devices,
 913                                           drhd->devices_cnt, i, tmp) {
 914                         if (tmp == dev) {
 915                                 /* For a VF use its original BDF# not that of the PF
 916                                  * which we used for the IOMMU lookup. Strictly speaking
 917                                  * we could do this for all PCI devices; we only need to
 918                                  * get the BDF# from the scope table for ACPI matches. */
 919                                 if (pdev && pdev->is_virtfn)
 920                                         goto got_pdev;
 921
 922                                 *bus = drhd->devices[i].bus;
 923                                 *devfn = drhd->devices[i].devfn;
 924                                 goto out;
 925                         }
 926
 927                         if (!pdev || !dev_is_pci(tmp))
 928                                 continue;
 929
                             /*
                              * The scope entry is a PCI device with a subordinate
                              * bus (a bridge): it matches when pdev's bus number
                              * lies within that subordinate bus range.
                              */
 930                         ptmp = to_pci_dev(tmp);
 931                         if (ptmp->subordinate &&
 932                             ptmp->subordinate->number <= pdev->bus->number &&
 933                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 934                                 goto got_pdev;
 935                 }
 936
                     /*
                      * No scope entry matched: a unit flagged include_all still
                      * claims any PCI device on its segment.
                      */
 937                 if (pdev && drhd->include_all) {
 938                 got_pdev:
 939                         *bus = pdev->bus->number;
 940                         *devfn = pdev->devfn;
 941                         goto out;
 942                 }
 943         }
             /* Loop finished without a match. */
 944         iommu = NULL;
 945  out:
 946         rcu_read_unlock();
 947
 948         return iommu;
 949 }
 950
 /*
  * Flush @size bytes at @addr with clflush_cache_range(), unless
  * domain->iommu_coherency is set, in which case nothing is done.
  */
 951 static void domain_flush_cache(struct dmar_domain *domain,
 952                                void *addr, int size)
 953 {
 954         if (!domain->iommu_coherency)
 955                 clflush_cache_range(addr, size);
 956 }
 957
 /*
  * Return the result of context_present() for the context entry of
  * (@bus, @devfn), or 0 when no context table exists for @bus. The lookup is
  * done under iommu->lock and never allocates (alloc == 0).
  */
 958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 959 {
 960         struct context_entry *context;
 961         int ret = 0;
 962         unsigned long flags;
 963
 964         spin_lock_irqsave(&iommu->lock, flags);
 965         context = iommu_context_addr(iommu, bus, devfn, 0);
 966         if (context)
 967                 ret = context_present(context);
 968         spin_unlock_irqrestore(&iommu->lock, flags);
 969         return ret;
 970 }
 971
 /*
  * Clear the single context entry for (@bus, @devfn), if a context table
  * exists for @bus, and flush that entry from the CPU cache. Despite the
  * name, only one entry is cleared; the table itself is not freed.
  * Runs under iommu->lock.
  */
 972 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 973 {
 974         struct context_entry *context;
 975         unsigned long flags;
 976
 977         spin_lock_irqsave(&iommu->lock, flags);
 978         context = iommu_context_addr(iommu, bus, devfn, 0);
 979         if (context) {
 980                 context_clear_entry(context);
 981                 __iommu_flush_cache(iommu, context, sizeof(*context));
 982         }
 983         spin_unlock_irqrestore(&iommu->lock, flags);
 984 }
 985
 /*
  * Free every context table referenced from the root table, then the root
  * table itself, and set iommu->root_entry to NULL. Does nothing if there is
  * no root table. Runs under iommu->lock.
  */
 986 static void free_context_table(struct intel_iommu *iommu)
 987 {
 988         int i;
 989         unsigned long flags;
 990         struct context_entry *context;
 991
 992         spin_lock_irqsave(&iommu->lock, flags);
 993         if (!iommu->root_entry) {
 994                 goto out;
 995         }
 996         for (i = 0; i < ROOT_ENTRY_NR; i++) {
                     /*
                      * devfn 0 resolves to element 0 of the table, i.e. the
                      * start of the page that was allocated for it.
                      */
 997                 context = iommu_context_addr(iommu, i, 0, 0);
 998                 if (context)
 999                         free_pgtable_page(context);
1000
1001                 if (!ecs_enabled(iommu))
1002                         continue;
1003
                     /*
                      * With ECS, devfn 0x80 resolves to element 0 of the second
                      * table (the one referenced by root->hi).
                      */
1004                 context = iommu_context_addr(iommu, i, 0x80, 0);
1005                 if (context)
1006                         free_pgtable_page(context);
1007
1008         }
1009         free_pgtable_page(iommu->root_entry);
1010         iommu->root_entry = NULL;
1011 out:
1012         spin_unlock_irqrestore(&iommu->lock, flags);
1013 }
1014
/*
 * pfn_to_dma_pte - walk the domain page table to the PTE that maps @pfn
 * @domain:       domain whose page table (domain->pgd) is walked
 * @pfn:          DMA page frame number to look up
 * @target_level: in/out. Non-zero on entry: walk down to that level,
 *                allocating missing intermediate tables on the way.
 *                Zero on entry: pure lookup - stop at the first superpage or
 *                non-present entry (or at level 1), allocate nothing, and
 *                store the level reached back into *@target_level.
 *
 * Returns the PTE, or NULL if @pfn is not supported by the domain or a
 * page-table page could not be allocated. BUGs if the domain has no pgd.
 */
1015 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1016                                       unsigned long pfn, int *target_level)
1017 {
1018         struct dma_pte *parent, *pte = NULL;
1019         int level = agaw_to_level(domain->agaw);
1020         int offset;
1021
1022         BUG_ON(!domain->pgd);
1023
1024         if (!domain_pfn_supported(domain, pfn))
1025                 /* Address beyond IOMMU's addressing capabilities. */
1026                 return NULL;
1027
1028         parent = domain->pgd;
1029
1030         while (1) {
1031                 void *tmp_page;
1032
1033                 offset = pfn_level_offset(pfn, level);
1034                 pte = &parent[offset];
                     /* Lookup-only mode: stop at the first superpage or hole. */
1035                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1036                         break;
1037                 if (level == *target_level)
1038                         break;
1039
                     /* Missing intermediate table: allocate one and try to install it. */
1040                 if (!dma_pte_present(pte)) {
1041                         uint64_t pteval;
1042
1043                         tmp_page = alloc_pgtable_page(domain->nid);
1044
1045                         if (!tmp_page)
1046                                 return NULL;
1047
                             /* Flush the new page before it can become reachable. */
1048                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1049                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                             /*
                              * Install only if the entry is still zero; a non-zero
                              * return means another walker got there first.
                              */
1050                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1051                                 /* Someone else set it while we were thinking; use theirs. */
1052                                 free_pgtable_page(tmp_page);
1053                         else
1054                                 domain_flush_cache(domain, pte, sizeof(*pte));
1055                 }
1056                 if (level == 1)
1057                         break;
1058
                     /* Descend into the table this entry points to. */
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 level--;
1061         }
1062
             /* Report where a lookup-only walk ended. */
1063         if (!*target_level)
1064                 *target_level = level;
1065
1066         return pte;
1067 }
1068
1069
1070 /* return address's pte at specific level */
/*
 * @domain:     domain whose page table is walked (never modified here)
 * @pfn:        DMA page frame number to look up
 * @level:      level of the PTE the caller wants
 * @large_page: out; written only when the walk stops above @level
 *
 * The walk starts at the top level and descends towards @level.
 * Returns:
 *  - the PTE at @level when the walk reaches it (*@large_page untouched);
 *  - a superpage PTE found above @level, with *@large_page set to its level;
 *  - NULL when a non-present entry is met, with *@large_page set to the level
 *    of that entry;
 *  - NULL with *@large_page untouched when @level exceeds the top level.
 */
1071 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1072                                          unsigned long pfn,
1073                                          int level, int *large_page)
1074 {
1075         struct dma_pte *parent, *pte = NULL;
             /* 'total' starts as the top level and is then used as the current level. */
1076         int total = agaw_to_level(domain->agaw);
1077         int offset;
1078
1079         parent = domain->pgd;
1080         while (level <= total) {
1081                 offset = pfn_level_offset(pfn, total);
1082                 pte = &parent[offset];
1083                 if (level == total)
1084                         return pte;
1085
1086                 if (!dma_pte_present(pte)) {
1087                         *large_page = total;
1088                         break;
1089                 }
1090
1091                 if (dma_pte_superpage(pte)) {
1092                         *large_page = total;
1093                         return pte;
1094                 }
1095
1096                 parent = phys_to_virt(dma_pte_addr(pte));
1097                 total--;
1098         }
1099         return NULL;
1100 }
1101
1102 /* clear last level pte; a TLB flush should follow */
/*
 * @domain:    domain whose page table is modified
 * @start_pfn: first DMA pfn of the range to clear
 * @last_pfn:  last DMA pfn of the range to clear (inclusive)
 *
 * Clears the PTEs that map [@start_pfn, @last_pfn] and flushes the touched
 * PTEs with domain_flush_cache(). No page-table pages are freed here.
 * BUGs if either pfn is unsupported by the domain or if the range is inverted.
 */
1103 static void dma_pte_clear_range(struct dmar_domain *domain,
1104                                 unsigned long start_pfn,
1105                                 unsigned long last_pfn)
1106 {
             /* int, not unsigned: dma_pfn_level_pte() takes an 'int *large_page'. */
1107         int large_page = 1;
1108         struct dma_pte *first_pte, *pte;
1109
1110         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112         BUG_ON(start_pfn > last_pfn);
1113
1114         /* we don't need lock here; nobody else touches the iova range */
1115         do {
1116                 large_page = 1;
1117                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1118                 if (!pte) {
                             /*
                              * Nothing is mapped here: large_page now holds the
                              * level of the non-present entry, so skip ahead to
                              * the next boundary of the level above it.
                              */
1119                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1120                         continue;
1121                 }
                     /*
                      * Clear consecutive PTEs until the range ends or the end of
                      * this page-table page is reached. Each PTE covers
                      * lvl_to_nr_pages(large_page) pfns (more than one when the
                      * lookup returned a superpage PTE).
                      */
1122                 do {
1123                         dma_clear_pte(pte);
1124                         start_pfn += lvl_to_nr_pages(large_page);
1125                         pte++;
1126                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1127
                     /* One flush covering every PTE cleared in this table page. */
1128                 domain_flush_cache(domain, first_pte,
1129                                    (void *)pte - (void *)first_pte);
1130
             /*
              * NOTE(review): the 'start_pfn &&' test presumably stops the loop
              * if start_pfn wrapped around to 0 - confirm.
              */
1131         } while (start_pfn && start_pfn <= last_pfn);
1132 }
1133
/*
 * dma_pte_free_level - free page-table pages below one table
 * @domain:    domain that owns the page table
 * @level:     level of the table @pte points into
 * @pte:       base of the table at @level
 * @pfn:       first pfn covered by that table
 * @start_pfn: first pfn of the range being torn down
 * @last_pfn:  last pfn of the range being torn down (inclusive)
 *
 * For each present, non-superpage entry in the range, recurse into the child
 * table (only for @level > 2) and then free the child table page itself - but
 * only when [@start_pfn, @last_pfn] covers everything that entry maps.
 */
1134 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1135                                struct dma_pte *pte, unsigned long pfn,
1136                                unsigned long start_pfn, unsigned long last_pfn)
1137 {
             /* Start at the first entry of this table that intersects the range. */
1138         pfn = max(start_pfn, pfn);
1139         pte = &pte[pfn_level_offset(pfn, level)];
1140
1141         do {
1142                 unsigned long level_pfn;
1143                 struct dma_pte *level_pte;
1144
                     /* Only entries that point at a child table are of interest. */
1145                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1146                         goto next;
1147
                     /* First pfn mapped by this entry, and the child table it references. */
1148                 level_pfn = pfn & level_mask(level);
1149                 level_pte = phys_to_virt(dma_pte_addr(pte));
1150
                     /* At level 2 the child is a level-1 table; no recursion into it. */
1151                 if (level > 2)
1152                         dma_pte_free_level(domain, level - 1, level_pte,
1153                                            level_pfn, start_pfn, last_pfn);
1154
1155                 /* If range covers entire pagetable, free it */
1156                 if (!(start_pfn > level_pfn ||
1157                       last_pfn < level_pfn + level_size(level) - 1)) {
1158                         dma_clear_pte(pte);
1159                         domain_flush_cache(domain, pte, sizeof(*pte));
1160                         free_pgtable_page(level_pte);
1161                 }
1162 next:
1163                 pfn += level_size(level);
1164         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1165 }
1166
1167 /* clear last level (leaf) ptes and free page table pages. */
/*
 * @domain:    domain whose page table is torn down
 * @start_pfn: first DMA pfn of the range
 * @last_pfn:  last DMA pfn of the range (inclusive)
 *
 * Clears the PTEs for the range, frees every lower-level table that the range
 * fully covers, and frees domain->pgd (setting it to NULL) when the range is
 * the domain's whole address space. BUGs on an unsupported or inverted range.
 *
 * NOTE(review): pages are freed here without any IOTLB flush in between;
 * compare domain_unmap(), which defers freeing until after the flush.
 */
1168 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1169                                    unsigned long start_pfn,
1170                                    unsigned long last_pfn)
1171 {
1172         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174         BUG_ON(start_pfn > last_pfn);
1175
1176         dma_pte_clear_range(domain, start_pfn, last_pfn);
1177
1178         /* We don't need lock here; nobody else touches the iova range */
1179         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1180                            domain->pgd, 0, start_pfn, last_pfn);
1181
1182         /* free pgd */
1183         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184                 free_pgtable_page(domain->pgd);
1185                 domain->pgd = NULL;
1186         }
1187 }
1188
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
/*
 * @level is the level of the page being listed (callers pass the parent's
 * level minus one). The page is pushed onto @freelist through page->freelist;
 * for @level > 1, every table referenced by a present, non-superpage entry in
 * it is listed recursively as well. Returns the new head of the list.
 */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196                                             int level, struct dma_pte *pte,
1197                                             struct page *freelist)
1198 {
1199         struct page *pg;
1200
             /* Push the page referenced by the parent PTE onto the list. */
1201         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202         pg->freelist = freelist;
1203         freelist = pg;
1204
1205         if (level == 1)
1206                 return freelist;
1207
             /* From here on, 'pte' iterates over the entries of that page. */
1208         pte = page_address(pg);
1209         do {
1210                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211                         freelist = dma_pte_list_pagetables(domain, level - 1,
1212                                                            pte, freelist);
1213                 pte++;
1214         } while (!first_pte_in_page(pte));
1215
1216         return freelist;
1217 }
1218
/*
 * dma_pte_clear_level - clear the PTEs of a range within one table
 * @domain:    domain that owns the page table
 * @level:     level of the table @pte points into
 * @pte:       base of the table at @level
 * @pfn:       first pfn covered by that table
 * @start_pfn: first pfn of the range being unmapped
 * @last_pfn:  last pfn of the range being unmapped (inclusive)
 * @freelist:  list of page-table pages collected so far
 *
 * Entries fully covered by the range are cleared, and any table below them is
 * added to the free list rather than freed. Partially covered entries are
 * recursed into. Cleared entries are flushed with one domain_flush_cache()
 * call. Returns the updated free list.
 */
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220                                         struct dma_pte *pte, unsigned long pfn,
1221                                         unsigned long start_pfn,
1222                                         unsigned long last_pfn,
1223                                         struct page *freelist)
1224 {
             /* First and last entries cleared in this table, for the flush below. */
1225         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226
1227         pfn = max(start_pfn, pfn);
1228         pte = &pte[pfn_level_offset(pfn, level)];
1229
1230         do {
1231                 unsigned long level_pfn;
1232
1233                 if (!dma_pte_present(pte))
1234                         goto next;
1235
1236                 level_pfn = pfn & level_mask(level);
1237
1238                 /* If range covers entire pagetable, free it */
1239                 if (start_pfn <= level_pfn &&
1240                     last_pfn >= level_pfn + level_size(level) - 1) {
1241                         /* These subordinate page tables are going away entirely. Don't
1242                            bother to clear them; we're just going to *free* them. */
1243                         if (level > 1 && !dma_pte_superpage(pte))
1244                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245
1246                         dma_clear_pte(pte);
1247                         if (!first_pte)
1248                                 first_pte = pte;
1249                         last_pte = pte;
1250                 } else if (level > 1) {
1251                         /* Recurse down into a level that isn't *entirely* obsolete */
1252                         freelist = dma_pte_clear_level(domain, level - 1,
1253                                                        phys_to_virt(dma_pte_addr(pte)),
1254                                                        level_pfn, start_pfn, last_pfn,
1255                                                        freelist);
1256                 }
1257 next:
1258                 pfn += level_size(level);
1259         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260
             /* Flush from the first cleared entry through the last one, inclusive. */
1261         if (first_pte)
1262                 domain_flush_cache(domain, first_pte,
1263                                    (void *)++last_pte - (void *)first_pte);
1264
1265         return freelist;
1266 }
1267
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
/*
 * @domain:    domain to unmap from
 * @start_pfn: first DMA pfn of the range
 * @last_pfn:  last DMA pfn of the range (inclusive)
 *
 * Clears the mappings for the range and returns the list of page-table pages
 * that became unused (linked through page->freelist, NULL if none). When the
 * range is the domain's whole address space, the pgd page is added to the
 * list and domain->pgd is set to NULL. The list can be released with
 * dma_free_pagelist(). BUGs on an unsupported or inverted range.
 */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272                                  unsigned long start_pfn,
1273                                  unsigned long last_pfn)
1274 {
1275         struct page *freelist = NULL;
1276
1277         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1278         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1279         BUG_ON(start_pfn > last_pfn);
1280
1281         /* we don't need lock here; nobody else touches the iova range */
1282         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1283                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1284
1285         /* free pgd */
1286         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287                 struct page *pgd_page = virt_to_page(domain->pgd);
1288                 pgd_page->freelist = freelist;
1289                 freelist = pgd_page;
1290
1291                 domain->pgd = NULL;
1292         }
1293
1294         return freelist;
1295 }
1296
/*
 * Free every page on @freelist, a list linked through page->freelist such as
 * the one returned by domain_unmap(). A NULL list is a no-op.
 */
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299         struct page *pg;
1300
1301         while ((pg = freelist)) {
                     /* Read the link before the page is freed. */
1302                 freelist = pg->freelist;
1303                 free_pgtable_page(page_address(pg));
1304         }
1305 }
1306
1307 /* iommu handling */
/*
 * Allocate a page for the root table on iommu->node, flush it from the CPU
 * cache, and store it in iommu->root_entry under iommu->lock.
 * Returns 0 on success or -ENOMEM if the page cannot be allocated.
 */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310         struct root_entry *root;
1311         unsigned long flags;
1312
1313         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314         if (!root) {
1315                 pr_err("Allocating root entry for %s failed\n",
1316                         iommu->name);
1317                 return -ENOMEM;
1318         }
1319
1320         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1321
1322         spin_lock_irqsave(&iommu->lock, flags);
1323         iommu->root_entry = root;
1324         spin_unlock_irqrestore(&iommu->lock, flags);
1325
1326         return 0;
1327 }
1328
/*
 * Write the physical address of iommu->root_entry to DMAR_RTADDR_REG (with
 * DMA_RTADDR_RTT set when ECS is enabled), issue DMA_GCMD_SRTP through the
 * global command register, and wait for DMA_GSTS_RTPS in the status register.
 * All register accesses are made under iommu->register_lock.
 */
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331         u64 addr;
1332         u32 sts;
1333         unsigned long flag;
1334
1335         addr = virt_to_phys(iommu->root_entry);
1336         if (ecs_enabled(iommu))
1337                 addr |= DMA_RTADDR_RTT;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341
             /* SRTP is OR-ed in for this write only; iommu->gcmd is left unchanged. */
1342         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343
1344         /* Make sure hardware complete it */
1345         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346                       readl, (sts & DMA_GSTS_RTPS), sts);
1347
1348         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 }
1350
/*
 * Issue a write-buffer flush (DMA_GCMD_WBF) and wait until DMA_GSTS_WBFS is
 * clear in the status register. Returns without doing anything unless
 * rwbf_quirk is set or the capability register reports cap_rwbf().
 */
1351 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1352 {
1353         u32 val;
1354         unsigned long flag;
1355
1356         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1357                 return;
1358
1359         raw_spin_lock_irqsave(&iommu->register_lock, flag);
             /* WBF is OR-ed in for this write only; iommu->gcmd is left unchanged. */
1360         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1361
1362         /* Make sure hardware complete it */
1363         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1364                       readl, (!(val & DMA_GSTS_WBFS)), val);
1365
1366         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1367 }
1368
1369 /* Invalidate the context cache through DMAR_CCMD_REG and wait for completion. */
/*
 * @iommu:         IOMMU to operate on
 * @did:           domain id (used by domain- and device-selective requests)
 * @source_id:     source id (device-selective requests only)
 * @function_mask: function mask (device-selective requests only)
 * @type:          DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL or
 *                 DMA_CCMD_DEVICE_INVL; any other value hits BUG()
 *
 * Returns nothing. The register write and the wait are done under
 * iommu->register_lock.
 */
1370 static void __iommu_flush_context(struct intel_iommu *iommu,
1371                                   u16 did, u16 source_id, u8 function_mask,
1372                                   u64 type)
1373 {
1374         u64 val = 0;
1375         unsigned long flag;
1376
1377         switch (type) {
1378         case DMA_CCMD_GLOBAL_INVL:
1379                 val = DMA_CCMD_GLOBAL_INVL;
1380                 break;
1381         case DMA_CCMD_DOMAIN_INVL:
1382                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1383                 break;
1384         case DMA_CCMD_DEVICE_INVL:
1385                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1386                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1387                 break;
1388         default:
1389                 BUG();
1390         }
             /* ICC is set in the command; the wait below polls until it reads back clear. */
1391         val |= DMA_CCMD_ICC;
1392
1393         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1394         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1395
1396         /* Make sure hardware complete it */
1397         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1398                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1399
1400         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1401 }
1402
1403 /* Invalidate IOTLB entries through the IOTLB registers and wait for completion. */
/*
 * @iommu:      IOMMU to operate on
 * @did:        domain id (domain- and page-selective requests)
 * @addr:       address for a page-selective request; the caller may have
 *              OR-ed the IH bit into it
 * @size_order: address-mask order for a page-selective request
 * @type:       DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH or DMA_TLB_PSI_FLUSH;
 *              any other value hits BUG()
 *
 * Returns nothing. After the wait, the granularity reported back by the
 * hardware is checked: zero is logged as an error, and a value different
 * from the requested one is logged at debug level.
 */
1404 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1405                                 u64 addr, unsigned int size_order, u64 type)
1406 {
1407         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1408         u64 val = 0, val_iva = 0;
1409         unsigned long flag;
1410
1411         switch (type) {
1412         case DMA_TLB_GLOBAL_FLUSH:
1413                 /* global flush doesn't need set IVA_REG */
1414                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1415                 break;
1416         case DMA_TLB_DSI_FLUSH:
1417                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1418                 break;
1419         case DMA_TLB_PSI_FLUSH:
1420                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1421                 /* IH bit is passed in as part of address */
1422                 val_iva = size_order | addr;
1423                 break;
1424         default:
1425                 BUG();
1426         }
1427         /* Note: set drain read/write */
1428 #if 0
1429         /*
1430          * This is probably to be super secure.. Looks like we can
1431          * ignore it without any impact.
1432          */
1433         if (cap_read_drain(iommu->cap))
1434                 val |= DMA_TLB_READ_DRAIN;
1435 #endif
1436         if (cap_write_drain(iommu->cap))
1437                 val |= DMA_TLB_WRITE_DRAIN;
1438
1439         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1440         /* Note: Only uses first TLB reg currently */
             /* The address register is written first, then the command register at +8. */
1441         if (val_iva)
1442                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1443         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1444
1445         /* Make sure hardware complete it */
1446         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1447                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1448
1449         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1450
             /*
              * NOTE(review): 'val' is presumably overwritten by IOMMU_WAIT_OP
              * with the last value read from the register, so the checks below
              * look at the hardware's response - confirm against the macro.
              */
1451         /* check IOTLB invalidation granularity */
1452         if (DMA_TLB_IAIG(val) == 0)
1453                 pr_err("Flush IOTLB failed\n");
1454         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1455                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1456                         (unsigned long long)DMA_TLB_IIRG(type),
1457                         (unsigned long long)DMA_TLB_IAIG(val));
1458 }
1459
/*
 * iommu_support_dev_iotlb - find a device that can use a device IOTLB
 * @domain: domain whose device list is searched
 * @iommu:  IOMMU the device must be attached to
 * @bus:    bus number of the device
 * @devfn:  device/function number of the device
 *
 * Returns the device_domain_info of the matching device if it has
 * ats_supported set and a non-NULL dev. Returns NULL otherwise, including
 * when @iommu has no iommu->qi or when no entry matches.
 * Must be called with device_domain_lock held (asserted).
 */
1460 static struct device_domain_info *
1461 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1462                          u8 bus, u8 devfn)
1463 {
1464         struct device_domain_info *info;
1465
1466         assert_spin_locked(&device_domain_lock);
1467
1468         if (!iommu->qi)
1469                 return NULL;
1470
1471         list_for_each_entry(info, &domain->devices, link)
1472                 if (info->iommu == iommu && info->bus == bus &&
1473                     info->devfn == devfn) {
1474                         if (info->ats_supported && info->dev)
1475                                 return info;
                             /* Found the device but it does not qualify; stop searching. */
1476                         break;
1477                 }
1478
1479         return NULL;
1480 }
1481
/*
 * Recompute domain->has_iotlb_device: true when at least one PCI device in
 * domain->devices has pdev->ats_enabled set, false otherwise.
 * Must be called with device_domain_lock held (asserted).
 */
1482 static void domain_update_iotlb(struct dmar_domain *domain)
1483 {
1484         struct device_domain_info *info;
1485         bool has_iotlb_device = false;
1486
1487         assert_spin_locked(&device_domain_lock);
1488
1489         list_for_each_entry(info, &domain->devices, link) {
1490                 struct pci_dev *pdev;
1491
                     /* Entries without a device, or with a non-PCI device, are skipped. */
1492                 if (!info->dev || !dev_is_pci(info->dev))
1493                         continue;
1494
1495                 pdev = to_pci_dev(info->dev);
1496                 if (pdev->ats_enabled) {
1497                         has_iotlb_device = true;
1498                         break;
1499                 }
1500         }
1501
1502         domain->has_iotlb_device = has_iotlb_device;
1503 }
1504
/*
 * iommu_enable_dev_iotlb - enable ATS (and, with SVM, PASID/PRI) on a device
 * @info: device_domain_info of the device; NULL or a non-PCI device is a no-op
 *
 * Sets info->pfsid, then enables each feature the device supports and records
 * the result in info->pasid_enabled, info->pri_enabled and info->ats_enabled.
 * When ATS is enabled, the domain's has_iotlb_device flag is refreshed and
 * the ATS queue depth is cached in info->ats_qdep.
 * Must be called with device_domain_lock held (asserted).
 */
1505 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1506 {
1507         struct pci_dev *pdev;
1508
1509         assert_spin_locked(&device_domain_lock);
1510
1511         if (!info || !dev_is_pci(info->dev))
1512                 return;
1513
1514         pdev = to_pci_dev(info->dev);
1515         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1516          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1517          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1518          * reserved, which should be set to 0.
1519          */
1520         if (!ecap_dit(info->iommu->ecap))
1521                 info->pfsid = 0;
1522         else {
1523                 struct pci_dev *pf_pdev;
1524
1525                 /* pdev will be returned if device is not a vf */
1526                 pf_pdev = pci_physfn(pdev);
1527                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1528         }
1529
1530 #ifdef CONFIG_INTEL_IOMMU_SVM
1531         /* The PCIe spec, in its wisdom, declares that the behaviour of
1532            the device if you enable PASID support after ATS support is
1533            undefined. So always enable PASID support on devices which
1534            have it, even if we can't yet know if we're ever going to
1535            use it. */
             /*
              * Bit 0 of pasid_supported is masked off before the value is
              * passed on as the features argument.
              */
1536         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1537                 info->pasid_enabled = 1;
1538
             /* PRI is reset first and then enabled with an argument of 32. */
1539         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1540                 info->pri_enabled = 1;
1541 #endif
1542         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1543                 info->ats_enabled = 1;
1544                 domain_update_iotlb(info->domain);
1545                 info->ats_qdep = pci_ats_queue_depth(pdev);
1546         }
1547 }
1548
/*
 * iommu_disable_dev_iotlb - undo iommu_enable_dev_iotlb() for a device
 * @info: device_domain_info of the device; a non-PCI device is a no-op
 *
 * Disables ATS and (with SVM) PRI and PASID if they were enabled, clearing
 * the matching info->*_enabled flags. The domain's has_iotlb_device flag is
 * refreshed after ATS is disabled.
 * Must be called with device_domain_lock held (asserted).
 *
 * NOTE(review): unlike iommu_enable_dev_iotlb(), @info is not checked for
 * NULL here; callers presumably always pass a valid pointer - confirm.
 */
1549 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1550 {
1551         struct pci_dev *pdev;
1552
1553         assert_spin_locked(&device_domain_lock);
1554
1555         if (!dev_is_pci(info->dev))
1556                 return;
1557
1558         pdev = to_pci_dev(info->dev);
1559
1560         if (info->ats_enabled) {
1561                 pci_disable_ats(pdev);
1562                 info->ats_enabled = 0;
1563                 domain_update_iotlb(info->domain);
1564         }
1565 #ifdef CONFIG_INTEL_IOMMU_SVM
1566         if (info->pri_enabled) {
1567                 pci_disable_pri(pdev);
1568                 info->pri_enabled = 0;
1569         }
1570         if (info->pasid_enabled) {
1571                 pci_disable_pasid(pdev);
1572                 info->pasid_enabled = 0;
1573         }
1574 #endif
1575 }
1576
/*
 * iommu_flush_dev_iotlb - flush the device IOTLBs of a domain's ATS devices
 * @domain: domain whose devices are flushed
 * @addr:   address passed through to qi_flush_dev_iotlb()
 * @mask:   address mask passed through to qi_flush_dev_iotlb()
 *
 * Does nothing when domain->has_iotlb_device is false. Otherwise takes
 * device_domain_lock and calls qi_flush_dev_iotlb() for every device in the
 * domain that has info->ats_enabled set.
 */
1577 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1578                                   u64 addr, unsigned mask)
1579 {
1580         u16 sid, qdep;
1581         unsigned long flags;
1582         struct device_domain_info *info;
1583
1584         if (!domain->has_iotlb_device)
1585                 return;
1586
1587         spin_lock_irqsave(&device_domain_lock, flags);
1588         list_for_each_entry(info, &domain->devices, link) {
1589                 if (!info->ats_enabled)
1590                         continue;
1591
                     /* Source id: bus number in the high byte, devfn in the low byte. */
1592                 sid = info->bus << 8 | info->devfn;
1593                 qdep = info->ats_qdep;
1594                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1595                                 qdep, addr, mask);
1596         }
1597         spin_unlock_irqrestore(&device_domain_lock, flags);
1598 }
1599
/*
 * iommu_flush_iotlb_psi - page-selective IOTLB flush for a pfn range
 * @iommu:  IOMMU to flush
 * @domain: domain the range belongs to
 * @pfn:    first DMA pfn of the range
 * @pages:  number of pages in the range; must be non-zero (BUG otherwise)
 * @ih:     non-zero to set the invalidation hint bit (bit 6 of the address)
 * @map:    non-zero when the flush follows a non-present to present change
 *
 * Issues a page-selective flush, or falls back to a domain-selective flush
 * when PSI is unsupported or the range is too large. The device IOTLBs are
 * flushed afterwards, except in caching mode when @map is set.
 */
1600 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1601                                   struct dmar_domain *domain,
1602                                   unsigned long pfn, unsigned int pages,
1603                                   int ih, int map)
1604 {
1605         unsigned int mask;
1606         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1607         u16 did = domain->iommu_did[iommu->seq_id];
1608
1609         BUG_ON(pages == 0);
             /*
              * Computed only after the check above: __roundup_pow_of_two() is
              * undefined for 0, so evaluating it first could let the compiler
              * assume pages != 0 and drop the BUG_ON.
              */
1610         mask = ilog2(__roundup_pow_of_two(pages));
             /* The hint is carried in bit 6 of the address given to flush_iotlb. */
1611         if (ih)
1612                 ih = 1 << 6;
1613         /*
1614          * Fallback to domain selective flush if no PSI support or the size is
1615          * too big.
1616          * PSI requires page size to be 2 ^ x, and the base address is naturally
1617          * aligned to the size
1618          */
1619         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1620                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1621                                                 DMA_TLB_DSI_FLUSH);
1622         else
1623                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1624                                                 DMA_TLB_PSI_FLUSH);
1625
1626         /*
1627          * In caching mode, changes of pages from non-present to present require
1628          * flush. However, device IOTLB doesn't need to be flushed in this case.
1629          */
1630         if (!cap_caching_mode(iommu->cap) || !map)
1631                 iommu_flush_dev_iotlb(domain, addr, mask);
1632 }
1633
/*
 * Clear DMA_PMEN_EPM in DMAR_PMEN_REG and wait until DMA_PMEN_PRS reads back
 * clear. Done under iommu->register_lock.
 */
1634 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1635 {
1636         u32 pmen;
1637         unsigned long flags;
1638
1639         raw_spin_lock_irqsave(&iommu->register_lock, flags);
             /* Read-modify-write so that only the EPM bit is changed. */
1640         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1641         pmen &= ~DMA_PMEN_EPM;
1642         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1643
1644         /* wait for the protected region status bit to clear */
1645         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1646                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1647
1648         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1649 }
1650
/*
 * Set DMA_GCMD_TE in the cached command value iommu->gcmd, write it to the
 * global command register, and wait until the status register reports
 * DMA_GSTS_TES. Done under iommu->register_lock.
 */
1651 static void iommu_enable_translation(struct intel_iommu *iommu)
1652 {
1653         u32 sts;
1654         unsigned long flags;
1655
1656         raw_spin_lock_irqsave(&iommu->register_lock, flags);
             /* Unlike one-shot commands, TE is kept in iommu->gcmd for later writes. */
1657         iommu->gcmd |= DMA_GCMD_TE;
1658         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1659
1660         /* Make sure hardware complete it */
1661         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1662                       readl, (sts & DMA_GSTS_TES), sts);
1663
1664         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1665 }
1666
/*
 * Clear DMA_GCMD_TE in the cached command value iommu->gcmd, write it to the
 * global command register, and wait until DMA_GSTS_TES reads back clear in
 * the status register. Done under iommu->register_lock.
 */
1667 static void iommu_disable_translation(struct intel_iommu *iommu)
1668 {
1669         u32 sts;
1670         unsigned long flag;
1671
1672         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1673         iommu->gcmd &= ~DMA_GCMD_TE;
1674         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1675
1676         /* Make sure hardware complete it */
1677         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1678                       readl, (!(sts & DMA_GSTS_TES)), sts);
1679
1680         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1681 }
1682
1683
/*
 * iommu_init_domains - set up the domain-id bookkeeping of an IOMMU
 * @iommu: IOMMU to initialise
 *
 * Initialises iommu->lock, allocates the domain-id bitmap
 * (iommu->domain_ids) and the two-level domain pointer array
 * (iommu->domains), and reserves domain-id 0.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails. On failure both
 * iommu->domain_ids and iommu->domains are left NULL.
 */
1684 static int iommu_init_domains(struct intel_iommu *iommu)
1685 {
1686         u32 ndomains, nlongs;
1687         size_t size;
1688
1689         ndomains = cap_ndoms(iommu->cap);
1690         pr_debug("%s: Number of Domains supported <%d>\n",
1691                  iommu->name, ndomains);
1692         nlongs = BITS_TO_LONGS(ndomains);
1693
1694         spin_lock_init(&iommu->lock);
1695
1696         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1697         if (!iommu->domain_ids) {
1698                 pr_err("%s: Allocating domain id array failed\n",
1699                        iommu->name);
1700                 return -ENOMEM;
1701         }
1702
             /*
              * First level: one pointer per group of 256 domain ids. Only the
              * first 256-entry second-level array is allocated here.
              */
1703         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1704         iommu->domains = kzalloc(size, GFP_KERNEL);
1705
1706         if (iommu->domains) {
1707                 size = 256 * sizeof(struct dmar_domain *);
1708                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1709         }
1710
             /* Either level failed: release whatever was allocated so far. */
1711         if (!iommu->domains || !iommu->domains[0]) {
1712                 pr_err("%s: Allocating domain array failed\n",
1713                        iommu->name);
1714                 kfree(iommu->domain_ids);
1715                 kfree(iommu->domains);
1716                 iommu->domain_ids = NULL;
1717                 iommu->domains    = NULL;
1718                 return -ENOMEM;
1719         }
1720
1721
1722
1723         /*
1724          * If Caching mode is set, then invalid translations are tagged
1725          * with domain-id 0, hence we need to pre-allocate it. We also
1726          * use domain-id 0 as a marker for non-allocated domain-id, so
1727          * make sure it is not used for a real domain.
1728          */
1729         set_bit(0, iommu->domain_ids);
1730
1731         return 0;
1732 }
1733
1734 static void disable_dmar_iommu(struct intel_iommu *iommu)
1735 {
1736         struct device_domain_info *info, *tmp;
1737         unsigned long flags;
1738
1739         if (!iommu->domains || !iommu->domain_ids)
1740                 return;
1741
1742 again:
1743         spin_lock_irqsave(&device_domain_lock, flags);
1744         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1745                 struct dmar_domain *domain;
1746
1747                 if (info->iommu != iommu)
1748                         continue;
1749
1750                 if (!info->dev || !info->domain)
1751                         continue;
1752
1753                 domain = info->domain;
1754
1755                 __dmar_remove_one_dev_info(info);
1756
1757                 if (!domain_type_is_vm_or_si(domain)) {
1758                         /*
1759                          * The domain_exit() function  can't be called under
1760                          * device_domain_lock, as it takes this lock itself.
1761                          * So release the lock here and re-run the loop
1762                          * afterwards.
1763                          */
1764                         spin_unlock_irqrestore(&device_domain_lock, flags);
1765                         domain_exit(domain);
1766                         goto again;
1767                 }
1768         }
1769         spin_unlock_irqrestore(&device_domain_lock, flags);
1770
1771         if (iommu->gcmd & DMA_GCMD_TE)
1772                 iommu_disable_translation(iommu);
1773 }
1774
1775 static void free_dmar_iommu(struct intel_iommu *iommu)
1776 {
1777         if ((iommu->domains) && (iommu->domain_ids)) {
1778                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1779                 int i;
1780
1781                 for (i = 0; i < elems; i++)
1782                         kfree(iommu->domains[i]);
1783                 kfree(iommu->domains);
1784                 kfree(iommu->domain_ids);
1785                 iommu->domains = NULL;
1786                 iommu->domain_ids = NULL;
1787         }
1788
1789         g_iommus[iommu->seq_id] = NULL;
1790
1791         /* free context mapping */
1792         free_context_table(iommu);
1793
1794 #ifdef CONFIG_INTEL_IOMMU_SVM
1795         if (pasid_enabled(iommu)) {
1796                 if (ecap_prs(iommu->ecap))
1797                         intel_svm_finish_prq(iommu);
1798                 intel_svm_free_pasid_tables(iommu);
1799         }
1800 #endif
1801 }
1802
1803 static struct dmar_domain *alloc_domain(int flags)
1804 {
1805         struct dmar_domain *domain;
1806
1807         domain = alloc_domain_mem();
1808         if (!domain)
1809                 return NULL;
1810
1811         memset(domain, 0, sizeof(*domain));
1812         domain->nid = -1;
1813         domain->flags = flags;
1814         domain->has_iotlb_device = false;
1815         INIT_LIST_HEAD(&domain->devices);
1816
1817         return domain;
1818 }
1819
1820 /* Must be called with iommu->lock */
1821 static int domain_attach_iommu(struct dmar_domain *domain,
1822                                struct intel_iommu *iommu)
1823 {
1824         unsigned long ndomains;
1825         int num;
1826
1827         assert_spin_locked(&device_domain_lock);
1828         assert_spin_locked(&iommu->lock);
1829
1830         domain->iommu_refcnt[iommu->seq_id] += 1;
1831         domain->iommu_count += 1;
1832         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1833                 ndomains = cap_ndoms(iommu->cap);
1834                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1835
1836                 if (num >= ndomains) {
1837                         pr_err("%s: No free domain ids\n", iommu->name);
1838                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1839                         domain->iommu_count -= 1;
1840                         return -ENOSPC;
1841                 }
1842
1843                 set_bit(num, iommu->domain_ids);
1844                 set_iommu_domain(iommu, num, domain);
1845
1846                 domain->iommu_did[iommu->seq_id] = num;
1847                 domain->nid                      = iommu->node;
1848
1849                 domain_update_iommu_cap(domain);
1850         }
1851
1852         return 0;
1853 }
1854
1855 static int domain_detach_iommu(struct dmar_domain *domain,
1856                                struct intel_iommu *iommu)
1857 {
1858         int num, count = INT_MAX;
1859
1860         assert_spin_locked(&device_domain_lock);
1861         assert_spin_locked(&iommu->lock);
1862
1863         domain->iommu_refcnt[iommu->seq_id] -= 1;
1864         count = --domain->iommu_count;
1865         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1866                 num = domain->iommu_did[iommu->seq_id];
1867                 clear_bit(num, iommu->domain_ids);
1868                 set_iommu_domain(iommu, num, NULL);
1869
1870                 domain_update_iommu_cap(domain);
1871                 domain->iommu_did[iommu->seq_id] = 0;
1872         }
1873
1874         return count;
1875 }
1876
/*
 * IOVA ranges that must never be handed out for DMA. Filled in by
 * dmar_init_reserved_ranges() and copied into each domain's allocator by
 * domain_reserve_special_ranges().
 */
static struct iova_domain reserved_iova_list;
/* Dedicated lockdep class for reserved_iova_list's rbtree lock */
static struct lock_class_key reserved_rbtree_key;
1879
1880 static int dmar_init_reserved_ranges(void)
1881 {
1882         struct pci_dev *pdev = NULL;
1883         struct iova *iova;
1884         int i;
1885
1886         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1887                         DMA_32BIT_PFN);
1888
1889         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1890                 &reserved_rbtree_key);
1891
1892         /* IOAPIC ranges shouldn't be accessed by DMA */
1893         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1894                 IOVA_PFN(IOAPIC_RANGE_END));
1895         if (!iova) {
1896                 pr_err("Reserve IOAPIC range failed\n");
1897                 return -ENODEV;
1898         }
1899
1900         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1901         for_each_pci_dev(pdev) {
1902                 struct resource *r;
1903
1904                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1905                         r = &pdev->resource[i];
1906                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1907                                 continue;
1908                         iova = reserve_iova(&reserved_iova_list,
1909                                             IOVA_PFN(r->start),
1910                                             IOVA_PFN(r->end));
1911                         if (!iova) {
1912                                 pr_err("Reserve iova failed\n");
1913                                 return -ENODEV;
1914                         }
1915                 }
1916         }
1917         return 0;
1918 }
1919
1920 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1921 {
1922         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1923 }
1924
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int width = rem ? gaw + 9 - rem : gaw;

        return width > 64 ? 64 : width;
}
1938
1939 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1940                        int guest_width)
1941 {
1942         int adjust_width, agaw;
1943         unsigned long sagaw;
1944
1945         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1946                         DMA_32BIT_PFN);
1947         domain_reserve_special_ranges(domain);
1948
1949         /* calculate AGAW */
1950         if (guest_width > cap_mgaw(iommu->cap))
1951                 guest_width = cap_mgaw(iommu->cap);
1952         domain->gaw = guest_width;
1953         adjust_width = guestwidth_to_adjustwidth(guest_width);
1954         agaw = width_to_agaw(adjust_width);
1955         sagaw = cap_sagaw(iommu->cap);
1956         if (!test_bit(agaw, &sagaw)) {
1957                 /* hardware doesn't support it, choose a bigger one */
1958                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1959                 agaw = find_next_bit(&sagaw, 5, agaw);
1960                 if (agaw >= 5)
1961                         return -ENODEV;
1962         }
1963         domain->agaw = agaw;
1964
1965         if (ecap_coherent(iommu->ecap))
1966                 domain->iommu_coherency = 1;
1967         else
1968                 domain->iommu_coherency = 0;
1969
1970         if (ecap_sc_support(iommu->ecap))
1971                 domain->iommu_snooping = 1;
1972         else
1973                 domain->iommu_snooping = 0;
1974
1975         if (intel_iommu_superpage)
1976                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1977         else
1978                 domain->iommu_superpage = 0;
1979
1980         domain->nid = iommu->node;
1981
1982         /* always allocate the top pgd */
1983         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1984         if (!domain->pgd)
1985                 return -ENOMEM;
1986         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1987         return 0;
1988 }
1989
1990 static void domain_exit(struct dmar_domain *domain)
1991 {
1992         struct page *freelist = NULL;
1993
1994         /* Domain 0 is reserved, so dont process it */
1995         if (!domain)
1996                 return;
1997
1998         /* Flush any lazy unmaps that may reference this domain */
1999         if (!intel_iommu_strict) {
2000                 int cpu;
2001
2002                 for_each_possible_cpu(cpu)
2003                         flush_unmaps_timeout(cpu);
2004         }
2005
2006         /* Remove associated devices and clear attached or cached domains */
2007         rcu_read_lock();
2008         domain_remove_dev_info(domain);
2009         rcu_read_unlock();
2010
2011         /* destroy iovas */
2012         put_iova_domain(&domain->iovad);
2013
2014         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015
2016         dma_free_pagelist(freelist);
2017
2018         free_domain_mem(domain);
2019 }
2020
2021 static int domain_context_mapping_one(struct dmar_domain *domain,
2022                                       struct intel_iommu *iommu,
2023                                       u8 bus, u8 devfn)
2024 {
2025         u16 did = domain->iommu_did[iommu->seq_id];
2026         int translation = CONTEXT_TT_MULTI_LEVEL;
2027         struct device_domain_info *info = NULL;
2028         struct context_entry *context;
2029         unsigned long flags;
2030         struct dma_pte *pgd;
2031         int ret, agaw;
2032
2033         WARN_ON(did == 0);
2034
2035         if (hw_pass_through && domain_type_is_si(domain))
2036                 translation = CONTEXT_TT_PASS_THROUGH;
2037
2038         pr_debug("Set context mapping for %02x:%02x.%d\n",
2039                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2040
2041         BUG_ON(!domain->pgd);
2042
2043         spin_lock_irqsave(&device_domain_lock, flags);
2044         spin_lock(&iommu->lock);
2045
2046         ret = -ENOMEM;
2047         context = iommu_context_addr(iommu, bus, devfn, 1);
2048         if (!context)
2049                 goto out_unlock;
2050
2051         ret = 0;
2052         if (context_present(context))
2053                 goto out_unlock;
2054
2055         /*
2056          * For kdump cases, old valid entries may be cached due to the
2057          * in-flight DMA and copied pgtable, but there is no unmapping
2058          * behaviour for them, thus we need an explicit cache flush for
2059          * the newly-mapped device. For kdump, at this point, the device
2060          * is supposed to finish reset at its driver probe stage, so no
2061          * in-flight DMA will exist, and we don't need to worry anymore
2062          * hereafter.
2063          */
2064         if (context_copied(context)) {
2065                 u16 did_old = context_domain_id(context);
2066
2067                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap)) {
2068                         iommu->flush.flush_context(iommu, did_old,
2069                                                    (((u16)bus) << 8) | devfn,
2070                                                    DMA_CCMD_MASK_NOBIT,
2071                                                    DMA_CCMD_DEVICE_INVL);
2072                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2073                                                  DMA_TLB_DSI_FLUSH);
2074                 }
2075         }
2076
2077         pgd = domain->pgd;
2078
2079         context_clear_entry(context);
2080         context_set_domain_id(context, did);
2081
2082         /*
2083          * Skip top levels of page tables for iommu which has less agaw
2084          * than default.  Unnecessary for PT mode.
2085          */
2086         if (translation != CONTEXT_TT_PASS_THROUGH) {
2087                 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2088                         ret = -ENOMEM;
2089                         pgd = phys_to_virt(dma_pte_addr(pgd));
2090                         if (!dma_pte_present(pgd))
2091                                 goto out_unlock;
2092                 }
2093
2094                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2095                 if (info && info->ats_supported)
2096                         translation = CONTEXT_TT_DEV_IOTLB;
2097                 else
2098                         translation = CONTEXT_TT_MULTI_LEVEL;
2099
2100                 context_set_address_root(context, virt_to_phys(pgd));
2101                 context_set_address_width(context, agaw);
2102         } else {
2103                 /*
2104                  * In pass through mode, AW must be programmed to
2105                  * indicate the largest AGAW value supported by
2106                  * hardware. And ASR is ignored by hardware.
2107                  */
2108                 context_set_address_width(context, iommu->msagaw);
2109         }
2110
2111         context_set_translation_type(context, translation);
2112         context_set_fault_enable(context);
2113         context_set_present(context);
2114         domain_flush_cache(domain, context, sizeof(*context));
2115
2116         /*
2117          * It's a non-present to present mapping. If hardware doesn't cache
2118          * non-present entry we only need to flush the write-buffer. If the
2119          * _does_ cache non-present entries, then it does so in the special
2120          * domain #0, which we have to flush:
2121          */
2122         if (cap_caching_mode(iommu->cap)) {
2123                 iommu->flush.flush_context(iommu, 0,
2124                                            (((u16)bus) << 8) | devfn,
2125                                            DMA_CCMD_MASK_NOBIT,
2126                                            DMA_CCMD_DEVICE_INVL);
2127                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2128         } else {
2129                 iommu_flush_write_buffer(iommu);
2130         }
2131         iommu_enable_dev_iotlb(info);
2132
2133         ret = 0;
2134
2135 out_unlock:
2136         spin_unlock(&iommu->lock);
2137         spin_unlock_irqrestore(&device_domain_lock, flags);
2138
2139         return ret;
2140 }
2141
/*
 * Argument bundle handed through pci_for_each_dma_alias() to
 * domain_context_mapping_cb().
 */
struct domain_context_mapping_data {
        struct dmar_domain *domain;     /* domain to map the alias into */
        struct intel_iommu *iommu;      /* IOMMU whose context table is used */
};
2146
2147 static int domain_context_mapping_cb(struct pci_dev *pdev,
2148                                      u16 alias, void *opaque)
2149 {
2150         struct domain_context_mapping_data *data = opaque;
2151
2152         return domain_context_mapping_one(data->domain, data->iommu,
2153                                           PCI_BUS_NUM(alias), alias & 0xff);
2154 }
2155
2156 static int
2157 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2158 {
2159         struct intel_iommu *iommu;
2160         u8 bus, devfn;
2161         struct domain_context_mapping_data data;
2162
2163         iommu = device_to_iommu(dev, &bus, &devfn);
2164         if (!iommu)
2165                 return -ENODEV;
2166
2167         if (!dev_is_pci(dev))
2168                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2169
2170         data.domain = domain;
2171         data.iommu = iommu;
2172
2173         return pci_for_each_dma_alias(to_pci_dev(dev),
2174                                       &domain_context_mapping_cb, &data);
2175 }
2176
2177 static int domain_context_mapped_cb(struct pci_dev *pdev,
2178                                     u16 alias, void *opaque)
2179 {
2180         struct intel_iommu *iommu = opaque;
2181
2182         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2183 }
2184
2185 static int domain_context_mapped(struct device *dev)
2186 {
2187         struct intel_iommu *iommu;
2188         u8 bus, devfn;
2189
2190         iommu = device_to_iommu(dev, &bus, &devfn);
2191         if (!iommu)
2192                 return -ENODEV;
2193
2194         if (!dev_is_pci(dev))
2195                 return device_context_mapped(iommu, bus, devfn);
2196
2197         return !pci_for_each_dma_alias(to_pci_dev(dev),
2198                                        domain_context_mapped_cb, iommu);
2199 }
2200
2201 /* Returns a number of VTD pages, but aligned to MM page size */
2202 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2203                                             size_t size)
2204 {
2205         host_addr &= ~PAGE_MASK;
2206         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2207 }
2208
2209 /* Return largest possible superpage level for a given mapping */
2210 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2211                                           unsigned long iov_pfn,
2212                                           unsigned long phy_pfn,
2213                                           unsigned long pages)
2214 {
2215         int support, level = 1;
2216         unsigned long pfnmerge;
2217
2218         support = domain->iommu_superpage;
2219
2220         /* To use a large page, the virtual *and* physical addresses
2221            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2222            of them will mean we have to use smaller pages. So just
2223            merge them and check both at once. */
2224         pfnmerge = iov_pfn | phy_pfn;
2225
2226         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2227                 pages >>= VTD_STRIDE_SHIFT;
2228                 if (!pages)
2229                         break;
2230                 pfnmerge >>= VTD_STRIDE_SHIFT;
2231                 level++;
2232                 support--;
2233         }
2234         return level;
2235 }
2236
/*
 * Install page-table entries in @domain for @nr_pages VT-d pages starting
 * at IOVA page frame @iov_pfn.
 *
 * The physical side comes either from the scatterlist @sg (when non-NULL;
 * each entry's dma_address/dma_length is filled in as it is consumed) or
 * from the contiguous range starting at @phys_pfn (when @sg is NULL).
 * Superpages are used where hardware_largepage_caps() allows it.
 *
 * Returns 0 on success, -EINVAL if @prot grants neither read nor write,
 * or -ENOMEM if a page-table entry could not be obtained.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        phys_addr_t uninitialized_var(pteval);
        unsigned long sg_res = 0;       /* pages left in the current chunk */
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;

        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        /* Only these protection bits are carried into the PTE */
        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

        /* No scatterlist: the whole request is one contiguous chunk */
        if (!sg) {
                sg_res = nr_pages;
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }

        while (nr_pages > 0) {
                uint64_t tmp;

                /* Current chunk exhausted: start on the scatterlist entry */
                if (!sg_res) {
                        unsigned int pgoff = sg->offset & ~PAGE_MASK;

                        sg_res = aligned_nrpages(sg->offset, sg->length);
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
                        sg->dma_length = sg->length;
                        pteval = (sg_phys(sg) - pgoff) | prot;
                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }

                /* (Re)locate the PTE slot and decide on the page size */
                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
                        /* It is a large page */
                        if (largepage_lvl > 1) {
                                unsigned long nr_superpages, end_pfn;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                                nr_superpages = sg_res / lvl_pages;
                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

                                /*
                                 * Ensure that old small page tables are
                                 * removed to make room for superpage(s).
                                 */
                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }

                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        /* Slot was not empty: report it, dump mappings at most 5 times */
                        static int dumps = 5;
                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                BUG_ON(nr_pages < lvl_pages);
                BUG_ON(sg_res < lvl_pages);

                /* Advance all cursors by the number of pages just covered */
                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;
                sg_res -= lvl_pages;

                /* If the next PTE would be the first in a new page, then we
                   need to flush the cache on the entries we've just written.
                   And then we'll need to recalculate 'pte', so clear it and
                   let it get set again in the if (!pte) block above.

                   If we're done (!nr_pages) we need to flush the cache too.

                   Also if we've been setting superpages, we may need to
                   recalculate 'pte' and switch back to smaller pages for the
                   end of the mapping, if the trailing size is not enough to
                   use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }

                /* Chunk consumed but more to map: move to the next sg entry */
                if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
}
2348
/*
 * Map the scatterlist @sg into @domain at @iov_pfn; thin wrapper around
 * __domain_mapping() with no explicit physical pfn.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        const unsigned long no_phys_pfn = 0;

        return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn,
                                nr_pages, prot);
}
2355
2356 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2357                                      unsigned long phys_pfn, unsigned long nr_pages,
2358                                      int prot)
2359 {
2360         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2361 }
2362
2363 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2364 {
2365         if (!iommu)
2366                 return;
2367
2368         clear_context_table(iommu, bus, devfn);
2369         iommu->flush.flush_context(iommu, 0, 0, 0,
2370                                            DMA_CCMD_GLOBAL_INVL);
2371         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2372 }
2373
2374 static inline void unlink_domain_info(struct device_domain_info *info)
2375 {
2376         assert_spin_locked(&device_domain_lock);
2377         list_del(&info->link);
2378         list_del(&info->global);
2379         if (info->dev)
2380                 info->dev->archdata.iommu = NULL;
2381 }
2382
2383 static void domain_remove_dev_info(struct dmar_domain *domain)
2384 {
2385         struct device_domain_info *info, *tmp;
2386         unsigned long flags;
2387
2388         spin_lock_irqsave(&device_domain_lock, flags);
2389         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2390                 __dmar_remove_one_dev_info(info);
2391         spin_unlock_irqrestore(&device_domain_lock, flags);
2392 }
2393
2394 /*
2395  * find_domain
2396  * Note: we use struct device->archdata.iommu stores the info
2397  */
2398 static struct dmar_domain *find_domain(struct device *dev)
2399 {
2400         struct device_domain_info *info;
2401
2402         /* No lock here, assumes no domain exit in normal case */
2403         info = dev->archdata.iommu;
2404         if (info)
2405                 return info->domain;
2406         return NULL;
2407 }
2408
2409 static inline struct device_domain_info *
2410 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2411 {
2412         struct device_domain_info *info;
2413
2414         list_for_each_entry(info, &device_domain_list, global)
2415                 if (info->iommu->segment == segment && info->bus == bus &&
2416                     info->devfn == devfn)
2417                         return info;
2418
2419         return NULL;
2420 }
2421
2422 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2423                                                     int bus, int devfn,
2424                                                     struct device *dev,
2425                                                     struct dmar_domain *domain)
2426 {
2427         struct dmar_domain *found = NULL;
2428         struct device_domain_info *info;
2429         unsigned long flags;
2430         int ret;
2431
2432         info = alloc_devinfo_mem();
2433         if (!info)
2434                 return NULL;
2435
2436         info->bus = bus;
2437         info->devfn = devfn;
2438         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2439         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2440         info->ats_qdep = 0;
2441         info->dev = dev;
2442         info->domain = domain;
2443         info->iommu = iommu;
2444
2445         if (dev && dev_is_pci(dev)) {
2446                 struct pci_dev *pdev = to_pci_dev(info->dev);
2447
2448                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2449                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2450                     dmar_find_matched_atsr_unit(pdev))
2451                         info->ats_supported = 1;
2452
2453                 if (ecs_enabled(iommu)) {
2454                         if (pasid_enabled(iommu)) {
2455                                 int features = pci_pasid_features(pdev);
2456                                 if (features >= 0)
2457                                         info->pasid_supported = features | 1;
2458                         }
2459
2460                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2461                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2462                                 info->pri_supported = 1;
2463                 }
2464         }
2465
2466         spin_lock_irqsave(&device_domain_lock, flags);
2467         if (dev)
2468                 found = find_domain(dev);
2469
2470         if (!found) {
2471                 struct device_domain_info *info2;
2472                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2473                 if (info2) {
2474                         found      = info2->domain;
2475                         info2->dev = dev;
2476                 }
2477         }
2478
2479         if (found) {
2480                 spin_unlock_irqrestore(&device_domain_lock, flags);
2481                 free_devinfo_mem(info);
2482                 /* Caller must free the original domain */
2483                 return found;
2484         }
2485
2486         spin_lock(&iommu->lock);
2487         ret = domain_attach_iommu(domain, iommu);
2488         spin_unlock(&iommu->lock);
2489
2490         if (ret) {
2491                 spin_unlock_irqrestore(&device_domain_lock, flags);
2492                 free_devinfo_mem(info);
2493                 return NULL;
2494         }
2495
2496         list_add(&info->link, &domain->devices);
2497         list_add(&info->global, &device_domain_list);
2498         if (dev)
2499                 dev->archdata.iommu = info;
2500         spin_unlock_irqrestore(&device_domain_lock, flags);
2501
2502         if (dev && domain_context_mapping(domain, dev)) {
2503                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2504                 dmar_remove_one_dev_info(domain, dev);
2505                 return NULL;
2506         }
2507
2508         return domain;
2509 }
2510
2511 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2512 {
2513         *(u16 *)opaque = alias;
2514         return 0;
2515 }
2516
2517 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2518 {
2519         struct device_domain_info *info = NULL;
2520         struct dmar_domain *domain = NULL;
2521         struct intel_iommu *iommu;
2522         u16 req_id, dma_alias;
2523         unsigned long flags;
2524         u8 bus, devfn;
2525
2526         iommu = device_to_iommu(dev, &bus, &devfn);
2527         if (!iommu)
2528                 return NULL;
2529
2530         req_id = ((u16)bus << 8) | devfn;
2531
2532         if (dev_is_pci(dev)) {
2533                 struct pci_dev *pdev = to_pci_dev(dev);
2534
2535                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2536
2537                 spin_lock_irqsave(&device_domain_lock, flags);
2538                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2539                                                       PCI_BUS_NUM(dma_alias),
2540                                                       dma_alias & 0xff);
2541                 if (info) {
2542                         iommu = info->iommu;
2543                         domain = info->domain;
2544                 }
2545                 spin_unlock_irqrestore(&device_domain_lock, flags);
2546
2547                 /* DMA alias already has a domain, use it */
2548                 if (info)
2549                         goto out;
2550         }
2551
2552         /* Allocate and initialize new domain for the device */
2553         domain = alloc_domain(0);
2554         if (!domain)
2555                 return NULL;
2556         if (domain_init(domain, iommu, gaw)) {
2557                 domain_exit(domain);
2558                 return NULL;
2559         }
2560
2561 out:
2562
2563         return domain;
2564 }
2565
2566 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2567                                               struct dmar_domain *domain)
2568 {
2569         struct intel_iommu *iommu;
2570         struct dmar_domain *tmp;
2571         u16 req_id, dma_alias;
2572         u8 bus, devfn;
2573
2574         iommu = device_to_iommu(dev, &bus, &devfn);
2575         if (!iommu)
2576                 return NULL;
2577
2578         req_id = ((u16)bus << 8) | devfn;
2579
2580         if (dev_is_pci(dev)) {
2581                 struct pci_dev *pdev = to_pci_dev(dev);
2582
2583                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2584
2585                 /* register PCI DMA alias device */
2586                 if (req_id != dma_alias) {
2587                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2588                                         dma_alias & 0xff, NULL, domain);
2589
2590                         if (!tmp || tmp != domain)
2591                                 return tmp;
2592                 }
2593         }
2594
2595         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2596         if (!tmp || tmp != domain)
2597                 return tmp;
2598
2599         return domain;
2600 }
2601
2602 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2603 {
2604         struct dmar_domain *domain, *tmp;
2605
2606         domain = find_domain(dev);
2607         if (domain)
2608                 goto out;
2609
2610         domain = find_or_alloc_domain(dev, gaw);
2611         if (!domain)
2612                 goto out;
2613
2614         tmp = set_domain_for_dev(dev, domain);
2615         if (!tmp || domain != tmp) {
2616                 domain_exit(domain);
2617                 domain = tmp;
2618         }
2619
2620 out:
2621
2622         return domain;
2623 }
2624
2625 static int iommu_domain_identity_map(struct dmar_domain *domain,
2626                                      unsigned long long start,
2627                                      unsigned long long end)
2628 {
2629         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2630         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2631
2632         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2633                           dma_to_mm_pfn(last_vpfn))) {
2634                 pr_err("Reserving iova failed\n");
2635                 return -ENOMEM;
2636         }
2637
2638         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2639         /*
2640          * RMRR range might have overlap with physical memory range,
2641          * clear it first
2642          */
2643         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2644
2645         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2646                                   last_vpfn - first_vpfn + 1,
2647                                   DMA_PTE_READ|DMA_PTE_WRITE);
2648 }
2649
2650 static int domain_prepare_identity_map(struct device *dev,
2651                                        struct dmar_domain *domain,
2652                                        unsigned long long start,
2653                                        unsigned long long end)
2654 {
2655         /* For _hardware_ passthrough, don't bother. But for software
2656            passthrough, we do it anyway -- it may indicate a memory
2657            range which is reserved in E820, so which didn't get set
2658            up to start with in si_domain */
2659         if (domain == si_domain && hw_pass_through) {
2660                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2661                         dev_name(dev), start, end);
2662                 return 0;
2663         }
2664
2665         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2666                 dev_name(dev), start, end);
2667
2668         if (end < start) {
2669                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2670                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2671                         dmi_get_system_info(DMI_BIOS_VENDOR),
2672                         dmi_get_system_info(DMI_BIOS_VERSION),
2673                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2674                 return -EIO;
2675         }
2676
2677         if (end >> agaw_to_width(domain->agaw)) {
2678                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2679                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2680                      agaw_to_width(domain->agaw),
2681                      dmi_get_system_info(DMI_BIOS_VENDOR),
2682                      dmi_get_system_info(DMI_BIOS_VERSION),
2683                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2684                 return -EIO;
2685         }
2686
2687         return iommu_domain_identity_map(domain, start, end);
2688 }
2689
2690 static int iommu_prepare_identity_map(struct device *dev,
2691                                       unsigned long long start,
2692                                       unsigned long long end)
2693 {
2694         struct dmar_domain *domain;
2695         int ret;
2696
2697         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2698         if (!domain)
2699                 return -ENOMEM;
2700
2701         ret = domain_prepare_identity_map(dev, domain, start, end);
2702         if (ret)
2703                 domain_exit(domain);
2704
2705         return ret;
2706 }
2707
2708 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2709                                          struct device *dev)
2710 {
2711         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2712                 return 0;
2713         return iommu_prepare_identity_map(dev, rmrr->base_address,
2714                                           rmrr->end_address);
2715 }
2716
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the first ISA bridge
 * found, if any.  A failure is logged but not propagated.
 */
static inline void iommu_prepare_isa(void)
{
        struct pci_dev *lpc;

        lpc = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
        if (!lpc)
                return;

        pr_info("Prepare 0-16MiB unity mapping for LPC\n");

        if (iommu_prepare_identity_map(&lpc->dev, 0, 16*1024*1024 - 1))
                pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

        /* Drop the reference taken by pci_get_class() */
        pci_dev_put(lpc);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2741
2742 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2743
2744 static int __init si_domain_init(int hw)
2745 {
2746         int nid, ret = 0;
2747
2748         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2749         if (!si_domain)
2750                 return -EFAULT;
2751
2752         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2753                 domain_exit(si_domain);
2754                 return -EFAULT;
2755         }
2756
2757         pr_debug("Identity mapping domain allocated\n");
2758
2759         if (hw)
2760                 return 0;
2761
2762         for_each_online_node(nid) {
2763                 unsigned long start_pfn, end_pfn;
2764                 int i;
2765
2766                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2767                         ret = iommu_domain_identity_map(si_domain,
2768                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2769                         if (ret)
2770                                 return ret;
2771                 }
2772         }
2773
2774         return 0;
2775 }
2776
2777 static int identity_mapping(struct device *dev)
2778 {
2779         struct device_domain_info *info;
2780
2781         if (likely(!iommu_identity_mapping))
2782                 return 0;
2783
2784         info = dev->archdata.iommu;
2785         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2786                 return (info->domain == si_domain);
2787
2788         return 0;
2789 }
2790
2791 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2792 {
2793         struct dmar_domain *ndomain;
2794         struct intel_iommu *iommu;
2795         u8 bus, devfn;
2796
2797         iommu = device_to_iommu(dev, &bus, &devfn);
2798         if (!iommu)
2799                 return -ENODEV;
2800
2801         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2802         if (ndomain != domain)
2803                 return -EBUSY;
2804
2805         return 0;
2806 }
2807
2808 static bool device_has_rmrr(struct device *dev)
2809 {
2810         struct dmar_rmrr_unit *rmrr;
2811         struct device *tmp;
2812         int i;
2813
2814         rcu_read_lock();
2815         for_each_rmrr_units(rmrr) {
2816                 /*
2817                  * Return TRUE if this RMRR contains the device that
2818                  * is passed in.
2819                  */
2820                 for_each_active_dev_scope(rmrr->devices,
2821                                           rmrr->devices_cnt, i, tmp)
2822                         if (tmp == dev) {
2823                                 rcu_read_unlock();
2824                                 return true;
2825                         }
2826         }
2827         rcu_read_unlock();
2828         return false;
2829 }
2830
2831 /*
2832  * There are a couple cases where we need to restrict the functionality of
2833  * devices associated with RMRRs.  The first is when evaluating a device for
2834  * identity mapping because problems exist when devices are moved in and out
2835  * of domains and their respective RMRR information is lost.  This means that
2836  * a device with associated RMRRs will never be in a "passthrough" domain.
2837  * The second is use of the device through the IOMMU API.  This interface
2838  * expects to have full control of the IOVA space for the device.  We cannot
2839  * satisfy both the requirement that RMRR access is maintained and have an
2840  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2841  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2842  * We therefore prevent devices associated with an RMRR from participating in
2843  * the IOMMU API, which eliminates them from device assignment.
2844  *
2845  * In both cases we assume that PCI USB devices with RMRRs have them largely
2846  * for historical reasons and that the RMRR space is not actively used post
2847  * boot.  This exclusion may change if vendors begin to abuse it.
2848  *
2849  * The same exception is made for graphics devices, with the requirement that
2850  * any use of the RMRR regions will be torn down before assigning the device
2851  * to a guest.
2852  */
2853 static bool device_is_rmrr_locked(struct device *dev)
2854 {
2855         if (!device_has_rmrr(dev))
2856                 return false;
2857
2858         if (dev_is_pci(dev)) {
2859                 struct pci_dev *pdev = to_pci_dev(dev);
2860
2861                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2862                         return false;
2863         }
2864
2865         return true;
2866 }
2867
2868 static int iommu_should_identity_map(struct device *dev, int startup)
2869 {
2870
2871         if (dev_is_pci(dev)) {
2872                 struct pci_dev *pdev = to_pci_dev(dev);
2873
2874                 if (device_is_rmrr_locked(dev))
2875                         return 0;
2876
2877                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2878                         return 1;
2879
2880                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2881                         return 1;
2882
2883                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2884                         return 0;
2885
2886                 /*
2887                  * We want to start off with all devices in the 1:1 domain, and
2888                  * take them out later if we find they can't access all of memory.
2889                  *
2890                  * However, we can't do this for PCI devices behind bridges,
2891                  * because all PCI devices behind the same bridge will end up
2892                  * with the same source-id on their transactions.
2893                  *
2894                  * Practically speaking, we can't change things around for these
2895                  * devices at run-time, because we can't be sure there'll be no
2896                  * DMA transactions in flight for any of their siblings.
2897                  *
2898                  * So PCI devices (unless they're on the root bus) as well as
2899                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2900                  * the 1:1 domain, just in _case_ one of their siblings turns out
2901                  * not to be able to map all of memory.
2902                  */
2903                 if (!pci_is_pcie(pdev)) {
2904                         if (!pci_is_root_bus(pdev->bus))
2905                                 return 0;
2906                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2907                                 return 0;
2908                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2909                         return 0;
2910         } else {
2911                 if (device_has_rmrr(dev))
2912                         return 0;
2913         }
2914
2915         /*
2916          * At boot time, we don't yet know if devices will be 64-bit capable.
2917          * Assume that they will — if they turn out not to be, then we can
2918          * take them out of the 1:1 domain later.
2919          */
2920         if (!startup) {
2921                 /*
2922                  * If the device's dma_mask is less than the system's memory
2923                  * size then this is not a candidate for identity mapping.
2924                  */
2925                 u64 dma_mask = *dev->dma_mask;
2926
2927                 if (dev->coherent_dma_mask &&
2928                     dev->coherent_dma_mask < dma_mask)
2929                         dma_mask = dev->coherent_dma_mask;
2930
2931                 return dma_mask >= dma_get_required_mask(dev);
2932         }
2933
2934         return 1;
2935 }
2936
2937 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2938 {
2939         int ret;
2940
2941         if (!iommu_should_identity_map(dev, 1))
2942                 return 0;
2943
2944         ret = domain_add_dev_info(si_domain, dev);
2945         if (!ret)
2946                 pr_info("%s identity mapping for device %s\n",
2947                         hw ? "Hardware" : "Software", dev_name(dev));
2948         else if (ret == -ENODEV)
2949                 /* device not associated with an iommu */
2950                 ret = 0;
2951
2952         return ret;
2953 }
2954
2955
2956 static int __init iommu_prepare_static_identity_mapping(int hw)
2957 {
2958         struct pci_dev *pdev = NULL;
2959         struct dmar_drhd_unit *drhd;
2960         struct intel_iommu *iommu;
2961         struct device *dev;
2962         int i;
2963         int ret = 0;
2964
2965         for_each_pci_dev(pdev) {
2966                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2967                 if (ret)
2968                         return ret;
2969         }
2970
2971         for_each_active_iommu(iommu, drhd)
2972                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2973                         struct acpi_device_physical_node *pn;
2974                         struct acpi_device *adev;
2975
2976                         if (dev->bus != &acpi_bus_type)
2977                                 continue;
2978
2979                         adev= to_acpi_device(dev);
2980                         mutex_lock(&adev->physical_node_lock);
2981                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2982                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2983                                 if (ret)
2984                                         break;
2985                         }
2986                         mutex_unlock(&adev->physical_node_lock);
2987                         if (ret)
2988                                 return ret;
2989                 }
2990
2991         return 0;
2992 }
2993
2994 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2995 {
2996         /*
2997          * Start from the sane iommu hardware state.
2998          * If the queued invalidation is already initialized by us
2999          * (for example, while enabling interrupt-remapping) then
3000          * we got the things already rolling from a sane state.
3001          */
3002         if (!iommu->qi) {
3003                 /*
3004                  * Clear any previous faults.
3005                  */
3006                 dmar_fault(-1, iommu);
3007                 /*
3008                  * Disable queued invalidation if supported and already enabled
3009                  * before OS handover.
3010                  */
3011                 dmar_disable_qi(iommu);
3012         }
3013
3014         if (dmar_enable_qi(iommu)) {
3015                 /*
3016                  * Queued Invalidate not enabled, use Register Based Invalidate
3017                  */
3018                 iommu->flush.flush_context = __iommu_flush_context;
3019                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3020                 pr_info("%s: Using Register based invalidation\n",
3021                         iommu->name);
3022         } else {
3023                 iommu->flush.flush_context = qi_flush_context;
3024                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3025                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3026         }
3027 }
3028
3029 static int copy_context_table(struct intel_iommu *iommu,
3030                               struct root_entry *old_re,
3031                               struct context_entry **tbl,
3032                               int bus, bool ext)
3033 {
3034         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3035         struct context_entry *new_ce = NULL, ce;
3036         struct context_entry *old_ce = NULL;
3037         struct root_entry re;
3038         phys_addr_t old_ce_phys;
3039
3040         tbl_idx = ext ? bus * 2 : bus;
3041         memcpy(&re, old_re, sizeof(re));
3042
3043         for (devfn = 0; devfn < 256; devfn++) {
3044                 /* First calculate the correct index */
3045                 idx = (ext ? devfn * 2 : devfn) % 256;
3046
3047                 if (idx == 0) {
3048                         /* First save what we may have and clean up */
3049                         if (new_ce) {
3050                                 tbl[tbl_idx] = new_ce;
3051                                 __iommu_flush_cache(iommu, new_ce,
3052                                                     VTD_PAGE_SIZE);
3053                                 pos = 1;
3054                         }
3055
3056                         if (old_ce)
3057                                 memunmap(old_ce);
3058
3059                         ret = 0;
3060                         if (devfn < 0x80)
3061                                 old_ce_phys = root_entry_lctp(&re);
3062                         else
3063                                 old_ce_phys = root_entry_uctp(&re);
3064
3065                         if (!old_ce_phys) {
3066                                 if (ext && devfn == 0) {
3067                                         /* No LCTP, try UCTP */
3068                                         devfn = 0x7f;
3069                                         continue;
3070                                 } else {
3071                                         goto out;
3072                                 }
3073                         }
3074
3075                         ret = -ENOMEM;
3076                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3077                                         MEMREMAP_WB);
3078                         if (!old_ce)
3079                                 goto out;
3080
3081                         new_ce = alloc_pgtable_page(iommu->node);
3082                         if (!new_ce)
3083                                 goto out_unmap;
3084
3085                         ret = 0;
3086                 }
3087
3088                 /* Now copy the context entry */
3089                 memcpy(&ce, old_ce + idx, sizeof(ce));
3090
3091                 if (!__context_present(&ce))
3092                         continue;
3093
3094                 did = context_domain_id(&ce);
3095                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3096                         set_bit(did, iommu->domain_ids);
3097
3098                 /*
3099                  * We need a marker for copied context entries. This
3100                  * marker needs to work for the old format as well as
3101                  * for extended context entries.
3102                  *
3103                  * Bit 67 of the context entry is used. In the old
3104                  * format this bit is available to software, in the
3105                  * extended format it is the PGE bit, but PGE is ignored
3106                  * by HW if PASIDs are disabled (and thus still
3107                  * available).
3108                  *
3109                  * So disable PASIDs first and then mark the entry
3110                  * copied. This means that we don't copy PASID
3111                  * translations from the old kernel, but this is fine as
3112                  * faults there are not fatal.
3113                  */
3114                 context_clear_pasid_enable(&ce);
3115                 context_set_copied(&ce);
3116
3117                 new_ce[idx] = ce;
3118         }
3119
3120         tbl[tbl_idx + pos] = new_ce;
3121
3122         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3123
3124 out_unmap:
3125         memunmap(old_ce);
3126
3127 out:
3128         return ret;
3129 }
3130
3131 static int copy_translation_tables(struct intel_iommu *iommu)
3132 {
3133         struct context_entry **ctxt_tbls;
3134         struct root_entry *old_rt;
3135         phys_addr_t old_rt_phys;
3136         int ctxt_table_entries;
3137         unsigned long flags;
3138         u64 rtaddr_reg;
3139         int bus, ret;
3140         bool new_ext, ext;
3141
3142         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3143         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3144         new_ext    = !!ecap_ecs(iommu->ecap);
3145
3146         /*
3147          * The RTT bit can only be changed when translation is disabled,
3148          * but disabling translation means to open a window for data
3149          * corruption. So bail out and don't copy anything if we would
3150          * have to change the bit.
3151          */
3152         if (new_ext != ext)
3153                 return -EINVAL;
3154
3155         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3156         if (!old_rt_phys)
3157                 return -EINVAL;
3158
3159         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3160         if (!old_rt)
3161                 return -ENOMEM;
3162
3163         /* This is too big for the stack - allocate it from slab */
3164         ctxt_table_entries = ext ? 512 : 256;
3165         ret = -ENOMEM;
3166         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3167         if (!ctxt_tbls)
3168                 goto out_unmap;
3169
3170         for (bus = 0; bus < 256; bus++) {
3171                 ret = copy_context_table(iommu, &old_rt[bus],
3172                                          ctxt_tbls, bus, ext);
3173                 if (ret) {
3174                         pr_err("%s: Failed to copy context table for bus %d\n",
3175                                 iommu->name, bus);
3176                         continue;
3177                 }
3178         }
3179
3180         spin_lock_irqsave(&iommu->lock, flags);
3181
3182         /* Context tables are copied, now write them to the root_entry table */
3183         for (bus = 0; bus < 256; bus++) {
3184                 int idx = ext ? bus * 2 : bus;
3185                 u64 val;
3186
3187                 if (ctxt_tbls[idx]) {
3188                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3189                         iommu->root_entry[bus].lo = val;
3190                 }
3191
3192                 if (!ext || !ctxt_tbls[idx + 1])
3193                         continue;
3194
3195                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3196                 iommu->root_entry[bus].hi = val;
3197         }
3198
3199         spin_unlock_irqrestore(&iommu->lock, flags);
3200
3201         kfree(ctxt_tbls);
3202
3203         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3204
3205         ret = 0;
3206
3207 out_unmap:
3208         memunmap(old_rt);
3209
3210         return ret;
3211 }
3212
3213 static int __init init_dmars(void)
3214 {
3215         struct dmar_drhd_unit *drhd;
3216         struct dmar_rmrr_unit *rmrr;
3217         bool copied_tables = false;
3218         struct device *dev;
3219         struct intel_iommu *iommu;
3220         int i, ret, cpu;
3221
3222         /*
3223          * for each drhd
3224          *    allocate root
3225          *    initialize and program root entry to not present
3226          * endfor
3227          */
3228         for_each_drhd_unit(drhd) {
3229                 /*
3230                  * lock not needed as this is only incremented in the single
3231                  * threaded kernel __init code path all other access are read
3232                  * only
3233                  */
3234                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3235                         g_num_of_iommus++;
3236                         continue;
3237                 }
3238                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3239         }
3240
3241         /* Preallocate enough resources for IOMMU hot-addition */
3242         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3243                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3244
3245         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3246                         GFP_KERNEL);
3247         if (!g_iommus) {
3248                 pr_err("Allocating global iommu array failed\n");
3249                 ret = -ENOMEM;
3250                 goto error;
3251         }
3252
3253         for_each_possible_cpu(cpu) {
3254                 struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
3255                                                               cpu);
3256
3257                 dfd->tables = kzalloc(g_num_of_iommus *
3258                                       sizeof(struct deferred_flush_table),
3259                                       GFP_KERNEL);
3260                 if (!dfd->tables) {
3261                         ret = -ENOMEM;
3262                         goto free_g_iommus;
3263                 }
3264
3265                 spin_lock_init(&dfd->lock);
3266                 setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
3267         }
3268
3269         for_each_active_iommu(iommu, drhd) {
3270                 g_iommus[iommu->seq_id] = iommu;
3271
3272                 intel_iommu_init_qi(iommu);
3273
3274                 ret = iommu_init_domains(iommu);
3275                 if (ret)
3276                         goto free_iommu;
3277
3278                 init_translation_status(iommu);
3279
3280                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3281                         iommu_disable_translation(iommu);
3282                         clear_translation_pre_enabled(iommu);
3283                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3284                                 iommu->name);
3285                 }
3286
3287                 /*
3288                  * TBD:
3289                  * we could share the same root & context tables
3290                  * among all IOMMU's. Need to Split it later.
3291                  */
3292                 ret = iommu_alloc_root_entry(iommu);
3293                 if (ret)
3294                         goto free_iommu;
3295
3296                 if (translation_pre_enabled(iommu)) {
3297                         pr_info("Translation already enabled - trying to copy translation structures\n");
3298
3299                         ret = copy_translation_tables(iommu);
3300                         if (ret) {
3301                                 /*
3302                                  * We found the IOMMU with translation
3303                                  * enabled - but failed to copy over the
3304                                  * old root-entry table. Try to proceed
3305                                  * by disabling translation now and
3306                                  * allocating a clean root-entry table.
3307                                  * This might cause DMAR faults, but
3308                                  * probably the dump will still succeed.
3309                                  */
3310                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3311                                        iommu->name);
3312                                 iommu_disable_translation(iommu);
3313                                 clear_translation_pre_enabled(iommu);
3314                         } else {
3315                                 pr_info("Copied translation tables from previous kernel for %s\n",
3316                                         iommu->name);
3317                                 copied_tables = true;
3318                         }
3319                 }
3320
3321                 if (!ecap_pass_through(iommu->ecap))
3322                         hw_pass_through = 0;
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324                 if (pasid_enabled(iommu))
3325                         intel_svm_alloc_pasid_tables(iommu);
3326 #endif
3327         }
3328
3329         /*
3330          * Now that qi is enabled on all iommus, set the root entry and flush
3331          * caches. This is required on some Intel X58 chipsets, otherwise the
3332          * flush_context function will loop forever and the boot hangs.
3333          */
3334         for_each_active_iommu(iommu, drhd) {
3335                 iommu_flush_write_buffer(iommu);
3336                 iommu_set_root_entry(iommu);
3337                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3338                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3339         }
3340
3341         if (iommu_pass_through)
3342                 iommu_identity_mapping |= IDENTMAP_ALL;
3343
3344 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3345         iommu_identity_mapping |= IDENTMAP_GFX;
3346 #endif
3347
3348         check_tylersburg_isoch();
3349
3350         if (iommu_identity_mapping) {
3351                 ret = si_domain_init(hw_pass_through);
3352                 if (ret)
3353                         goto free_iommu;
3354         }
3355
3356
3357         /*
3358          * If we copied translations from a previous kernel in the kdump
3359          * case, we can not assign the devices to domains now, as that
3360          * would eliminate the old mappings. So skip this part and defer
3361          * the assignment to device driver initialization time.
3362          */
3363         if (copied_tables)
3364                 goto domains_done;
3365
3366         /*
3367          * If pass through is not set or not enabled, setup context entries for
3368          * identity mappings for rmrr, gfx, and isa and may fall back to static
3369          * identity mapping if iommu_identity_mapping is set.
3370          */
3371         if (iommu_identity_mapping) {
3372                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3373                 if (ret) {
3374                         pr_crit("Failed to setup IOMMU pass-through\n");
3375                         goto free_iommu;
3376                 }
3377         }
3378         /*
3379          * For each rmrr
3380          *   for each dev attached to rmrr
3381          *   do
3382          *     locate drhd for dev, alloc domain for dev
3383          *     allocate free domain
3384          *     allocate page table entries for rmrr
3385          *     if context not allocated for bus
3386          *           allocate and init context
3387          *           set present in root table for this bus
3388          *     init context with domain, translation etc
3389          *    endfor
3390          * endfor
3391          */
3392         pr_info("Setting RMRR:\n");
3393         for_each_rmrr_units(rmrr) {
3394                 /* some BIOS lists non-exist devices in DMAR table. */
3395                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3396                                           i, dev) {
3397                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3398                         if (ret)
3399                                 pr_err("Mapping reserved region failed\n");
3400                 }
3401         }
3402
3403         iommu_prepare_isa();
3404
3405 domains_done:
3406
3407         /*
3408          * for each drhd
3409          *   enable fault log
3410          *   global invalidate context cache
3411          *   global invalidate iotlb
3412          *   enable translation
3413          */
3414         for_each_iommu(iommu, drhd) {
3415                 if (drhd->ignored) {
3416                         /*
3417                          * we always have to disable PMRs or DMA may fail on
3418                          * this device
3419                          */
3420                         if (force_on)
3421                                 iommu_disable_protect_mem_regions(iommu);
3422                         continue;
3423                 }
3424
3425                 iommu_flush_write_buffer(iommu);
3426
3427 #ifdef CONFIG_INTEL_IOMMU_SVM
3428                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3429                         ret = intel_svm_enable_prq(iommu);
3430                         if (ret)
3431                                 goto free_iommu;
3432                 }
3433 #endif
3434                 ret = dmar_set_interrupt(iommu);
3435                 if (ret)
3436                         goto free_iommu;
3437
3438                 if (!translation_pre_enabled(iommu))
3439                         iommu_enable_translation(iommu);
3440
3441                 iommu_disable_protect_mem_regions(iommu);
3442         }
3443
3444         return 0;
3445
3446 free_iommu:
3447         for_each_active_iommu(iommu, drhd) {
3448                 disable_dmar_iommu(iommu);
3449                 free_dmar_iommu(iommu);
3450         }
3451 free_g_iommus:
3452         for_each_possible_cpu(cpu)
3453                 kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
3454         kfree(g_iommus);
3455 error:
3456         return ret;
3457 }
3458
3459 /* This takes a number of _MM_ pages, not VTD pages */
3460 static unsigned long intel_alloc_iova(struct device *dev,
3461                                      struct dmar_domain *domain,
3462                                      unsigned long nrpages, uint64_t dma_mask)
3463 {
3464         unsigned long iova_pfn = 0;
3465
3466         /* Restrict dma_mask to the width that the iommu can handle */
3467         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3468         /* Ensure we reserve the whole size-aligned region */
3469         nrpages = __roundup_pow_of_two(nrpages);
3470
3471         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3472                 /*
3473                  * First try to allocate an io virtual address in
3474                  * DMA_BIT_MASK(32) and if that fails then try allocating
3475                  * from higher range
3476                  */
3477                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3478                                            IOVA_PFN(DMA_BIT_MASK(32)));
3479                 if (iova_pfn)
3480                         return iova_pfn;
3481         }
3482         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3483         if (unlikely(!iova_pfn)) {
3484                 pr_err("Allocating %ld-page iova for %s failed",
3485                        nrpages, dev_name(dev));
3486                 return 0;
3487         }
3488
3489         return iova_pfn;
3490 }
3491
3492 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3493 {
3494         struct dmar_domain *domain, *tmp;
3495         struct dmar_rmrr_unit *rmrr;
3496         struct device *i_dev;
3497         int i, ret;
3498
3499         domain = find_domain(dev);
3500         if (domain)
3501                 goto out;
3502
3503         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3504         if (!domain)
3505                 goto out;
3506
3507         /* We have a new domain - setup possible RMRRs for the device */
3508         rcu_read_lock();
3509         for_each_rmrr_units(rmrr) {
3510                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3511                                           i, i_dev) {
3512                         if (i_dev != dev)
3513                                 continue;
3514
3515                         ret = domain_prepare_identity_map(dev, domain,
3516                                                           rmrr->base_address,
3517                                                           rmrr->end_address);
3518                         if (ret)
3519                                 dev_err(dev, "Mapping reserved region failed\n");
3520                 }
3521         }
3522         rcu_read_unlock();
3523
3524         tmp = set_domain_for_dev(dev, domain);
3525         if (!tmp || domain != tmp) {
3526                 domain_exit(domain);
3527                 domain = tmp;
3528         }
3529
3530 out:
3531
3532         if (!domain)
3533                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3534
3535
3536         return domain;
3537 }
3538
3539 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3540 {
3541         struct device_domain_info *info;
3542
3543         /* No lock here, assumes no domain exit in normal case */
3544         info = dev->archdata.iommu;
3545         if (likely(info))
3546                 return info->domain;
3547
3548         return __get_valid_domain_for_dev(dev);
3549 }
3550
3551 /* Check if the dev needs to go through non-identity map and unmap process.*/
3552 static int iommu_no_mapping(struct device *dev)
3553 {
3554         int found;
3555
3556         if (iommu_dummy(dev))
3557                 return 1;
3558
3559         if (!iommu_identity_mapping)
3560                 return 0;
3561
3562         found = identity_mapping(dev);
3563         if (found) {
3564                 if (iommu_should_identity_map(dev, 0))
3565                         return 1;
3566                 else {
3567                         /*
3568                          * 32 bit DMA is removed from si_domain and fall back
3569                          * to non-identity mapping.
3570                          */
3571                         dmar_remove_one_dev_info(si_domain, dev);
3572                         pr_info("32bit %s uses non-identity mapping\n",
3573                                 dev_name(dev));
3574                         return 0;
3575                 }
3576         } else {
3577                 /*
3578                  * In case of a detached 64 bit DMA device from vm, the device
3579                  * is put into si_domain for identity mapping.
3580                  */
3581                 if (iommu_should_identity_map(dev, 0)) {
3582                         int ret;
3583                         ret = domain_add_dev_info(si_domain, dev);
3584                         if (!ret) {
3585                                 pr_info("64bit %s uses identity mapping\n",
3586                                         dev_name(dev));
3587                                 return 1;
3588                         }
3589                 }
3590         }
3591
3592         return 0;
3593 }
3594
3595 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3596                                      size_t size, int dir, u64 dma_mask)
3597 {
3598         struct dmar_domain *domain;
3599         phys_addr_t start_paddr;
3600         unsigned long iova_pfn;
3601         int prot = 0;
3602         int ret;
3603         struct intel_iommu *iommu;
3604         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3605
3606         BUG_ON(dir == DMA_NONE);
3607
3608         if (iommu_no_mapping(dev))
3609                 return paddr;
3610
3611         domain = get_valid_domain_for_dev(dev);
3612         if (!domain)
3613                 return 0;
3614
3615         iommu = domain_get_iommu(domain);
3616         size = aligned_nrpages(paddr, size);
3617
3618         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3619         if (!iova_pfn)
3620                 goto error;
3621
3622         /*
3623          * Check if DMAR supports zero-length reads on write only
3624          * mappings..
3625          */
3626         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3627                         !cap_zlr(iommu->cap))
3628                 prot |= DMA_PTE_READ;
3629         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3630                 prot |= DMA_PTE_WRITE;
3631         /*
3632          * paddr - (paddr + size) might be partial page, we should map the whole
3633          * page.  Note: if two part of one page are separately mapped, we
3634          * might have two guest_addr mapping to the same host paddr, but this
3635          * is not a big problem
3636          */
3637         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3638                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3639         if (ret)
3640                 goto error;
3641
3642         /* it's a non-present to present mapping. Only flush if caching mode */
3643         if (cap_caching_mode(iommu->cap))
3644                 iommu_flush_iotlb_psi(iommu, domain,
3645                                       mm_to_dma_pfn(iova_pfn),
3646                                       size, 0, 1);
3647         else
3648                 iommu_flush_write_buffer(iommu);
3649
3650         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3651         start_paddr += paddr & ~PAGE_MASK;
3652         return start_paddr;
3653
3654 error:
3655         if (iova_pfn)
3656                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3657         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3658                 dev_name(dev), size, (unsigned long long)paddr, dir);
3659         return 0;
3660 }
3661
3662 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3663                                  unsigned long offset, size_t size,
3664                                  enum dma_data_direction dir,
3665                                  unsigned long attrs)
3666 {
3667         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3668                                   dir, *dev->dma_mask);
3669 }
3670
3671 static void flush_unmaps(struct deferred_flush_data *flush_data)
3672 {
3673         int i, j;
3674
3675         flush_data->timer_on = 0;
3676
3677         /* just flush them all */
3678         for (i = 0; i < g_num_of_iommus; i++) {
3679                 struct intel_iommu *iommu = g_iommus[i];
3680                 struct deferred_flush_table *flush_table =
3681                                 &flush_data->tables[i];
3682                 if (!iommu)
3683                         continue;
3684
3685                 if (!flush_table->next)
3686                         continue;
3687
3688                 /* In caching mode, global flushes turn emulation expensive */
3689                 if (!cap_caching_mode(iommu->cap))
3690                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3691                                          DMA_TLB_GLOBAL_FLUSH);
3692                 for (j = 0; j < flush_table->next; j++) {
3693                         unsigned long mask;
3694                         struct deferred_flush_entry *entry =
3695                                                 &flush_table->entries[j];
3696                         unsigned long iova_pfn = entry->iova_pfn;
3697                         unsigned long nrpages = entry->nrpages;
3698                         struct dmar_domain *domain = entry->domain;
3699                         struct page *freelist = entry->freelist;
3700
3701                         /* On real hardware multiple invalidations are expensive */
3702                         if (cap_caching_mode(iommu->cap))
3703                                 iommu_flush_iotlb_psi(iommu, domain,
3704                                         mm_to_dma_pfn(iova_pfn),
3705                                         nrpages, !freelist, 0);
3706                         else {
3707                                 mask = ilog2(nrpages);
3708                                 iommu_flush_dev_iotlb(domain,
3709                                                 (uint64_t)iova_pfn << PAGE_SHIFT, mask);
3710                         }
3711                         free_iova_fast(&domain->iovad, iova_pfn, nrpages);
3712                         if (freelist)
3713                                 dma_free_pagelist(freelist);
3714                 }
3715                 flush_table->next = 0;
3716         }
3717
3718         flush_data->size = 0;
3719 }
3720
3721 static void flush_unmaps_timeout(unsigned long cpuid)
3722 {
3723         struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3724         unsigned long flags;
3725
3726         spin_lock_irqsave(&flush_data->lock, flags);
3727         flush_unmaps(flush_data);
3728         spin_unlock_irqrestore(&flush_data->lock, flags);
3729 }
3730
3731 static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
3732                       unsigned long nrpages, struct page *freelist)
3733 {
3734         unsigned long flags;
3735         int entry_id, iommu_id;
3736         struct intel_iommu *iommu;
3737         struct deferred_flush_entry *entry;
3738         struct deferred_flush_data *flush_data;
3739         unsigned int cpuid;
3740
3741         cpuid = get_cpu();
3742         flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3743
3744         /* Flush all CPUs' entries to avoid deferring too much.  If
3745          * this becomes a bottleneck, can just flush us, and rely on
3746          * flush timer for the rest.
3747          */
3748         if (flush_data->size == HIGH_WATER_MARK) {
3749                 int cpu;
3750
3751                 for_each_online_cpu(cpu)
3752                         flush_unmaps_timeout(cpu);
3753         }
3754
3755         spin_lock_irqsave(&flush_data->lock, flags);
3756
3757         iommu = domain_get_iommu(dom);
3758         iommu_id = iommu->seq_id;
3759
3760         entry_id = flush_data->tables[iommu_id].next;
3761         ++(flush_data->tables[iommu_id].next);
3762
3763         entry = &flush_data->tables[iommu_id].entries[entry_id];
3764         entry->domain = dom;
3765         entry->iova_pfn = iova_pfn;
3766         entry->nrpages = nrpages;
3767         entry->freelist = freelist;
3768
3769         if (!flush_data->timer_on) {
3770                 mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
3771                 flush_data->timer_on = 1;
3772         }
3773         flush_data->size++;
3774         spin_unlock_irqrestore(&flush_data->lock, flags);
3775
3776         put_cpu();
3777 }
3778
3779 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3780 {
3781         struct dmar_domain *domain;
3782         unsigned long start_pfn, last_pfn;
3783         unsigned long nrpages;
3784         unsigned long iova_pfn;
3785         struct intel_iommu *iommu;
3786         struct page *freelist;
3787
3788         if (iommu_no_mapping(dev))
3789                 return;
3790
3791         domain = find_domain(dev);
3792         BUG_ON(!domain);
3793
3794         iommu = domain_get_iommu(domain);
3795
3796         iova_pfn = IOVA_PFN(dev_addr);
3797
3798         nrpages = aligned_nrpages(dev_addr, size);
3799         start_pfn = mm_to_dma_pfn(iova_pfn);
3800         last_pfn = start_pfn + nrpages - 1;
3801
3802         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3803                  dev_name(dev), start_pfn, last_pfn);
3804
3805         freelist = domain_unmap(domain, start_pfn, last_pfn);
3806
3807         if (intel_iommu_strict) {
3808                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3809                                       nrpages, !freelist, 0);
3810                 /* free iova */
3811                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3812                 dma_free_pagelist(freelist);
3813         } else {
3814                 add_unmap(domain, iova_pfn, nrpages, freelist);
3815                 /*
3816                  * queue up the release of the unmap to save the 1/6th of the
3817                  * cpu used up by the iotlb flush operation...
3818                  */
3819         }
3820 }
3821
3822 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3823                              size_t size, enum dma_data_direction dir,
3824                              unsigned long attrs)
3825 {
3826         intel_unmap(dev, dev_addr, size);
3827 }
3828
3829 static void *intel_alloc_coherent(struct device *dev, size_t size,
3830                                   dma_addr_t *dma_handle, gfp_t flags,
3831                                   unsigned long attrs)
3832 {
3833         struct page *page = NULL;
3834         int order;
3835
3836         size = PAGE_ALIGN(size);
3837         order = get_order(size);
3838
3839         if (!iommu_no_mapping(dev))
3840                 flags &= ~(GFP_DMA | GFP_DMA32);
3841         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3842                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3843                         flags |= GFP_DMA;
3844                 else
3845                         flags |= GFP_DMA32;
3846         }
3847
3848         if (gfpflags_allow_blocking(flags)) {
3849                 unsigned int count = size >> PAGE_SHIFT;
3850
3851                 page = dma_alloc_from_contiguous(dev, count, order);
3852                 if (page && iommu_no_mapping(dev) &&
3853                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3854                         dma_release_from_contiguous(dev, page, count);
3855                         page = NULL;
3856                 }
3857         }
3858
3859         if (!page)
3860                 page = alloc_pages(flags, order);
3861         if (!page)
3862                 return NULL;
3863         memset(page_address(page), 0, size);
3864
3865         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3866                                          DMA_BIDIRECTIONAL,
3867                                          dev->coherent_dma_mask);
3868         if (*dma_handle)
3869                 return page_address(page);
3870         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3871                 __free_pages(page, order);
3872
3873         return NULL;
3874 }
3875
3876 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3877                                 dma_addr_t dma_handle, unsigned long attrs)
3878 {
3879         int order;
3880         struct page *page = virt_to_page(vaddr);
3881
3882         size = PAGE_ALIGN(size);
3883         order = get_order(size);
3884
3885         intel_unmap(dev, dma_handle, size);
3886         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3887                 __free_pages(page, order);
3888 }
3889
3890 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3891                            int nelems, enum dma_data_direction dir,
3892                            unsigned long attrs)
3893 {
3894         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3895         unsigned long nrpages = 0;
3896         struct scatterlist *sg;
3897         int i;
3898
3899         for_each_sg(sglist, sg, nelems, i) {
3900                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3901         }
3902
3903         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3904 }
3905
3906 static int intel_nontranslate_map_sg(struct device *hddev,
3907         struct scatterlist *sglist, int nelems, int dir)
3908 {
3909         int i;
3910         struct scatterlist *sg;
3911
3912         for_each_sg(sglist, sg, nelems, i) {
3913                 BUG_ON(!sg_page(sg));
3914                 sg->dma_address = sg_phys(sg);
3915                 sg->dma_length = sg->length;
3916         }
3917         return nelems;
3918 }
3919
3920 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3921                         enum dma_data_direction dir, unsigned long attrs)
3922 {
3923         int i;
3924         struct dmar_domain *domain;
3925         size_t size = 0;
3926         int prot = 0;
3927         unsigned long iova_pfn;
3928         int ret;
3929         struct scatterlist *sg;
3930         unsigned long start_vpfn;
3931         struct intel_iommu *iommu;
3932
3933         BUG_ON(dir == DMA_NONE);
3934         if (iommu_no_mapping(dev))
3935                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3936
3937         domain = get_valid_domain_for_dev(dev);
3938         if (!domain)
3939                 return 0;
3940
3941         iommu = domain_get_iommu(domain);
3942
3943         for_each_sg(sglist, sg, nelems, i)
3944                 size += aligned_nrpages(sg->offset, sg->length);
3945
3946         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3947                                 *dev->dma_mask);
3948         if (!iova_pfn) {
3949                 sglist->dma_length = 0;
3950                 return 0;
3951         }
3952
3953         /*
3954          * Check if DMAR supports zero-length reads on write only
3955          * mappings..
3956          */
3957         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3958                         !cap_zlr(iommu->cap))
3959                 prot |= DMA_PTE_READ;
3960         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3961                 prot |= DMA_PTE_WRITE;
3962
3963         start_vpfn = mm_to_dma_pfn(iova_pfn);
3964
3965         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3966         if (unlikely(ret)) {
3967                 dma_pte_free_pagetable(domain, start_vpfn,
3968                                        start_vpfn + size - 1);
3969                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3970                 return 0;
3971         }
3972
3973         /* it's a non-present to present mapping. Only flush if caching mode */
3974         if (cap_caching_mode(iommu->cap))
3975                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3976         else
3977                 iommu_flush_write_buffer(iommu);
3978
3979         return nelems;
3980 }
3981
3982 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3983 {
3984         return !dma_addr;
3985 }
3986
3987 struct dma_map_ops intel_dma_ops = {
3988         .alloc = intel_alloc_coherent,
3989         .free = intel_free_coherent,
3990         .map_sg = intel_map_sg,
3991         .unmap_sg = intel_unmap_sg,
3992         .map_page = intel_map_page,
3993         .unmap_page = intel_unmap_page,
3994         .mapping_error = intel_mapping_error,
3995 };
3996
3997 static inline int iommu_domain_cache_init(void)
3998 {
3999         int ret = 0;
4000
4001         iommu_domain_cache = kmem_cache_create("iommu_domain",
4002                                          sizeof(struct dmar_domain),
4003                                          0,
4004                                          SLAB_HWCACHE_ALIGN,
4005
4006                                          NULL);
4007         if (!iommu_domain_cache) {
4008                 pr_err("Couldn't create iommu_domain cache\n");
4009                 ret = -ENOMEM;
4010         }
4011
4012         return ret;
4013 }
4014
4015 static inline int iommu_devinfo_cache_init(void)
4016 {
4017         int ret = 0;
4018
4019         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4020                                          sizeof(struct device_domain_info),
4021                                          0,
4022                                          SLAB_HWCACHE_ALIGN,
4023                                          NULL);
4024         if (!iommu_devinfo_cache) {
4025                 pr_err("Couldn't create devinfo cache\n");
4026                 ret = -ENOMEM;
4027         }
4028
4029         return ret;
4030 }
4031
4032 static int __init iommu_init_mempool(void)
4033 {
4034         int ret;
4035         ret = iova_cache_get();
4036         if (ret)
4037                 return ret;
4038
4039         ret = iommu_domain_cache_init();
4040         if (ret)
4041                 goto domain_error;
4042
4043         ret = iommu_devinfo_cache_init();
4044         if (!ret)
4045                 return ret;
4046
4047         kmem_cache_destroy(iommu_domain_cache);
4048 domain_error:
4049         iova_cache_put();
4050
4051         return -ENOMEM;
4052 }
4053
4054 static void __init iommu_exit_mempool(void)
4055 {
4056         kmem_cache_destroy(iommu_devinfo_cache);
4057         kmem_cache_destroy(iommu_domain_cache);
4058         iova_cache_put();
4059 }
4060
4061 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4062 {
4063         struct dmar_drhd_unit *drhd;
4064         u32 vtbar;
4065         int rc;
4066
4067         /* We know that this device on this chipset has its own IOMMU.
4068          * If we find it under a different IOMMU, then the BIOS is lying
4069          * to us. Hope that the IOMMU for this device is actually
4070          * disabled, and it needs no translation...
4071          */
4072         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4073         if (rc) {
4074                 /* "can't" happen */
4075                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4076                 return;
4077         }
4078         vtbar &= 0xffff0000;
4079
4080         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4081         drhd = dmar_find_matched_drhd_unit(pdev);
4082         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4083                             TAINT_FIRMWARE_WORKAROUND,
4084                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4085                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4086 }
4087 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4088
4089 static void __init init_no_remapping_devices(void)
4090 {
4091         struct dmar_drhd_unit *drhd;
4092         struct device *dev;
4093         int i;
4094
4095         for_each_drhd_unit(drhd) {
4096                 if (!drhd->include_all) {
4097                         for_each_active_dev_scope(drhd->devices,
4098                                                   drhd->devices_cnt, i, dev)
4099                                 break;
4100                         /* ignore DMAR unit if no devices exist */
4101                         if (i == drhd->devices_cnt)
4102                                 drhd->ignored = 1;
4103                 }
4104         }
4105
4106         for_each_active_drhd_unit(drhd) {
4107                 if (drhd->include_all)
4108                         continue;
4109
4110                 for_each_active_dev_scope(drhd->devices,
4111                                           drhd->devices_cnt, i, dev)
4112                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4113                                 break;
4114                 if (i < drhd->devices_cnt)
4115                         continue;
4116
4117                 /* This IOMMU has *only* gfx devices. Either bypass it or
4118                    set the gfx_mapped flag, as appropriate */
4119                 if (dmar_map_gfx) {
4120                         intel_iommu_gfx_mapped = 1;
4121                 } else {
4122                         drhd->ignored = 1;
4123                         for_each_active_dev_scope(drhd->devices,
4124                                                   drhd->devices_cnt, i, dev)
4125                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4126                 }
4127         }
4128 }
4129
4130 #ifdef CONFIG_SUSPEND
4131 static int init_iommu_hw(void)
4132 {
4133         struct dmar_drhd_unit *drhd;
4134         struct intel_iommu *iommu = NULL;
4135
4136         for_each_active_iommu(iommu, drhd)
4137                 if (iommu->qi)
4138                         dmar_reenable_qi(iommu);
4139
4140         for_each_iommu(iommu, drhd) {
4141                 if (drhd->ignored) {
4142                         /*
4143                          * we always have to disable PMRs or DMA may fail on
4144                          * this device
4145                          */
4146                         if (force_on)
4147                                 iommu_disable_protect_mem_regions(iommu);
4148                         continue;
4149                 }
4150
4151                 iommu_flush_write_buffer(iommu);
4152
4153                 iommu_set_root_entry(iommu);
4154
4155                 iommu->flush.flush_context(iommu, 0, 0, 0,
4156                                            DMA_CCMD_GLOBAL_INVL);
4157                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4158                 iommu_enable_translation(iommu);
4159                 iommu_disable_protect_mem_regions(iommu);
4160         }
4161
4162         return 0;
4163 }
4164
4165 static void iommu_flush_all(void)
4166 {
4167         struct dmar_drhd_unit *drhd;
4168         struct intel_iommu *iommu;
4169
4170         for_each_active_iommu(iommu, drhd) {
4171                 iommu->flush.flush_context(iommu, 0, 0, 0,
4172                                            DMA_CCMD_GLOBAL_INVL);
4173                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4174                                          DMA_TLB_GLOBAL_FLUSH);
4175         }
4176 }
4177
4178 static int iommu_suspend(void)
4179 {
4180         struct dmar_drhd_unit *drhd;
4181         struct intel_iommu *iommu = NULL;
4182         unsigned long flag;
4183
4184         for_each_active_iommu(iommu, drhd) {
4185                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4186                                                  GFP_ATOMIC);
4187                 if (!iommu->iommu_state)
4188                         goto nomem;
4189         }
4190
4191         iommu_flush_all();
4192
4193         for_each_active_iommu(iommu, drhd) {
4194                 iommu_disable_translation(iommu);
4195
4196                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4197
4198                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4199                         readl(iommu->reg + DMAR_FECTL_REG);
4200                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4201                         readl(iommu->reg + DMAR_FEDATA_REG);
4202                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4203                         readl(iommu->reg + DMAR_FEADDR_REG);
4204                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4205                         readl(iommu->reg + DMAR_FEUADDR_REG);
4206
4207                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4208         }
4209         return 0;
4210
4211 nomem:
4212         for_each_active_iommu(iommu, drhd)
4213                 kfree(iommu->iommu_state);
4214
4215         return -ENOMEM;
4216 }
4217
4218 static void iommu_resume(void)
4219 {
4220         struct dmar_drhd_unit *drhd;
4221         struct intel_iommu *iommu = NULL;
4222         unsigned long flag;
4223
4224         if (init_iommu_hw()) {
4225                 if (force_on)
4226                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4227                 else
4228                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4229                 return;
4230         }
4231
4232         for_each_active_iommu(iommu, drhd) {
4233
4234                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4235
4236                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4237                         iommu->reg + DMAR_FECTL_REG);
4238                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4239                         iommu->reg + DMAR_FEDATA_REG);
4240                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4241                         iommu->reg + DMAR_FEADDR_REG);
4242                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4243                         iommu->reg + DMAR_FEUADDR_REG);
4244
4245                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4246         }
4247
4248         for_each_active_iommu(iommu, drhd)
4249                 kfree(iommu->iommu_state);
4250 }
4251
4252 static struct syscore_ops iommu_syscore_ops = {
4253         .resume         = iommu_resume,
4254         .suspend        = iommu_suspend,
4255 };
4256
4257 static void __init init_iommu_pm_ops(void)
4258 {
4259         register_syscore_ops(&iommu_syscore_ops);
4260 }
4261
4262 #else
4263 static inline void init_iommu_pm_ops(void) {}
4264 #endif  /* CONFIG_PM */
4265
4266
4267 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4268 {
4269         struct acpi_dmar_reserved_memory *rmrr;
4270         struct dmar_rmrr_unit *rmrru;
4271
4272         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4273         if (!rmrru)
4274                 return -ENOMEM;
4275
4276         rmrru->hdr = header;
4277         rmrr = (struct acpi_dmar_reserved_memory *)header;
4278         rmrru->base_address = rmrr->base_address;
4279         rmrru->end_address = rmrr->end_address;
4280         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4281                                 ((void *)rmrr) + rmrr->header.length,
4282                                 &rmrru->devices_cnt);
4283         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4284                 kfree(rmrru);
4285                 return -ENOMEM;
4286         }
4287
4288         list_add(&rmrru->list, &dmar_rmrr_units);
4289
4290         return 0;
4291 }
4292
4293 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4294 {
4295         struct dmar_atsr_unit *atsru;
4296         struct acpi_dmar_atsr *tmp;
4297
4298         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4299                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4300                 if (atsr->segment != tmp->segment)
4301                         continue;
4302                 if (atsr->header.length != tmp->header.length)
4303                         continue;
4304                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4305                         return atsru;
4306         }
4307
4308         return NULL;
4309 }
4310
4311 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4312 {
4313         struct acpi_dmar_atsr *atsr;
4314         struct dmar_atsr_unit *atsru;
4315
4316         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4317                 return 0;
4318
4319         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320         atsru = dmar_find_atsr(atsr);
4321         if (atsru)
4322                 return 0;
4323
4324         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4325         if (!atsru)
4326                 return -ENOMEM;
4327
4328         /*
4329          * If memory is allocated from slab by ACPI _DSM method, we need to
4330          * copy the memory content because the memory buffer will be freed
4331          * on return.
4332          */
4333         atsru->hdr = (void *)(atsru + 1);
4334         memcpy(atsru->hdr, hdr, hdr->length);
4335         atsru->include_all = atsr->flags & 0x1;
4336         if (!atsru->include_all) {
4337                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4338                                 (void *)atsr + atsr->header.length,
4339                                 &atsru->devices_cnt);
4340                 if (atsru->devices_cnt && atsru->devices == NULL) {
4341                         kfree(atsru);
4342                         return -ENOMEM;
4343                 }
4344         }
4345
4346         list_add_rcu(&atsru->list, &dmar_atsr_units);
4347
4348         return 0;
4349 }
4350
4351 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4352 {
4353         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4354         kfree(atsru);
4355 }
4356
4357 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4358 {
4359         struct acpi_dmar_atsr *atsr;
4360         struct dmar_atsr_unit *atsru;
4361
4362         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4363         atsru = dmar_find_atsr(atsr);
4364         if (atsru) {
4365                 list_del_rcu(&atsru->list);
4366                 synchronize_rcu();
4367                 intel_iommu_free_atsr(atsru);
4368         }
4369
4370         return 0;
4371 }
4372
4373 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4374 {
4375         int i;
4376         struct device *dev;
4377         struct acpi_dmar_atsr *atsr;
4378         struct dmar_atsr_unit *atsru;
4379
4380         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4381         atsru = dmar_find_atsr(atsr);
4382         if (!atsru)
4383                 return 0;
4384
4385         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4386                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4387                                           i, dev)
4388                         return -EBUSY;
4389         }
4390
4391         return 0;
4392 }
4393
4394 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4395 {
4396         int sp, ret = 0;
4397         struct intel_iommu *iommu = dmaru->iommu;
4398
4399         if (g_iommus[iommu->seq_id])
4400                 return 0;
4401
4402         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4403                 pr_warn("%s: Doesn't support hardware pass through.\n",
4404                         iommu->name);
4405                 return -ENXIO;
4406         }
4407         if (!ecap_sc_support(iommu->ecap) &&
4408             domain_update_iommu_snooping(iommu)) {
4409                 pr_warn("%s: Doesn't support snooping.\n",
4410                         iommu->name);
4411                 return -ENXIO;
4412         }
4413         sp = domain_update_iommu_superpage(iommu) - 1;
4414         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4415                 pr_warn("%s: Doesn't support large page.\n",
4416                         iommu->name);
4417                 return -ENXIO;
4418         }
4419
4420         /*
4421          * Disable translation if already enabled prior to OS handover.
4422          */
4423         if (iommu->gcmd & DMA_GCMD_TE)
4424                 iommu_disable_translation(iommu);
4425
4426         g_iommus[iommu->seq_id] = iommu;
4427         ret = iommu_init_domains(iommu);
4428         if (ret == 0)
4429                 ret = iommu_alloc_root_entry(iommu);
4430         if (ret)
4431                 goto out;
4432
4433 #ifdef CONFIG_INTEL_IOMMU_SVM
4434         if (pasid_enabled(iommu))
4435                 intel_svm_alloc_pasid_tables(iommu);
4436 #endif
4437
4438         if (dmaru->ignored) {
4439                 /*
4440                  * we always have to disable PMRs or DMA may fail on this device
4441                  */
4442                 if (force_on)
4443                         iommu_disable_protect_mem_regions(iommu);
4444                 return 0;
4445         }
4446
4447         intel_iommu_init_qi(iommu);
4448         iommu_flush_write_buffer(iommu);
4449
4450 #ifdef CONFIG_INTEL_IOMMU_SVM
4451         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4452                 ret = intel_svm_enable_prq(iommu);
4453                 if (ret)
4454                         goto disable_iommu;
4455         }
4456 #endif
4457         ret = dmar_set_interrupt(iommu);
4458         if (ret)
4459                 goto disable_iommu;
4460
4461         iommu_set_root_entry(iommu);
4462         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4463         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4464         iommu_enable_translation(iommu);
4465
4466         iommu_disable_protect_mem_regions(iommu);
4467         return 0;
4468
4469 disable_iommu:
4470         disable_dmar_iommu(iommu);
4471 out:
4472         free_dmar_iommu(iommu);
4473         return ret;
4474 }
4475
4476 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4477 {
4478         int ret = 0;
4479         struct intel_iommu *iommu = dmaru->iommu;
4480
4481         if (!intel_iommu_enabled)
4482                 return 0;
4483         if (iommu == NULL)
4484                 return -EINVAL;
4485
4486         if (insert) {
4487                 ret = intel_iommu_add(dmaru);
4488         } else {
4489                 disable_dmar_iommu(iommu);
4490                 free_dmar_iommu(iommu);
4491         }
4492
4493         return ret;
4494 }
4495
4496 static void intel_iommu_free_dmars(void)
4497 {
4498         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4499         struct dmar_atsr_unit *atsru, *atsr_n;
4500
4501         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4502                 list_del(&rmrru->list);
4503                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4504                 kfree(rmrru);
4505         }
4506
4507         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4508                 list_del(&atsru->list);
4509                 intel_iommu_free_atsr(atsru);
4510         }
4511 }
4512
4513 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4514 {
4515         int i, ret = 1;
4516         struct pci_bus *bus;
4517         struct pci_dev *bridge = NULL;
4518         struct device *tmp;
4519         struct acpi_dmar_atsr *atsr;
4520         struct dmar_atsr_unit *atsru;
4521
4522         dev = pci_physfn(dev);
4523         for (bus = dev->bus; bus; bus = bus->parent) {
4524                 bridge = bus->self;
4525                 /* If it's an integrated device, allow ATS */
4526                 if (!bridge)
4527                         return 1;
4528                 /* Connected via non-PCIe: no ATS */
4529                 if (!pci_is_pcie(bridge) ||
4530                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4531                         return 0;
4532                 /* If we found the root port, look it up in the ATSR */
4533                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4534                         break;
4535         }
4536
4537         rcu_read_lock();
4538         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4539                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4540                 if (atsr->segment != pci_domain_nr(dev->bus))
4541                         continue;
4542
4543                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4544                         if (tmp == &bridge->dev)
4545                                 goto out;
4546
4547                 if (atsru->include_all)
4548                         goto out;
4549         }
4550         ret = 0;
4551 out:
4552         rcu_read_unlock();
4553
4554         return ret;
4555 }
4556
4557 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4558 {
4559         int ret = 0;
4560         struct dmar_rmrr_unit *rmrru;
4561         struct dmar_atsr_unit *atsru;
4562         struct acpi_dmar_atsr *atsr;
4563         struct acpi_dmar_reserved_memory *rmrr;
4564
4565         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4566                 return 0;
4567
4568         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4569                 rmrr = container_of(rmrru->hdr,
4570                                     struct acpi_dmar_reserved_memory, header);
4571                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4572                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4573                                 ((void *)rmrr) + rmrr->header.length,
4574                                 rmrr->segment, rmrru->devices,
4575                                 rmrru->devices_cnt);
4576                         if(ret < 0)
4577                                 return ret;
4578                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4579                         dmar_remove_dev_scope(info, rmrr->segment,
4580                                 rmrru->devices, rmrru->devices_cnt);
4581                 }
4582         }
4583
4584         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4585                 if (atsru->include_all)
4586                         continue;
4587
4588                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4589                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4590                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4591                                         (void *)atsr + atsr->header.length,
4592                                         atsr->segment, atsru->devices,
4593                                         atsru->devices_cnt);
4594                         if (ret > 0)
4595                                 break;
4596                         else if(ret < 0)
4597                                 return ret;
4598                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4599                         if (dmar_remove_dev_scope(info, atsr->segment,
4600                                         atsru->devices, atsru->devices_cnt))
4601                                 break;
4602                 }
4603         }
4604
4605         return 0;
4606 }
4607
4608 /*
4609  * Here we only respond to action of unbound device from driver.
4610  *
4611  * Added device is not attached to its DMAR domain here yet. That will happen
4612  * when mapping the device to iova.
4613  */
4614 static int device_notifier(struct notifier_block *nb,
4615                                   unsigned long action, void *data)
4616 {
4617         struct device *dev = data;
4618         struct dmar_domain *domain;
4619
4620         if (iommu_dummy(dev))
4621                 return 0;
4622
4623         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4624                 return 0;
4625
4626         domain = find_domain(dev);
4627         if (!domain)
4628                 return 0;
4629
4630         dmar_remove_one_dev_info(domain, dev);
4631         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4632                 domain_exit(domain);
4633
4634         return 0;
4635 }
4636
4637 static struct notifier_block device_nb = {
4638         .notifier_call = device_notifier,
4639 };
4640
4641 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4642                                        unsigned long val, void *v)
4643 {
4644         struct memory_notify *mhp = v;
4645         unsigned long long start, end;
4646         unsigned long start_vpfn, last_vpfn;
4647
4648         switch (val) {
4649         case MEM_GOING_ONLINE:
4650                 start = mhp->start_pfn << PAGE_SHIFT;
4651                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4652                 if (iommu_domain_identity_map(si_domain, start, end)) {
4653                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4654                                 start, end);
4655                         return NOTIFY_BAD;
4656                 }
4657                 break;
4658
4659         case MEM_OFFLINE:
4660         case MEM_CANCEL_ONLINE:
4661                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4662                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4663                 while (start_vpfn <= last_vpfn) {
4664                         struct iova *iova;
4665                         struct dmar_drhd_unit *drhd;
4666                         struct intel_iommu *iommu;
4667                         struct page *freelist;
4668
4669                         iova = find_iova(&si_domain->iovad, start_vpfn);
4670                         if (iova == NULL) {
4671                                 pr_debug("Failed get IOVA for PFN %lx\n",
4672                                          start_vpfn);
4673                                 break;
4674                         }
4675
4676                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4677                                                      start_vpfn, last_vpfn);
4678                         if (iova == NULL) {
4679                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4680                                         start_vpfn, last_vpfn);
4681                                 return NOTIFY_BAD;
4682                         }
4683
4684                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4685                                                iova->pfn_hi);
4686
4687                         rcu_read_lock();
4688                         for_each_active_iommu(iommu, drhd)
4689                                 iommu_flush_iotlb_psi(iommu, si_domain,
4690                                         iova->pfn_lo, iova_size(iova),
4691                                         !freelist, 0);
4692                         rcu_read_unlock();
4693                         dma_free_pagelist(freelist);
4694
4695                         start_vpfn = iova->pfn_hi + 1;
4696                         free_iova_mem(iova);
4697                 }
4698                 break;
4699         }
4700
4701         return NOTIFY_OK;
4702 }
4703
4704 static struct notifier_block intel_iommu_memory_nb = {
4705         .notifier_call = intel_iommu_memory_notifier,
4706         .priority = 0
4707 };
4708
4709 static void free_all_cpu_cached_iovas(unsigned int cpu)
4710 {
4711         int i;
4712
4713         for (i = 0; i < g_num_of_iommus; i++) {
4714                 struct intel_iommu *iommu = g_iommus[i];
4715                 struct dmar_domain *domain;
4716                 int did;
4717
4718                 if (!iommu)
4719                         continue;
4720
4721                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4722                         domain = get_iommu_domain(iommu, (u16)did);
4723
4724                         if (!domain)
4725                                 continue;
4726                         free_cpu_cached_iovas(cpu, &domain->iovad);
4727                 }
4728         }
4729 }
4730
4731 static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
4732                                     unsigned long action, void *v)
4733 {
4734         unsigned int cpu = (unsigned long)v;
4735
4736         switch (action) {
4737         case CPU_DEAD:
4738         case CPU_DEAD_FROZEN:
4739                 free_all_cpu_cached_iovas(cpu);
4740                 flush_unmaps_timeout(cpu);
4741                 break;
4742         }
4743         return NOTIFY_OK;
4744 }
4745
4746 static struct notifier_block intel_iommu_cpu_nb = {
4747         .notifier_call = intel_iommu_cpu_notifier,
4748 };
4749
4750 static ssize_t intel_iommu_show_version(struct device *dev,
4751                                         struct device_attribute *attr,
4752                                         char *buf)
4753 {
4754         struct intel_iommu *iommu = dev_get_drvdata(dev);
4755         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4756         return sprintf(buf, "%d:%d\n",
4757                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4758 }
4759 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4760
4761 static ssize_t intel_iommu_show_address(struct device *dev,
4762                                         struct device_attribute *attr,
4763                                         char *buf)
4764 {
4765         struct intel_iommu *iommu = dev_get_drvdata(dev);
4766         return sprintf(buf, "%llx\n", iommu->reg_phys);
4767 }
4768 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4769
4770 static ssize_t intel_iommu_show_cap(struct device *dev,
4771                                     struct device_attribute *attr,
4772                                     char *buf)
4773 {
4774         struct intel_iommu *iommu = dev_get_drvdata(dev);
4775         return sprintf(buf, "%llx\n", iommu->cap);
4776 }
4777 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4778
4779 static ssize_t intel_iommu_show_ecap(struct device *dev,
4780                                     struct device_attribute *attr,
4781                                     char *buf)
4782 {
4783         struct intel_iommu *iommu = dev_get_drvdata(dev);
4784         return sprintf(buf, "%llx\n", iommu->ecap);
4785 }
4786 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4787
4788 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4789                                       struct device_attribute *attr,
4790                                       char *buf)
4791 {
4792         struct intel_iommu *iommu = dev_get_drvdata(dev);
4793         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4794 }
4795 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4796
4797 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4798                                            struct device_attribute *attr,
4799                                            char *buf)
4800 {
4801         struct intel_iommu *iommu = dev_get_drvdata(dev);
4802         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4803                                                   cap_ndoms(iommu->cap)));
4804 }
4805 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4806
4807 static struct attribute *intel_iommu_attrs[] = {
4808         &dev_attr_version.attr,
4809         &dev_attr_address.attr,
4810         &dev_attr_cap.attr,
4811         &dev_attr_ecap.attr,
4812         &dev_attr_domains_supported.attr,
4813         &dev_attr_domains_used.attr,
4814         NULL,
4815 };
4816
4817 static struct attribute_group intel_iommu_group = {
4818         .name = "intel-iommu",
4819         .attrs = intel_iommu_attrs,
4820 };
4821
4822 const struct attribute_group *intel_iommu_groups[] = {
4823         &intel_iommu_group,
4824         NULL,
4825 };
4826
4827 int __init intel_iommu_init(void)
4828 {
4829         int ret = -ENODEV;
4830         struct dmar_drhd_unit *drhd;
4831         struct intel_iommu *iommu;
4832
4833         /* VT-d is required for a TXT/tboot launch, so enforce that */
4834         force_on = tboot_force_iommu();
4835
4836         if (iommu_init_mempool()) {
4837                 if (force_on)
4838                         panic("tboot: Failed to initialize iommu memory\n");
4839                 return -ENOMEM;
4840         }
4841
4842         down_write(&dmar_global_lock);
4843         if (dmar_table_init()) {
4844                 if (force_on)
4845                         panic("tboot: Failed to initialize DMAR table\n");
4846                 goto out_free_dmar;
4847         }
4848
4849         if (dmar_dev_scope_init() < 0) {
4850                 if (force_on)
4851                         panic("tboot: Failed to initialize DMAR device scope\n");
4852                 goto out_free_dmar;
4853         }
4854
4855         if (no_iommu || dmar_disabled)
4856                 goto out_free_dmar;
4857
4858         if (list_empty(&dmar_rmrr_units))
4859                 pr_info("No RMRR found\n");
4860
4861         if (list_empty(&dmar_atsr_units))
4862                 pr_info("No ATSR found\n");
4863
4864         if (dmar_init_reserved_ranges()) {
4865                 if (force_on)
4866                         panic("tboot: Failed to reserve iommu ranges\n");
4867                 goto out_free_reserved_range;
4868         }
4869
4870         init_no_remapping_devices();
4871
4872         ret = init_dmars();
4873         if (ret) {
4874                 if (force_on)
4875                         panic("tboot: Failed to initialize DMARs\n");
4876                 pr_err("Initialization failed\n");
4877                 goto out_free_reserved_range;
4878         }
4879         up_write(&dmar_global_lock);
4880         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4881
4882 #ifdef CONFIG_SWIOTLB
4883         swiotlb = 0;
4884 #endif
4885         dma_ops = &intel_dma_ops;
4886
4887         init_iommu_pm_ops();
4888
4889         for_each_active_iommu(iommu, drhd)
4890                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4891                                                        intel_iommu_groups,
4892                                                        "%s", iommu->name);
4893
4894         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4895         bus_register_notifier(&pci_bus_type, &device_nb);
4896         if (si_domain && !hw_pass_through)
4897                 register_memory_notifier(&intel_iommu_memory_nb);
4898         register_hotcpu_notifier(&intel_iommu_cpu_nb);
4899
4900         intel_iommu_enabled = 1;
4901
4902         return 0;
4903
4904 out_free_reserved_range:
4905         put_iova_domain(&reserved_iova_list);
4906 out_free_dmar:
4907         intel_iommu_free_dmars();
4908         up_write(&dmar_global_lock);
4909         iommu_exit_mempool();
4910         return ret;
4911 }
4912
4913 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4914 {
4915         struct intel_iommu *iommu = opaque;
4916
4917         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4918         return 0;
4919 }
4920
4921 /*
4922  * NB - intel-iommu lacks any sort of reference counting for the users of
4923  * dependent devices.  If multiple endpoints have intersecting dependent
4924  * devices, unbinding the driver from any one of them will possibly leave
4925  * the others unable to operate.
4926  */
4927 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4928 {
4929         if (!iommu || !dev || !dev_is_pci(dev))
4930                 return;
4931
4932         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4933 }
4934
4935 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4936 {
4937         struct intel_iommu *iommu;
4938         unsigned long flags;
4939
4940         assert_spin_locked(&device_domain_lock);
4941
4942         if (WARN_ON(!info))
4943                 return;
4944
4945         iommu = info->iommu;
4946
4947         if (info->dev) {
4948                 iommu_disable_dev_iotlb(info);
4949                 domain_context_clear(iommu, info->dev);
4950         }
4951
4952         unlink_domain_info(info);
4953
4954         spin_lock_irqsave(&iommu->lock, flags);
4955         domain_detach_iommu(info->domain, iommu);
4956         spin_unlock_irqrestore(&iommu->lock, flags);
4957
4958         free_devinfo_mem(info);
4959 }
4960
4961 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4962                                      struct device *dev)
4963 {
4964         struct device_domain_info *info;
4965         unsigned long flags;
4966
4967         spin_lock_irqsave(&device_domain_lock, flags);
4968         info = dev->archdata.iommu;
4969         __dmar_remove_one_dev_info(info);
4970         spin_unlock_irqrestore(&device_domain_lock, flags);
4971 }
4972
4973 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4974 {
4975         int adjust_width;
4976
4977         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4978                         DMA_32BIT_PFN);
4979         domain_reserve_special_ranges(domain);
4980
4981         /* calculate AGAW */
4982         domain->gaw = guest_width;
4983         adjust_width = guestwidth_to_adjustwidth(guest_width);
4984         domain->agaw = width_to_agaw(adjust_width);
4985
4986         domain->iommu_coherency = 0;
4987         domain->iommu_snooping = 0;
4988         domain->iommu_superpage = 0;
4989         domain->max_addr = 0;
4990
4991         /* always allocate the top pgd */
4992         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4993         if (!domain->pgd)
4994                 return -ENOMEM;
4995         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4996         return 0;
4997 }
4998
4999 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5000 {
5001         struct dmar_domain *dmar_domain;
5002         struct iommu_domain *domain;
5003
5004         if (type != IOMMU_DOMAIN_UNMANAGED)
5005                 return NULL;
5006
5007         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
5008         if (!dmar_domain) {
5009                 pr_err("Can't allocate dmar_domain\n");
5010                 return NULL;
5011         }
5012         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5013                 pr_err("Domain initialization failed\n");
5014                 domain_exit(dmar_domain);
5015                 return NULL;
5016         }
5017         domain_update_iommu_cap(dmar_domain);
5018
5019         domain = &dmar_domain->domain;
5020         domain->geometry.aperture_start = 0;
5021         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5022         domain->geometry.force_aperture = true;
5023
5024         return domain;
5025 }
5026
/*
 * iommu_ops->domain_free: tear down the dmar_domain that wraps @domain.
 */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        domain_exit(dmar_domain);
}
5031
5032 static int intel_iommu_attach_device(struct iommu_domain *domain,
5033                                      struct device *dev)
5034 {
5035         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5036         struct intel_iommu *iommu;
5037         int addr_width;
5038         u8 bus, devfn;
5039
5040         if (device_is_rmrr_locked(dev)) {
5041                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5042                 return -EPERM;
5043         }
5044
5045         /* normally dev is not mapped */
5046         if (unlikely(domain_context_mapped(dev))) {
5047                 struct dmar_domain *old_domain;
5048
5049                 old_domain = find_domain(dev);
5050                 if (old_domain) {
5051                         rcu_read_lock();
5052                         dmar_remove_one_dev_info(old_domain, dev);
5053                         rcu_read_unlock();
5054
5055                         if (!domain_type_is_vm_or_si(old_domain) &&
5056                              list_empty(&old_domain->devices))
5057                                 domain_exit(old_domain);
5058                 }
5059         }
5060
5061         iommu = device_to_iommu(dev, &bus, &devfn);
5062         if (!iommu)
5063                 return -ENODEV;
5064
5065         /* check if this iommu agaw is sufficient for max mapped address */
5066         addr_width = agaw_to_width(iommu->agaw);
5067         if (addr_width > cap_mgaw(iommu->cap))
5068                 addr_width = cap_mgaw(iommu->cap);
5069
5070         if (dmar_domain->max_addr > (1LL << addr_width)) {
5071                 pr_err("%s: iommu width (%d) is not "
5072                        "sufficient for the mapped address (%llx)\n",
5073                        __func__, addr_width, dmar_domain->max_addr);
5074                 return -EFAULT;
5075         }
5076         dmar_domain->gaw = addr_width;
5077
5078         /*
5079          * Knock out extra levels of page tables if necessary
5080          */
5081         while (iommu->agaw < dmar_domain->agaw) {
5082                 struct dma_pte *pte;
5083
5084                 pte = dmar_domain->pgd;
5085                 if (dma_pte_present(pte)) {
5086                         dmar_domain->pgd = (struct dma_pte *)
5087                                 phys_to_virt(dma_pte_addr(pte));
5088                         free_pgtable_page(pte);
5089                 }
5090                 dmar_domain->agaw--;
5091         }
5092
5093         return domain_add_dev_info(dmar_domain, dev);
5094 }
5095
/*
 * iommu_ops->detach_dev: remove @dev's per-device IOMMU info via
 * dmar_remove_one_dev_info().
 */
static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        dmar_remove_one_dev_info(dmar_domain, dev);
}
5101
5102 static int intel_iommu_map(struct iommu_domain *domain,
5103                            unsigned long iova, phys_addr_t hpa,
5104                            size_t size, int iommu_prot)
5105 {
5106         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5107         u64 max_addr;
5108         int prot = 0;
5109         int ret;
5110
5111         if (iommu_prot & IOMMU_READ)
5112                 prot |= DMA_PTE_READ;
5113         if (iommu_prot & IOMMU_WRITE)
5114                 prot |= DMA_PTE_WRITE;
5115         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5116                 prot |= DMA_PTE_SNP;
5117
5118         max_addr = iova + size;
5119         if (dmar_domain->max_addr < max_addr) {
5120                 u64 end;
5121
5122                 /* check if minimum agaw is sufficient for mapped address */
5123                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5124                 if (end < max_addr) {
5125                         pr_err("%s: iommu width (%d) is not "
5126                                "sufficient for the mapped address (%llx)\n",
5127                                __func__, dmar_domain->gaw, max_addr);
5128                         return -EFAULT;
5129                 }
5130                 dmar_domain->max_addr = max_addr;
5131         }
5132         /* Round up size to next multiple of PAGE_SIZE, if it and
5133            the low bits of hpa would take us onto the next page */
5134         size = aligned_nrpages(hpa, size);
5135         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5136                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5137         return ret;
5138 }
5139
5140 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5141                                 unsigned long iova, size_t size)
5142 {
5143         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5144         struct page *freelist = NULL;
5145         struct intel_iommu *iommu;
5146         unsigned long start_pfn, last_pfn;
5147         unsigned int npages;
5148         int iommu_id, level = 0;
5149
5150         /* Cope with horrid API which requires us to unmap more than the
5151            size argument if it happens to be a large-page mapping. */
5152         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5153
5154         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5155                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5156
5157         start_pfn = iova >> VTD_PAGE_SHIFT;
5158         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5159
5160         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5161
5162         npages = last_pfn - start_pfn + 1;
5163
5164         for_each_domain_iommu(iommu_id, dmar_domain) {
5165                 iommu = g_iommus[iommu_id];
5166
5167                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5168                                       start_pfn, npages, !freelist, 0);
5169         }
5170
5171         dma_free_pagelist(freelist);
5172
5173         if (dmar_domain->max_addr == iova + size)
5174                 dmar_domain->max_addr = iova;
5175
5176         return size;
5177 }
5178
5179 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5180                                             dma_addr_t iova)
5181 {
5182         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5183         struct dma_pte *pte;
5184         int level = 0;
5185         u64 phys = 0;
5186
5187         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5188         if (pte)
5189                 phys = dma_pte_addr(pte);
5190
5191         return phys;
5192 }
5193
5194 static bool intel_iommu_capable(enum iommu_cap cap)
5195 {
5196         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5197                 return domain_update_iommu_snooping(NULL) == 1;
5198         if (cap == IOMMU_CAP_INTR_REMAP)
5199                 return irq_remapping_enabled == 1;
5200
5201         return false;
5202 }
5203
5204 static int intel_iommu_add_device(struct device *dev)
5205 {
5206         struct intel_iommu *iommu;
5207         struct iommu_group *group;
5208         u8 bus, devfn;
5209
5210         iommu = device_to_iommu(dev, &bus, &devfn);
5211         if (!iommu)
5212                 return -ENODEV;
5213
5214         iommu_device_link(iommu->iommu_dev, dev);
5215
5216         group = iommu_group_get_for_dev(dev);
5217
5218         if (IS_ERR(group))
5219                 return PTR_ERR(group);
5220
5221         iommu_group_put(group);
5222         return 0;
5223 }
5224
5225 static void intel_iommu_remove_device(struct device *dev)
5226 {
5227         struct intel_iommu *iommu;
5228         u8 bus, devfn;
5229
5230         iommu = device_to_iommu(dev, &bus, &devfn);
5231         if (!iommu)
5232                 return;
5233
5234         iommu_group_remove_device(dev);
5235
5236         iommu_device_unlink(iommu->iommu_dev, dev);
5237 }
5238
5239 #ifdef CONFIG_INTEL_IOMMU_SVM
5240 #define MAX_NR_PASID_BITS (20)
5241 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5242 {
5243         /*
5244          * Convert ecap_pss to extend context entry pts encoding, also
5245          * respect the soft pasid_max value set by the iommu.
5246          * - number of PASID bits = ecap_pss + 1
5247          * - number of PASID table entries = 2^(pts + 5)
5248          * Therefore, pts = ecap_pss - 4
5249          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5250          */
5251         if (ecap_pss(iommu->ecap) < 5)
5252                 return 0;
5253
5254         /* pasid_max is encoded as actual number of entries not the bits */
5255         return find_first_bit((unsigned long *)&iommu->pasid_max,
5256                         MAX_NR_PASID_BITS) - 5;
5257 }
5258
5259 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5260 {
5261         struct device_domain_info *info;
5262         struct context_entry *context;
5263         struct dmar_domain *domain;
5264         unsigned long flags;
5265         u64 ctx_lo;
5266         int ret;
5267
5268         domain = get_valid_domain_for_dev(sdev->dev);
5269         if (!domain)
5270                 return -EINVAL;
5271
5272         spin_lock_irqsave(&device_domain_lock, flags);
5273         spin_lock(&iommu->lock);
5274
5275         ret = -EINVAL;
5276         info = sdev->dev->archdata.iommu;
5277         if (!info || !info->pasid_supported)
5278                 goto out;
5279
5280         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5281         if (WARN_ON(!context))
5282                 goto out;
5283
5284         ctx_lo = context[0].lo;
5285
5286         sdev->did = domain->iommu_did[iommu->seq_id];
5287         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5288
5289         if (!(ctx_lo & CONTEXT_PASIDE)) {
5290                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5291                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5292                         intel_iommu_get_pts(iommu);
5293
5294                 wmb();
5295                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5296                  * extended to permit requests-with-PASID if the PASIDE bit
5297                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5298                  * however, the PASIDE bit is ignored and requests-with-PASID
5299                  * are unconditionally blocked. Which makes less sense.
5300                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5301                  * "guest mode" translation types depending on whether ATS
5302                  * is available or not. Annoyingly, we can't use the new
5303                  * modes *unless* PASIDE is set. */
5304                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5305                         ctx_lo &= ~CONTEXT_TT_MASK;
5306                         if (info->ats_supported)
5307                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5308                         else
5309                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5310                 }
5311                 ctx_lo |= CONTEXT_PASIDE;
5312                 if (iommu->pasid_state_table)
5313                         ctx_lo |= CONTEXT_DINVE;
5314                 if (info->pri_supported)
5315                         ctx_lo |= CONTEXT_PRS;
5316                 context[0].lo = ctx_lo;
5317                 wmb();
5318                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5319                                            DMA_CCMD_MASK_NOBIT,
5320                                            DMA_CCMD_DEVICE_INVL);
5321         }
5322
5323         /* Enable PASID support in the device, if it wasn't already */
5324         if (!info->pasid_enabled)
5325                 iommu_enable_dev_iotlb(info);
5326
5327         if (info->ats_enabled) {
5328                 sdev->dev_iotlb = 1;
5329                 sdev->qdep = info->ats_qdep;
5330                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5331                         sdev->qdep = 0;
5332         }
5333         ret = 0;
5334
5335  out:
5336         spin_unlock(&iommu->lock);
5337         spin_unlock_irqrestore(&device_domain_lock, flags);
5338
5339         return ret;
5340 }
5341
5342 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5343 {
5344         struct intel_iommu *iommu;
5345         u8 bus, devfn;
5346
5347         if (iommu_dummy(dev)) {
5348                 dev_warn(dev,
5349                          "No IOMMU translation for device; cannot enable SVM\n");
5350                 return NULL;
5351         }
5352
5353         iommu = device_to_iommu(dev, &bus, &devfn);
5354         if ((!iommu)) {
5355                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5356                 return NULL;
5357         }
5358
5359         if (!iommu->pasid_table) {
5360                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5361                 return NULL;
5362         }
5363
5364         return iommu;
5365 }
5366 #endif /* CONFIG_INTEL_IOMMU_SVM */
5367
5368 static const struct iommu_ops intel_iommu_ops = {
5369         .capable        = intel_iommu_capable,
5370         .domain_alloc   = intel_iommu_domain_alloc,
5371         .domain_free    = intel_iommu_domain_free,
5372         .attach_dev     = intel_iommu_attach_device,
5373         .detach_dev     = intel_iommu_detach_device,
5374         .map            = intel_iommu_map,
5375         .unmap          = intel_iommu_unmap,
5376         .map_sg         = default_iommu_map_sg,
5377         .iova_to_phys   = intel_iommu_iova_to_phys,
5378         .add_device     = intel_iommu_add_device,
5379         .remove_device  = intel_iommu_remove_device,
5380         .device_group   = pci_device_group,
5381         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5382 };
5383
5384 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5385 {
5386         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5387         pr_info("Disabling IOMMU for graphics on this chipset\n");
5388         dmar_map_gfx = 0;
5389 }
5390
5391 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5392 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5393 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5398
5399 static void quirk_iommu_rwbf(struct pci_dev *dev)
5400 {
5401         /*
5402          * Mobile 4 Series Chipset neglects to set RWBF capability,
5403          * but needs it. Same seems to hold for the desktop versions.
5404          */
5405         pr_info("Forcing write-buffer flush capability\n");
5406         rwbf_quirk = 1;
5407 }
5408
5409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5416
5417 #define GGC 0x52
5418 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5419 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5420 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5421 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5422 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5423 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5424 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5425 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5426
5427 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5428 {
5429         unsigned short ggc;
5430
5431         if (pci_read_config_word(dev, GGC, &ggc))
5432                 return;
5433
5434         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5435                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5436                 dmar_map_gfx = 0;
5437         } else if (dmar_map_gfx) {
5438                 /* we have to ensure the gfx device is idle before we flush */
5439                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5440                 intel_iommu_strict = 1;
5441        }
5442 }
5443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5447
5448 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5449    ISOCH DMAR unit for the Azalia sound device, but not give it any
5450    TLB entries, which causes it to deadlock. Check for that.  We do
5451    this in a function called from init_dmars(), instead of in a PCI
5452    quirk, because we don't want to print the obnoxious "BIOS broken"
5453    message if VT-d is actually disabled.
5454 */
5455 static void __init check_tylersburg_isoch(void)
5456 {
5457         struct pci_dev *pdev;
5458         uint32_t vtisochctrl;
5459
5460         /* If there's no Azalia in the system anyway, forget it. */
5461         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5462         if (!pdev)
5463                 return;
5464         pci_dev_put(pdev);
5465
5466         /* System Management Registers. Might be hidden, in which case
5467            we can't do the sanity check. But that's OK, because the
5468            known-broken BIOSes _don't_ actually hide it, so far. */
5469         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5470         if (!pdev)
5471                 return;
5472
5473         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5474                 pci_dev_put(pdev);
5475                 return;
5476         }
5477
5478         pci_dev_put(pdev);
5479
5480         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5481         if (vtisochctrl & 1)
5482                 return;
5483
5484         /* Drop all bits other than the number of TLB entries */
5485         vtisochctrl &= 0x1c;
5486
5487         /* If we have the recommended number of TLB entries (16), fine. */
5488         if (vtisochctrl == 0x10)
5489                 return;
5490
5491         /* Zero TLB entries? You get to ride the short bus to school. */
5492         if (!vtisochctrl) {
5493                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5494                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5495                      dmi_get_system_info(DMI_BIOS_VENDOR),
5496                      dmi_get_system_info(DMI_BIOS_VERSION),
5497                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5498                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5499                 return;
5500         }
5501
5502         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5503                vtisochctrl);
5504 }