drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
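/* Azalia is Intel's codename for its HD Audio controllers; see IDENTMAP_AZALIA below. */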
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
62
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
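/*
 * AGAW helpers: an adjusted guest address width value of n corresponds to
 * a page-table walk of (n + 2) levels covering (30 + 9 * n) address bits,
 * e.g. AGAW 2 is a 4-level, 48-bit walk and AGAW 3 a 5-level, 57-bit walk.
 */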
69 static inline int agaw_to_level(int agaw)
70 {
71 return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96 return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101 return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must never be larger than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
117 {
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
121 {
122 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
123 }
124 static inline unsigned long page_to_dma_pfn(struct page *pg)
125 {
126 return mm_to_dma_pfn_start(page_to_pfn(pg));
127 }
128 static inline unsigned long virt_to_dma_pfn(void *p)
129 {
130 return page_to_dma_pfn(virt_to_page(p));
131 }
132
133 static void __init check_tylersburg_isoch(void);
134 static int rwbf_quirk;
135
136 /*
137 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
138 * (used when the kernel is launched with TXT).
139 */
140 static int force_on = 0;
141 static int intel_iommu_tboot_noforce;
142 static int no_platform_optin;
143
144 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
145
146 /*
147 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
148 * if marked present.
149 */
150 static phys_addr_t root_entry_lctp(struct root_entry *re)
151 {
152 if (!(re->lo & 1))
153 return 0;
154
155 return re->lo & VTD_PAGE_MASK;
156 }
157
158 /*
159 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
160 * if marked present.
161 */
162 static phys_addr_t root_entry_uctp(struct root_entry *re)
163 {
164 if (!(re->hi & 1))
165 return 0;
166
167 return re->hi & VTD_PAGE_MASK;
168 }
169
170 static inline void context_set_present(struct context_entry *context)
171 {
172 context->lo |= 1;
173 }
174
175 static inline void context_set_fault_enable(struct context_entry *context)
176 {
177 context->lo &= (((u64)-1) << 2) | 1;
178 }
179
180 static inline void context_set_translation_type(struct context_entry *context,
181 unsigned long value)
182 {
183 context->lo &= (((u64)-1) << 4) | 3;
184 context->lo |= (value & 3) << 2;
185 }
186
187 static inline void context_set_address_root(struct context_entry *context,
188 unsigned long value)
189 {
190 context->lo &= ~VTD_PAGE_MASK;
191 context->lo |= value & VTD_PAGE_MASK;
192 }
193
194 static inline void context_set_address_width(struct context_entry *context,
195 unsigned long value)
196 {
197 context->hi |= value & 7;
198 }
199
200 static inline void context_set_domain_id(struct context_entry *context,
201 unsigned long value)
202 {
203 context->hi |= (value & ((1 << 16) - 1)) << 8;
204 }
205
206 static inline void context_set_pasid(struct context_entry *context)
207 {
208 context->lo |= CONTEXT_PASIDE;
209 }
210
211 static inline int context_domain_id(struct context_entry *c)
212 {
213 return((c->hi >> 8) & 0xffff);
214 }
215
216 static inline void context_clear_entry(struct context_entry *context)
217 {
218 context->lo = 0;
219 context->hi = 0;
220 }
221
222 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 {
224 if (!iommu->copied_tables)
225 return false;
226
227 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
228 }
229
230 static inline void
231 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
232 {
233 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
234 }
235
236 static inline void
237 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
238 {
239 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
240 }
241
242 /*
243 * This domain is a static identity mapping domain.
244 * 1. This domain creates a static 1:1 mapping of all usable memory.
245 * 2. It maps to each iommu if successful.
246 * 3. Each iommu maps to this domain if successful.
247 */
248 static struct dmar_domain *si_domain;
249 static int hw_pass_through = 1;
250
251 struct dmar_rmrr_unit {
252 struct list_head list; /* list of rmrr units */
253 struct acpi_dmar_header *hdr; /* ACPI header */
254 u64 base_address; /* reserved base address*/
255 u64 end_address; /* reserved end address */
256 struct dmar_dev_scope *devices; /* target devices */
257 int devices_cnt; /* target device count */
258 };
259
260 struct dmar_atsr_unit {
261 struct list_head list; /* list of ATSR units */
262 struct acpi_dmar_header *hdr; /* ACPI header */
263 struct dmar_dev_scope *devices; /* target devices */
264 int devices_cnt; /* target device count */
265 u8 include_all:1; /* include all ports */
266 };
267
268 struct dmar_satc_unit {
269 struct list_head list; /* list of SATC units */
270 struct acpi_dmar_header *hdr; /* ACPI header */
271 struct dmar_dev_scope *devices; /* target devices */
272 struct intel_iommu *iommu; /* the corresponding iommu */
273 int devices_cnt; /* target device count */
274 u8 atc_required:1; /* ATS is required */
275 };
276
277 static LIST_HEAD(dmar_atsr_units);
278 static LIST_HEAD(dmar_rmrr_units);
279 static LIST_HEAD(dmar_satc_units);
280
281 #define for_each_rmrr_units(rmrr) \
282 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
283
284 static void device_block_translation(struct device *dev);
285 static void intel_iommu_domain_free(struct iommu_domain *domain);
286
287 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
288 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
289
290 int intel_iommu_enabled = 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
292
293 static int dmar_map_gfx = 1;
294 static int intel_iommu_superpage = 1;
295 static int iommu_identity_mapping;
296 static int iommu_skip_te_disable;
297
298 #define IDENTMAP_GFX 2
299 #define IDENTMAP_AZALIA 4
300
301 const struct iommu_ops intel_iommu_ops;
302
303 static bool translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
306 }
307
308 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
309 {
310 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
311 }
312
313 static void init_translation_status(struct intel_iommu *iommu)
314 {
315 u32 gsts;
316
317 gsts = readl(iommu->reg + DMAR_GSTS_REG);
318 if (gsts & DMA_GSTS_TES)
319 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
320 }
321
322 static int __init intel_iommu_setup(char *str)
323 {
324 if (!str)
325 return -EINVAL;
326
327 while (*str) {
328 if (!strncmp(str, "on", 2)) {
329 dmar_disabled = 0;
330 pr_info("IOMMU enabled\n");
331 } else if (!strncmp(str, "off", 3)) {
332 dmar_disabled = 1;
333 no_platform_optin = 1;
334 pr_info("IOMMU disabled\n");
335 } else if (!strncmp(str, "igfx_off", 8)) {
336 dmar_map_gfx = 0;
337 pr_info("Disable GFX device mapping\n");
338 } else if (!strncmp(str, "forcedac", 8)) {
339 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
340 iommu_dma_forcedac = true;
341 } else if (!strncmp(str, "strict", 6)) {
342 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
343 iommu_set_dma_strict();
344 } else if (!strncmp(str, "sp_off", 6)) {
345 pr_info("Disable supported super page\n");
346 intel_iommu_superpage = 0;
347 } else if (!strncmp(str, "sm_on", 5)) {
348 pr_info("Enable scalable mode if hardware supports\n");
349 intel_iommu_sm = 1;
350 } else if (!strncmp(str, "sm_off", 6)) {
351 pr_info("Scalable mode is disallowed\n");
352 intel_iommu_sm = 0;
353 } else if (!strncmp(str, "tboot_noforce", 13)) {
354 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
355 intel_iommu_tboot_noforce = 1;
356 } else {
357 pr_notice("Unknown option - '%s'\n", str);
358 }
359
360 str += strcspn(str, ",");
361 while (*str == ',')
362 str++;
363 }
364
365 return 1;
366 }
367 __setup("intel_iommu=", intel_iommu_setup);
368
369 void *alloc_pgtable_page(int node, gfp_t gfp)
370 {
371 struct page *page;
372 void *vaddr = NULL;
373
374 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
375 if (page)
376 vaddr = page_address(page);
377 return vaddr;
378 }
379
380 void free_pgtable_page(void *vaddr)
381 {
382 free_page((unsigned long)vaddr);
383 }
384
385 static inline int domain_type_is_si(struct dmar_domain *domain)
386 {
387 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
388 }
389
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 unsigned long pfn)
392 {
393 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394
395 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397
398 /*
399 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401 * the returned SAGAW.
402 */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 unsigned long fl_sagaw, sl_sagaw;
406
407 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 sl_sagaw = cap_sagaw(iommu->cap);
409
410 /* Second level only. */
411 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 return sl_sagaw;
413
414 /* First level only. */
415 if (!ecap_slts(iommu->ecap))
416 return fl_sagaw;
417
418 return fl_sagaw & sl_sagaw;
419 }
420
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 unsigned long sagaw;
424 int agaw;
425
426 sagaw = __iommu_calculate_sagaw(iommu);
427 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 if (test_bit(agaw, &sagaw))
429 break;
430 }
431
432 return agaw;
433 }
434
435 /*
436 * Calculate max SAGAW for each iommu.
437 */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442
443 /*
444 * Calculate agaw for each iommu.
445 * "SAGAW" may differ across iommus, so use a default agaw and fall back
446 * to a smaller supported agaw for iommus that don't support the default.
447 */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 return sm_supported(iommu) ?
456 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 struct iommu_domain_info *info;
462 struct dmar_drhd_unit *drhd;
463 struct intel_iommu *iommu;
464 bool found = false;
465 unsigned long i;
466
467 domain->iommu_coherency = true;
468 xa_for_each(&domain->iommu_array, i, info) {
469 found = true;
470 if (!iommu_paging_structure_coherency(info->iommu)) {
471 domain->iommu_coherency = false;
472 break;
473 }
474 }
475 if (found)
476 return;
477
478 /* No hardware attached; use lowest common denominator */
479 rcu_read_lock();
480 for_each_active_iommu(iommu, drhd) {
481 if (!iommu_paging_structure_coherency(iommu)) {
482 domain->iommu_coherency = false;
483 break;
484 }
485 }
486 rcu_read_unlock();
487 }
488
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 struct intel_iommu *skip)
491 {
492 struct dmar_drhd_unit *drhd;
493 struct intel_iommu *iommu;
494 int mask = 0x3;
495
496 if (!intel_iommu_superpage)
497 return 0;
498
499 /* set iommu_superpage to the smallest common denominator */
500 rcu_read_lock();
501 for_each_active_iommu(iommu, drhd) {
502 if (iommu != skip) {
503 if (domain && domain->use_first_level) {
504 if (!cap_fl1gp_support(iommu->cap))
505 mask = 0x1;
506 } else {
507 mask &= cap_super_page_val(iommu->cap);
508 }
509
510 if (!mask)
511 break;
512 }
513 }
514 rcu_read_unlock();
515
516 return fls(mask);
517 }
518
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 struct device_domain_info *info;
522 int nid = NUMA_NO_NODE;
523 unsigned long flags;
524
525 spin_lock_irqsave(&domain->lock, flags);
526 list_for_each_entry(info, &domain->devices, link) {
527 /*
528 * There could possibly be multiple device numa nodes as devices
529 * within the same domain may sit behind different IOMMUs. There
530 * is no perfect answer in such a situation, so we pick the first
531 * node we find (first come, first served).
532 */
533 nid = dev_to_node(info->dev);
534 if (nid != NUMA_NO_NODE)
535 break;
536 }
537 spin_unlock_irqrestore(&domain->lock, flags);
538
539 return nid;
540 }
541
542 static void domain_update_iotlb(struct dmar_domain *domain);
543
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 unsigned long bitmap = 0;
548
549 /*
550 * 1-level super page supports page size of 2MiB, 2-level super page
551 * supports page size of both 2MiB and 1GiB.
552 */
553 if (domain->iommu_superpage == 1)
554 bitmap |= SZ_2M;
555 else if (domain->iommu_superpage == 2)
556 bitmap |= SZ_2M | SZ_1G;
557
558 return bitmap;
559 }
560
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 domain_update_iommu_coherency(domain);
565 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566
567 /*
568 * If RHSA is missing, we should default to the device's NUMA node
569 * as a fallback.
570 */
571 if (domain->nid == NUMA_NO_NODE)
572 domain->nid = domain_update_device_node(domain);
573
574 /*
575 * First-level translation restricts the input-address to a
576 * canonical address (i.e., address bits 63:N have the same
577 * value as address bit [N-1], where N is 48-bits with 4-level
578 * paging and 57-bits with 5-level paging). Hence, skip bit
579 * [N-1].
580 */
581 if (domain->use_first_level)
582 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 else
584 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585
586 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 domain_update_iotlb(domain);
588 }
589
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 u8 devfn, int alloc)
592 {
593 struct root_entry *root = &iommu->root_entry[bus];
594 struct context_entry *context;
595 u64 *entry;
596
597 /*
598 * Unless the caller requested to allocate a new entry,
599 * returning a copied context entry makes no sense.
600 */
601 if (!alloc && context_copied(iommu, bus, devfn))
602 return NULL;
603
604 entry = &root->lo;
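/*
 * In scalable mode a context entry is 256 bits wide, so a 4KB context
 * table holds only 128 of them. The root entry therefore carries two
 * table pointers (lo for devfn 0-127, hi for 128-255), and devfn is
 * doubled to index pairs of 128-bit context_entry slots.
 */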
605 if (sm_supported(iommu)) {
606 if (devfn >= 0x80) {
607 devfn -= 0x80;
608 entry = &root->hi;
609 }
610 devfn *= 2;
611 }
612 if (*entry & 1)
613 context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 else {
615 unsigned long phy_addr;
616 if (!alloc)
617 return NULL;
618
619 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
620 if (!context)
621 return NULL;
622
623 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 phy_addr = virt_to_phys((void *)context);
625 *entry = phy_addr | 1;
626 __iommu_flush_cache(iommu, entry, sizeof(*entry));
627 }
628 return &context[devfn];
629 }
630
631 /**
632 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633 * sub-hierarchy of a candidate PCI-PCI bridge
634 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635 * @bridge: the candidate PCI-PCI bridge
636 *
637 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638 */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 struct pci_dev *pdev, *pbridge;
643
644 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 return false;
646
647 pdev = to_pci_dev(dev);
648 pbridge = to_pci_dev(bridge);
649
650 if (pbridge->subordinate &&
651 pbridge->subordinate->number <= pdev->bus->number &&
652 pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 return true;
654
655 return false;
656 }
657
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 struct dmar_drhd_unit *drhd;
661 u32 vtbar;
662 int rc;
663
664 /* We know that this device on this chipset has its own IOMMU.
665 * If we find it under a different IOMMU, then the BIOS is lying
666 * to us. Hope that the IOMMU for this device is actually
667 * disabled, and it needs no translation...
668 */
669 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 if (rc) {
671 /* "can't" happen */
672 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 return false;
674 }
675 vtbar &= 0xffff0000;
676
677 /* we know that this iommu should be at offset 0xa000 from vtbar */
678 drhd = dmar_find_matched_drhd_unit(pdev);
679 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 return true;
683 }
684
685 return false;
686 }
687
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 if (!iommu || iommu->drhd->ignored)
691 return true;
692
693 if (dev_is_pci(dev)) {
694 struct pci_dev *pdev = to_pci_dev(dev);
695
696 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 quirk_ioat_snb_local_iommu(pdev))
699 return true;
700 }
701
702 return false;
703 }
704
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 struct dmar_drhd_unit *drhd = NULL;
708 struct pci_dev *pdev = NULL;
709 struct intel_iommu *iommu;
710 struct device *tmp;
711 u16 segment = 0;
712 int i;
713
714 if (!dev)
715 return NULL;
716
717 if (dev_is_pci(dev)) {
718 struct pci_dev *pf_pdev;
719
720 pdev = pci_real_dma_dev(to_pci_dev(dev));
721
722 /* VFs aren't listed in scope tables; we need to look up
723 * the PF instead to find the IOMMU. */
724 pf_pdev = pci_physfn(pdev);
725 dev = &pf_pdev->dev;
726 segment = pci_domain_nr(pdev->bus);
727 } else if (has_acpi_companion(dev))
728 dev = &ACPI_COMPANION(dev)->dev;
729
730 rcu_read_lock();
731 for_each_iommu(iommu, drhd) {
732 if (pdev && segment != drhd->segment)
733 continue;
734
735 for_each_active_dev_scope(drhd->devices,
736 drhd->devices_cnt, i, tmp) {
737 if (tmp == dev) {
738 /* For a VF use its original BDF# not that of the PF
739 * which we used for the IOMMU lookup. Strictly speaking
740 * we could do this for all PCI devices; we only need to
741 * get the BDF# from the scope table for ACPI matches. */
742 if (pdev && pdev->is_virtfn)
743 goto got_pdev;
744
745 if (bus && devfn) {
746 *bus = drhd->devices[i].bus;
747 *devfn = drhd->devices[i].devfn;
748 }
749 goto out;
750 }
751
752 if (is_downstream_to_pci_bridge(dev, tmp))
753 goto got_pdev;
754 }
755
756 if (pdev && drhd->include_all) {
757 got_pdev:
758 if (bus && devfn) {
759 *bus = pdev->bus->number;
760 *devfn = pdev->devfn;
761 }
762 goto out;
763 }
764 }
765 iommu = NULL;
766 out:
767 if (iommu_is_dummy(iommu, dev))
768 iommu = NULL;
769
770 rcu_read_unlock();
771
772 return iommu;
773 }
774
775 static void domain_flush_cache(struct dmar_domain *domain,
776 void *addr, int size)
777 {
778 if (!domain->iommu_coherency)
779 clflush_cache_range(addr, size);
780 }
781
782 static void free_context_table(struct intel_iommu *iommu)
783 {
784 struct context_entry *context;
785 int i;
786
787 if (!iommu->root_entry)
788 return;
789
790 for (i = 0; i < ROOT_ENTRY_NR; i++) {
791 context = iommu_context_addr(iommu, i, 0, 0);
792 if (context)
793 free_pgtable_page(context);
794
795 if (!sm_supported(iommu))
796 continue;
797
798 context = iommu_context_addr(iommu, i, 0x80, 0);
799 if (context)
800 free_pgtable_page(context);
801 }
802
803 free_pgtable_page(iommu->root_entry);
804 iommu->root_entry = NULL;
805 }
806
807 #ifdef CONFIG_DMAR_DEBUG
808 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
809 u8 bus, u8 devfn, struct dma_pte *parent, int level)
810 {
811 struct dma_pte *pte;
812 int offset;
813
814 while (1) {
815 offset = pfn_level_offset(pfn, level);
816 pte = &parent[offset];
817 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
818 pr_info("PTE not present at level %d\n", level);
819 break;
820 }
821
822 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
823
824 if (level == 1)
825 break;
826
827 parent = phys_to_virt(dma_pte_addr(pte));
828 level--;
829 }
830 }
831
832 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
833 unsigned long long addr, u32 pasid)
834 {
835 struct pasid_dir_entry *dir, *pde;
836 struct pasid_entry *entries, *pte;
837 struct context_entry *ctx_entry;
838 struct root_entry *rt_entry;
839 int i, dir_index, index, level;
840 u8 devfn = source_id & 0xff;
841 u8 bus = source_id >> 8;
842 struct dma_pte *pgtable;
843
844 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
845
846 /* root entry dump */
847 rt_entry = &iommu->root_entry[bus];
848 if (!rt_entry) {
849 pr_info("root table entry is not present\n");
850 return;
851 }
852
853 if (sm_supported(iommu))
854 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
855 rt_entry->hi, rt_entry->lo);
856 else
857 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
858
859 /* context entry dump */
860 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
861 if (!ctx_entry) {
862 pr_info("context table entry is not present\n");
863 return;
864 }
865
866 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
867 ctx_entry->hi, ctx_entry->lo);
868
869 /* legacy mode does not require PASID entries */
870 if (!sm_supported(iommu)) {
871 level = agaw_to_level(ctx_entry->hi & 7);
872 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
873 goto pgtable_walk;
874 }
875
876 /* get the pointer to pasid directory entry */
877 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
878 if (!dir) {
879 pr_info("pasid directory entry is not present\n");
880 return;
881 }
882 /* For request-without-pasid, get the pasid from context entry */
883 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
884 pasid = IOMMU_NO_PASID;
885
886 dir_index = pasid >> PASID_PDE_SHIFT;
887 pde = &dir[dir_index];
888 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
889
890 /* get the pointer to the pasid table entry */
891 entries = get_pasid_table_from_pde(pde);
892 if (!entries) {
893 pr_info("pasid table entry is not present\n");
894 return;
895 }
896 index = pasid & PASID_PTE_MASK;
897 pte = &entries[index];
898 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
899 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
900
901 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
902 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
903 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
904 } else {
905 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
906 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
907 }
908
909 pgtable_walk:
910 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
911 }
912 #endif
913
914 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
915 unsigned long pfn, int *target_level,
916 gfp_t gfp)
917 {
918 struct dma_pte *parent, *pte;
919 int level = agaw_to_level(domain->agaw);
920 int offset;
921
922 if (!domain_pfn_supported(domain, pfn))
923 /* Address beyond IOMMU's addressing capabilities. */
924 return NULL;
925
926 parent = domain->pgd;
927
928 while (1) {
929 void *tmp_page;
930
931 offset = pfn_level_offset(pfn, level);
932 pte = &parent[offset];
933 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
934 break;
935 if (level == *target_level)
936 break;
937
938 if (!dma_pte_present(pte)) {
939 uint64_t pteval;
940
941 tmp_page = alloc_pgtable_page(domain->nid, gfp);
942
943 if (!tmp_page)
944 return NULL;
945
946 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
947 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
948 if (domain->use_first_level)
949 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
950
951 if (cmpxchg64(&pte->val, 0ULL, pteval))
952 /* Someone else set it while we were thinking; use theirs. */
953 free_pgtable_page(tmp_page);
954 else
955 domain_flush_cache(domain, pte, sizeof(*pte));
956 }
957 if (level == 1)
958 break;
959
960 parent = phys_to_virt(dma_pte_addr(pte));
961 level--;
962 }
963
964 if (!*target_level)
965 *target_level = level;
966
967 return pte;
968 }
969
970 /* return the pte of the address at a specific level */
971 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
972 unsigned long pfn,
973 int level, int *large_page)
974 {
975 struct dma_pte *parent, *pte;
976 int total = agaw_to_level(domain->agaw);
977 int offset;
978
979 parent = domain->pgd;
980 while (level <= total) {
981 offset = pfn_level_offset(pfn, total);
982 pte = &parent[offset];
983 if (level == total)
984 return pte;
985
986 if (!dma_pte_present(pte)) {
987 *large_page = total;
988 break;
989 }
990
991 if (dma_pte_superpage(pte)) {
992 *large_page = total;
993 return pte;
994 }
995
996 parent = phys_to_virt(dma_pte_addr(pte));
997 total--;
998 }
999 return NULL;
1000 }
1001
1002 /* clear last level pte; a tlb flush must follow */
1003 static void dma_pte_clear_range(struct dmar_domain *domain,
1004 unsigned long start_pfn,
1005 unsigned long last_pfn)
1006 {
1007 unsigned int large_page;
1008 struct dma_pte *first_pte, *pte;
1009
1010 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1011 WARN_ON(start_pfn > last_pfn))
1012 return;
1013
1014 /* we don't need lock here; nobody else touches the iova range */
1015 do {
1016 large_page = 1;
1017 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1018 if (!pte) {
1019 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1020 continue;
1021 }
1022 do {
1023 dma_clear_pte(pte);
1024 start_pfn += lvl_to_nr_pages(large_page);
1025 pte++;
1026 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1027
1028 domain_flush_cache(domain, first_pte,
1029 (void *)pte - (void *)first_pte);
1030
1031 } while (start_pfn && start_pfn <= last_pfn);
1032 }
1033
1034 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1035 int retain_level, struct dma_pte *pte,
1036 unsigned long pfn, unsigned long start_pfn,
1037 unsigned long last_pfn)
1038 {
1039 pfn = max(start_pfn, pfn);
1040 pte = &pte[pfn_level_offset(pfn, level)];
1041
1042 do {
1043 unsigned long level_pfn;
1044 struct dma_pte *level_pte;
1045
1046 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1047 goto next;
1048
1049 level_pfn = pfn & level_mask(level);
1050 level_pte = phys_to_virt(dma_pte_addr(pte));
1051
1052 if (level > 2) {
1053 dma_pte_free_level(domain, level - 1, retain_level,
1054 level_pte, level_pfn, start_pfn,
1055 last_pfn);
1056 }
1057
1058 /*
1059 * Free the page table if we're below the level we want to
1060 * retain and the range covers the entire table.
1061 */
1062 if (level < retain_level && !(start_pfn > level_pfn ||
1063 last_pfn < level_pfn + level_size(level) - 1)) {
1064 dma_clear_pte(pte);
1065 domain_flush_cache(domain, pte, sizeof(*pte));
1066 free_pgtable_page(level_pte);
1067 }
1068 next:
1069 pfn += level_size(level);
1070 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1071 }
1072
1073 /*
1074 * clear last level (leaf) ptes and free page table pages below the
1075 * level we wish to keep intact.
1076 */
1077 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1078 unsigned long start_pfn,
1079 unsigned long last_pfn,
1080 int retain_level)
1081 {
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1087
1088 /* free pgd */
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1091 domain->pgd = NULL;
1092 }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100 be freed. */
1101 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct list_head *freelist)
1104 {
1105 struct page *pg;
1106
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 list_add_tail(&pg->lru, freelist);
1109
1110 if (level == 1)
1111 return;
1112
1113 pte = page_address(pg);
1114 do {
1115 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1116 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1117 pte++;
1118 } while (!first_pte_in_page(pte));
1119 }
1120
1121 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1122 struct dma_pte *pte, unsigned long pfn,
1123 unsigned long start_pfn, unsigned long last_pfn,
1124 struct list_head *freelist)
1125 {
1126 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1127
1128 pfn = max(start_pfn, pfn);
1129 pte = &pte[pfn_level_offset(pfn, level)];
1130
1131 do {
1132 unsigned long level_pfn = pfn & level_mask(level);
1133
1134 if (!dma_pte_present(pte))
1135 goto next;
1136
1137 /* If range covers entire pagetable, free it */
1138 if (start_pfn <= level_pfn &&
1139 last_pfn >= level_pfn + level_size(level) - 1) {
1140 /* These subordinate page tables are going away entirely. Don't
1141 bother to clear them; we're just going to *free* them. */
1142 if (level > 1 && !dma_pte_superpage(pte))
1143 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1144
1145 dma_clear_pte(pte);
1146 if (!first_pte)
1147 first_pte = pte;
1148 last_pte = pte;
1149 } else if (level > 1) {
1150 /* Recurse down into a level that isn't *entirely* obsolete */
1151 dma_pte_clear_level(domain, level - 1,
1152 phys_to_virt(dma_pte_addr(pte)),
1153 level_pfn, start_pfn, last_pfn,
1154 freelist);
1155 }
1156 next:
1157 pfn = level_pfn + level_size(level);
1158 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1159
1160 if (first_pte)
1161 domain_flush_cache(domain, first_pte,
1162 (void *)++last_pte - (void *)first_pte);
1163 }
1164
1165 /* We can't just free the pages because the IOMMU may still be walking
1166 the page tables, and may have cached the intermediate levels. The
1167 pages can only be freed after the IOTLB flush has been done. */
1168 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1169 unsigned long last_pfn, struct list_head *freelist)
1170 {
1171 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1172 WARN_ON(start_pfn > last_pfn))
1173 return;
1174
1175 /* we don't need lock here; nobody else touches the iova range */
1176 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1177 domain->pgd, 0, start_pfn, last_pfn, freelist);
1178
1179 /* free pgd */
1180 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1181 struct page *pgd_page = virt_to_page(domain->pgd);
1182 list_add_tail(&pgd_page->lru, freelist);
1183 domain->pgd = NULL;
1184 }
1185 }
1186
1187 /* iommu handling */
1188 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1189 {
1190 struct root_entry *root;
1191
1192 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1193 if (!root) {
1194 pr_err("Allocating root entry for %s failed\n",
1195 iommu->name);
1196 return -ENOMEM;
1197 }
1198
1199 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1200 iommu->root_entry = root;
1201
1202 return 0;
1203 }
1204
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 {
1207 u64 addr;
1208 u32 sts;
1209 unsigned long flag;
1210
1211 addr = virt_to_phys(iommu->root_entry);
1212 if (sm_supported(iommu))
1213 addr |= DMA_RTADDR_SMT;
1214
1215 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1217
1218 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1219
1220 /* Make sure hardware completes it */
1221 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222 readl, (sts & DMA_GSTS_RTPS), sts);
1223
1224 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225
1226 /*
1227 * Hardware invalidates all DMA remapping hardware translation
1228 * caches as part of SRTP flow.
1229 */
1230 if (cap_esrtps(iommu->cap))
1231 return;
1232
1233 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1234 if (sm_supported(iommu))
1235 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1236 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1237 }
1238
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1240 {
1241 u32 val;
1242 unsigned long flag;
1243
1244 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 return;
1246
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1249
1250 /* Make sure hardware completes it */
1251 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252 readl, (!(val & DMA_GSTS_WBFS)), val);
1253
1254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1255 }
1256
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259 u16 did, u16 source_id, u8 function_mask,
1260 u64 type)
1261 {
1262 u64 val = 0;
1263 unsigned long flag;
1264
1265 switch (type) {
1266 case DMA_CCMD_GLOBAL_INVL:
1267 val = DMA_CCMD_GLOBAL_INVL;
1268 break;
1269 case DMA_CCMD_DOMAIN_INVL:
1270 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271 break;
1272 case DMA_CCMD_DEVICE_INVL:
1273 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1275 break;
1276 default:
1277 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1278 iommu->name, type);
1279 return;
1280 }
1281 val |= DMA_CCMD_ICC;
1282
1283 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1284 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1285
1286 /* Make sure hardware completes it */
1287 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1288 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1289
1290 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 }
1292
1293 /* return value determines if we need a write buffer flush */
1294 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1295 u64 addr, unsigned int size_order, u64 type)
1296 {
1297 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1298 u64 val = 0, val_iva = 0;
1299 unsigned long flag;
1300
1301 switch (type) {
1302 case DMA_TLB_GLOBAL_FLUSH:
1303 /* a global flush doesn't need to set IVA_REG */
1304 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1305 break;
1306 case DMA_TLB_DSI_FLUSH:
1307 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1308 break;
1309 case DMA_TLB_PSI_FLUSH:
1310 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1311 /* IH bit is passed in as part of address */
1312 val_iva = size_order | addr;
1313 break;
1314 default:
1315 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1316 iommu->name, type);
1317 return;
1318 }
1319
1320 if (cap_write_drain(iommu->cap))
1321 val |= DMA_TLB_WRITE_DRAIN;
1322
1323 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1324 /* Note: Only uses first TLB reg currently */
1325 if (val_iva)
1326 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1327 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1328
1329 /* Make sure hardware completes it */
1330 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1331 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1332
1333 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1334
1335 /* check IOTLB invalidation granularity */
1336 if (DMA_TLB_IAIG(val) == 0)
1337 pr_err("Flush IOTLB failed\n");
1338 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1339 pr_debug("TLB flush request %Lx, actual %Lx\n",
1340 (unsigned long long)DMA_TLB_IIRG(type),
1341 (unsigned long long)DMA_TLB_IAIG(val));
1342 }
1343
1344 static struct device_domain_info *
1345 domain_lookup_dev_info(struct dmar_domain *domain,
1346 struct intel_iommu *iommu, u8 bus, u8 devfn)
1347 {
1348 struct device_domain_info *info;
1349 unsigned long flags;
1350
1351 spin_lock_irqsave(&domain->lock, flags);
1352 list_for_each_entry(info, &domain->devices, link) {
1353 if (info->iommu == iommu && info->bus == bus &&
1354 info->devfn == devfn) {
1355 spin_unlock_irqrestore(&domain->lock, flags);
1356 return info;
1357 }
1358 }
1359 spin_unlock_irqrestore(&domain->lock, flags);
1360
1361 return NULL;
1362 }
1363
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1365 {
1366 struct dev_pasid_info *dev_pasid;
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378
1379 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1380 info = dev_iommu_priv_get(dev_pasid->dev);
1381 if (info->ats_enabled) {
1382 has_iotlb_device = true;
1383 break;
1384 }
1385 }
1386 domain->has_iotlb_device = has_iotlb_device;
1387 spin_unlock_irqrestore(&domain->lock, flags);
1388 }
1389
1390 /*
1391 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1392 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1393 * check because it applies only to the built-in QAT devices and it doesn't
1394 * grant additional privileges.
1395 */
1396 #define BUGGY_QAT_DEVID_MASK 0x4940
1397 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1398 {
1399 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1400 return false;
1401
1402 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1403 return false;
1404
1405 return true;
1406 }
1407
1408 static void iommu_enable_pci_caps(struct device_domain_info *info)
1409 {
1410 struct pci_dev *pdev;
1411
1412 if (!dev_is_pci(info->dev))
1413 return;
1414
1415 pdev = to_pci_dev(info->dev);
1416
1417 /* The PCIe spec, in its wisdom, declares that the behaviour of
1418 the device if you enable PASID support after ATS support is
1419 undefined. So always enable PASID support on devices which
1420 have it, even if we can't yet know if we're ever going to
1421 use it. */
1422 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1423 info->pasid_enabled = 1;
1424
1425 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1426 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1427 info->ats_enabled = 1;
1428 domain_update_iotlb(info->domain);
1429 }
1430 }
1431
1432 static void iommu_disable_pci_caps(struct device_domain_info *info)
1433 {
1434 struct pci_dev *pdev;
1435
1436 if (!dev_is_pci(info->dev))
1437 return;
1438
1439 pdev = to_pci_dev(info->dev);
1440
1441 if (info->ats_enabled) {
1442 pci_disable_ats(pdev);
1443 info->ats_enabled = 0;
1444 domain_update_iotlb(info->domain);
1445 }
1446
1447 if (info->pasid_enabled) {
1448 pci_disable_pasid(pdev);
1449 info->pasid_enabled = 0;
1450 }
1451 }
1452
1453 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1454 u64 addr, unsigned int mask)
1455 {
1456 u16 sid, qdep;
1457
1458 if (!info || !info->ats_enabled)
1459 return;
1460
1461 sid = info->bus << 8 | info->devfn;
1462 qdep = info->ats_qdep;
1463 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1464 qdep, addr, mask);
1465 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1466 }
1467
1468 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1469 u64 addr, unsigned mask)
1470 {
1471 struct dev_pasid_info *dev_pasid;
1472 struct device_domain_info *info;
1473 unsigned long flags;
1474
1475 if (!domain->has_iotlb_device)
1476 return;
1477
1478 spin_lock_irqsave(&domain->lock, flags);
1479 list_for_each_entry(info, &domain->devices, link)
1480 __iommu_flush_dev_iotlb(info, addr, mask);
1481
1482 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1483 info = dev_iommu_priv_get(dev_pasid->dev);
1484
1485 if (!info->ats_enabled)
1486 continue;
1487
1488 qi_flush_dev_iotlb_pasid(info->iommu,
1489 PCI_DEVID(info->bus, info->devfn),
1490 info->pfsid, dev_pasid->pasid,
1491 info->ats_qdep, addr,
1492 mask);
1493 }
1494 spin_unlock_irqrestore(&domain->lock, flags);
1495 }
1496
1497 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1498 struct dmar_domain *domain, u64 addr,
1499 unsigned long npages, bool ih)
1500 {
1501 u16 did = domain_id_iommu(domain, iommu);
1502 struct dev_pasid_info *dev_pasid;
1503 unsigned long flags;
1504
1505 spin_lock_irqsave(&domain->lock, flags);
1506 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1507 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1508
1509 if (!list_empty(&domain->devices))
1510 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1511 spin_unlock_irqrestore(&domain->lock, flags);
1512 }
1513
1514 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1515 struct dmar_domain *domain,
1516 unsigned long pfn, unsigned int pages,
1517 int ih, int map)
1518 {
1519 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1520 unsigned int mask = ilog2(aligned_pages);
1521 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1522 u16 did = domain_id_iommu(domain, iommu);
1523
1524 if (WARN_ON(!pages))
1525 return;
1526
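/* The Invalidation Hint (IH) is carried in bit 6 of the address handed to the flush helpers. */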
1527 if (ih)
1528 ih = 1 << 6;
1529
1530 if (domain->use_first_level) {
1531 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1532 } else {
1533 unsigned long bitmask = aligned_pages - 1;
1534
1535 /*
1536 * PSI masks the low order bits of the base address. If the
1537 * address isn't aligned to the mask, then compute a mask value
1538 * needed to ensure the target range is flushed.
1539 */
1540 if (unlikely(bitmask & pfn)) {
1541 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1542
1543 /*
1544 * Since end_pfn <= pfn + bitmask, the only way bits
1545 * higher than bitmask can differ in pfn and end_pfn is
1546 * by carrying. This means after masking out bitmask,
1547 * high bits starting with the first set bit in
1548 * shared_bits are all equal in both pfn and end_pfn.
1549 */
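/*
 * Example: pfn = 9 and pages = 2 give bitmask = 1 and end_pfn = 10;
 * shared_bits = ~(9 ^ 10) & ~1 has its lowest set bit at position 2,
 * so mask becomes 2 and the flush covers pfn 8-11, a superset of 9-10.
 */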
1550 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1551 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1552 }
1553
1554 /*
1555 * Fall back to domain selective flush if no PSI support or
1556 * the size is too big.
1557 */
1558 if (!cap_pgsel_inv(iommu->cap) ||
1559 mask > cap_max_amask_val(iommu->cap))
1560 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1561 DMA_TLB_DSI_FLUSH);
1562 else
1563 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1564 DMA_TLB_PSI_FLUSH);
1565 }
1566
1567 /*
1568 * In caching mode, changes of pages from non-present to present require
1569 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1570 */
1571 if (!cap_caching_mode(iommu->cap) || !map)
1572 iommu_flush_dev_iotlb(domain, addr, mask);
1573 }
1574
1575 /* Notification for newly created mappings */
1576 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1577 struct dmar_domain *domain,
1578 unsigned long pfn, unsigned int pages)
1579 {
1580 /*
1581 * It's a non-present to present mapping. Only flush if caching mode
1582 * and second level.
1583 */
1584 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1585 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1586 else
1587 iommu_flush_write_buffer(iommu);
1588 }
1589
1590 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1591 {
1592 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1593 struct iommu_domain_info *info;
1594 unsigned long idx;
1595
1596 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1597 struct intel_iommu *iommu = info->iommu;
1598 u16 did = domain_id_iommu(dmar_domain, iommu);
1599
1600 if (dmar_domain->use_first_level)
1601 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1602 else
1603 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1604 DMA_TLB_DSI_FLUSH);
1605
1606 if (!cap_caching_mode(iommu->cap))
1607 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1608 }
1609 }
1610
1611 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1612 {
1613 u32 pmen;
1614 unsigned long flags;
1615
1616 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1617 return;
1618
1619 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1621 pmen &= ~DMA_PMEN_EPM;
1622 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1623
1624 /* wait for the protected region status bit to clear */
1625 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1626 readl, !(pmen & DMA_PMEN_PRS), pmen);
1627
1628 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1629 }
1630
1631 static void iommu_enable_translation(struct intel_iommu *iommu)
1632 {
1633 u32 sts;
1634 unsigned long flags;
1635
1636 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1637 iommu->gcmd |= DMA_GCMD_TE;
1638 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1639
1640 /* Make sure hardware completes it */
1641 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1642 readl, (sts & DMA_GSTS_TES), sts);
1643
1644 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1645 }
1646
1647 static void iommu_disable_translation(struct intel_iommu *iommu)
1648 {
1649 u32 sts;
1650 unsigned long flag;
1651
1652 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1653 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1654 return;
1655
1656 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1657 iommu->gcmd &= ~DMA_GCMD_TE;
1658 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1659
1660 /* Make sure hardware completes it */
1661 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1662 readl, (!(sts & DMA_GSTS_TES)), sts);
1663
1664 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1665 }
1666
1667 static int iommu_init_domains(struct intel_iommu *iommu)
1668 {
1669 u32 ndomains;
1670
1671 ndomains = cap_ndoms(iommu->cap);
1672 pr_debug("%s: Number of Domains supported <%d>\n",
1673 iommu->name, ndomains);
1674
1675 spin_lock_init(&iommu->lock);
1676
1677 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1678 if (!iommu->domain_ids)
1679 return -ENOMEM;
1680
1681 /*
1682 * If Caching mode is set, then invalid translations are tagged
1683 * with domain-id 0, hence we need to pre-allocate it. We also
1684 * use domain-id 0 as a marker for non-allocated domain-id, so
1685 * make sure it is not used for a real domain.
1686 */
1687 set_bit(0, iommu->domain_ids);
1688
1689 /*
1690 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1691 * entry for first-level or pass-through translation modes should
1692 * be programmed with a domain id different from those used for
1693 * second-level or nested translation. We reserve a domain id for
1694 * this purpose.
1695 */
1696 if (sm_supported(iommu))
1697 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1698
1699 return 0;
1700 }
1701
1702 static void disable_dmar_iommu(struct intel_iommu *iommu)
1703 {
1704 if (!iommu->domain_ids)
1705 return;
1706
1707 /*
1708 * All iommu domains must have been detached from the devices,
1709 * hence there should be no domain IDs in use.
1710 */
1711 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1712 > NUM_RESERVED_DID))
1713 return;
1714
1715 if (iommu->gcmd & DMA_GCMD_TE)
1716 iommu_disable_translation(iommu);
1717 }
1718
1719 static void free_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721 if (iommu->domain_ids) {
1722 bitmap_free(iommu->domain_ids);
1723 iommu->domain_ids = NULL;
1724 }
1725
1726 if (iommu->copied_tables) {
1727 bitmap_free(iommu->copied_tables);
1728 iommu->copied_tables = NULL;
1729 }
1730
1731 /* free context mapping */
1732 free_context_table(iommu);
1733
1734 #ifdef CONFIG_INTEL_IOMMU_SVM
1735 if (pasid_supported(iommu)) {
1736 if (ecap_prs(iommu->ecap))
1737 intel_svm_finish_prq(iommu);
1738 }
1739 #endif
1740 }
1741
1742 /*
1743 * Check and return whether first level is used by default for
1744 * DMA translation.
1745 */
1746 static bool first_level_by_default(unsigned int type)
1747 {
1748 /* Only SL is available in legacy mode */
1749 if (!scalable_mode_support())
1750 return false;
1751
1752 /* Only one level (either FL or SL) is available, just use it */
1753 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1754 return intel_cap_flts_sanity();
1755
1756 /* Both levels are available, decide it based on domain type */
1757 return type != IOMMU_DOMAIN_UNMANAGED;
1758 }
1759
1760 static struct dmar_domain *alloc_domain(unsigned int type)
1761 {
1762 struct dmar_domain *domain;
1763
1764 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1765 if (!domain)
1766 return NULL;
1767
1768 domain->nid = NUMA_NO_NODE;
1769 if (first_level_by_default(type))
1770 domain->use_first_level = true;
1771 domain->has_iotlb_device = false;
1772 INIT_LIST_HEAD(&domain->devices);
1773 INIT_LIST_HEAD(&domain->dev_pasids);
1774 spin_lock_init(&domain->lock);
1775 xa_init(&domain->iommu_array);
1776
1777 return domain;
1778 }
1779
1780 static int domain_attach_iommu(struct dmar_domain *domain,
1781 struct intel_iommu *iommu)
1782 {
1783 struct iommu_domain_info *info, *curr;
1784 unsigned long ndomains;
1785 int num, ret = -ENOSPC;
1786
1787 info = kzalloc(sizeof(*info), GFP_KERNEL);
1788 if (!info)
1789 return -ENOMEM;
1790
1791 spin_lock(&iommu->lock);
1792 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793 if (curr) {
1794 curr->refcnt++;
1795 spin_unlock(&iommu->lock);
1796 kfree(info);
1797 return 0;
1798 }
1799
1800 ndomains = cap_ndoms(iommu->cap);
1801 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802 if (num >= ndomains) {
1803 pr_err("%s: No free domain ids\n", iommu->name);
1804 goto err_unlock;
1805 }
1806
1807 set_bit(num, iommu->domain_ids);
1808 info->refcnt = 1;
1809 info->did = num;
1810 info->iommu = iommu;
1811 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812 NULL, info, GFP_ATOMIC);
1813 if (curr) {
1814 ret = xa_err(curr) ? : -EBUSY;
1815 goto err_clear;
1816 }
1817 domain_update_iommu_cap(domain);
1818
1819 spin_unlock(&iommu->lock);
1820 return 0;
1821
1822 err_clear:
1823 clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825 spin_unlock(&iommu->lock);
1826 kfree(info);
1827 return ret;
1828 }
1829
1830 static void domain_detach_iommu(struct dmar_domain *domain,
1831 struct intel_iommu *iommu)
1832 {
1833 struct iommu_domain_info *info;
1834
1835 spin_lock(&iommu->lock);
1836 info = xa_load(&domain->iommu_array, iommu->seq_id);
1837 if (--info->refcnt == 0) {
1838 clear_bit(info->did, iommu->domain_ids);
1839 xa_erase(&domain->iommu_array, iommu->seq_id);
1840 domain->nid = NUMA_NO_NODE;
1841 domain_update_iommu_cap(domain);
1842 kfree(info);
1843 }
1844 spin_unlock(&iommu->lock);
1845 }
1846
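/*
 * Round a guest address width up to the next width a page-table walk can
 * express (12 + 9 * n bits), capped at 64; e.g. a 40-bit gaw rounds up to 48.
 */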
1847 static inline int guestwidth_to_adjustwidth(int gaw)
1848 {
1849 int agaw;
1850 int r = (gaw - 12) % 9;
1851
1852 if (r == 0)
1853 agaw = gaw;
1854 else
1855 agaw = gaw + 9 - r;
1856 if (agaw > 64)
1857 agaw = 64;
1858 return agaw;
1859 }
1860
1861 static void domain_exit(struct dmar_domain *domain)
1862 {
1863 if (domain->pgd) {
1864 LIST_HEAD(freelist);
1865
1866 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1867 put_pages_list(&freelist);
1868 }
1869
1870 if (WARN_ON(!list_empty(&domain->devices)))
1871 return;
1872
1873 kfree(domain);
1874 }
1875
1876 /*
1877 * Get the PASID directory size for scalable mode context entry.
1878 * Value of X in the PDTS field of a scalable mode context entry
1879 * indicates PASID directory with 2^(X + 7) entries.
1880 */
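/*
 * For example, assuming a 2^20 PASID space with 64 PASIDs per directory
 * entry (PASID_PDE_SHIFT == 6), max_pde is 2^14; find_first_bit() yields
 * 14 and the function returns 14 - 7 = 7, i.e. 2^14 directory entries.
 */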
1881 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1882 {
1883 unsigned long pds, max_pde;
1884
1885 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1886 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1887 if (pds < 7)
1888 return 0;
1889
1890 return pds - 7;
1891 }
1892
1893 /*
1894 * Set the RID_PASID field of a scalable mode context entry. The
1895 * IOMMU hardware will use the PASID value set in this field for
1896 * DMA translations of DMA requests without PASID.
1897 */
1898 static inline void
1899 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1900 {
1901 context->hi |= pasid & ((1 << 20) - 1);
1902 }
1903
1904 /*
1905 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1906 * entry.
1907 */
1908 static inline void context_set_sm_dte(struct context_entry *context)
1909 {
1910 context->lo |= BIT_ULL(2);
1911 }
1912
1913 /*
1914 * Set the PRE(Page Request Enable) field of a scalable mode context
1915 * entry.
1916 */
1917 static inline void context_set_sm_pre(struct context_entry *context)
1918 {
1919 context->lo |= BIT_ULL(4);
1920 }
1921
1922 /* Convert value to context PASID directory size field coding. */
1923 #define context_pdts(pds) (((pds) & 0x7) << 9)
1924
1925 static int domain_context_mapping_one(struct dmar_domain *domain,
1926 struct intel_iommu *iommu,
1927 struct pasid_table *table,
1928 u8 bus, u8 devfn)
1929 {
1930 struct device_domain_info *info =
1931 domain_lookup_dev_info(domain, iommu, bus, devfn);
1932 u16 did = domain_id_iommu(domain, iommu);
1933 int translation = CONTEXT_TT_MULTI_LEVEL;
1934 struct context_entry *context;
1935 int ret;
1936
1937 if (hw_pass_through && domain_type_is_si(domain))
1938 translation = CONTEXT_TT_PASS_THROUGH;
1939
1940 pr_debug("Set context mapping for %02x:%02x.%d\n",
1941 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1942
1943 spin_lock(&iommu->lock);
1944 ret = -ENOMEM;
1945 context = iommu_context_addr(iommu, bus, devfn, 1);
1946 if (!context)
1947 goto out_unlock;
1948
1949 ret = 0;
1950 if (context_present(context) && !context_copied(iommu, bus, devfn))
1951 goto out_unlock;
1952
1953 /*
1954 * For kdump cases, old valid entries may be cached due to the
1955 * in-flight DMA and copied pgtable, but there is no unmapping
1956 * behaviour for them, thus we need an explicit cache flush for
1957 * the newly-mapped device. For kdump, at this point, the device
1958 * is expected to have finished its reset during driver probe, so no
1959 * in-flight DMA will exist and we don't need to worry about it
1960 * hereafter.
1961 */
1962 if (context_copied(iommu, bus, devfn)) {
1963 u16 did_old = context_domain_id(context);
1964
1965 if (did_old < cap_ndoms(iommu->cap)) {
1966 iommu->flush.flush_context(iommu, did_old,
1967 (((u16)bus) << 8) | devfn,
1968 DMA_CCMD_MASK_NOBIT,
1969 DMA_CCMD_DEVICE_INVL);
1970 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1971 DMA_TLB_DSI_FLUSH);
1972 }
1973
1974 clear_context_copied(iommu, bus, devfn);
1975 }
1976
1977 context_clear_entry(context);
1978
1979 if (sm_supported(iommu)) {
1980 unsigned long pds;
1981
1982 /* Setup the PASID DIR pointer: */
1983 pds = context_get_sm_pds(table);
1984 context->lo = (u64)virt_to_phys(table->table) |
1985 context_pdts(pds);
1986
1987 /* Setup the RID_PASID field: */
1988 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1989
1990 /*
1991 * Setup the Device-TLB enable bit and Page request
1992 * Enable bit:
1993 */
1994 if (info && info->ats_supported)
1995 context_set_sm_dte(context);
1996 if (info && info->pri_supported)
1997 context_set_sm_pre(context);
1998 if (info && info->pasid_supported)
1999 context_set_pasid(context);
2000 } else {
2001 struct dma_pte *pgd = domain->pgd;
2002 int agaw;
2003
2004 context_set_domain_id(context, did);
2005
2006 if (translation != CONTEXT_TT_PASS_THROUGH) {
2007 /*
2008 * Skip top levels of the page tables for an iommu which has
2009 * a smaller agaw than the default. Unnecessary for PT mode.
2010 */
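/*
 * For example, a domain built with a 4-level page table that is
 * attached to an IOMMU supporting only 3-level tables walks down one
 * level here and programs the context entry with the IOMMU's
 * (smaller) agaw.
 */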
2011 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2012 ret = -ENOMEM;
2013 pgd = phys_to_virt(dma_pte_addr(pgd));
2014 if (!dma_pte_present(pgd))
2015 goto out_unlock;
2016 }
2017
2018 if (info && info->ats_supported)
2019 translation = CONTEXT_TT_DEV_IOTLB;
2020 else
2021 translation = CONTEXT_TT_MULTI_LEVEL;
2022
2023 context_set_address_root(context, virt_to_phys(pgd));
2024 context_set_address_width(context, agaw);
2025 } else {
2026 /*
2027 * In pass through mode, AW must be programmed to
2028 * indicate the largest AGAW value supported by
2029 * hardware. And ASR is ignored by hardware.
2030 */
2031 context_set_address_width(context, iommu->msagaw);
2032 }
2033
2034 context_set_translation_type(context, translation);
2035 }
2036
2037 context_set_fault_enable(context);
2038 context_set_present(context);
2039 if (!ecap_coherent(iommu->ecap))
2040 clflush_cache_range(context, sizeof(*context));
2041
2042 /*
2043 * It's a non-present to present mapping. If hardware doesn't cache
2044 * non-present entries we only need to flush the write-buffer. If it
2045 * _does_ cache non-present entries, then it does so in the special
2046 * domain #0, which we have to flush:
2047 */
2048 if (cap_caching_mode(iommu->cap)) {
2049 iommu->flush.flush_context(iommu, 0,
2050 (((u16)bus) << 8) | devfn,
2051 DMA_CCMD_MASK_NOBIT,
2052 DMA_CCMD_DEVICE_INVL);
2053 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2054 } else {
2055 iommu_flush_write_buffer(iommu);
2056 }
2057
2058 ret = 0;
2059
2060 out_unlock:
2061 spin_unlock(&iommu->lock);
2062
2063 return ret;
2064 }
2065
2066 struct domain_context_mapping_data {
2067 struct dmar_domain *domain;
2068 struct intel_iommu *iommu;
2069 struct pasid_table *table;
2070 };
2071
2072 static int domain_context_mapping_cb(struct pci_dev *pdev,
2073 u16 alias, void *opaque)
2074 {
2075 struct domain_context_mapping_data *data = opaque;
2076
2077 return domain_context_mapping_one(data->domain, data->iommu,
2078 data->table, PCI_BUS_NUM(alias),
2079 alias & 0xff);
2080 }
2081
2082 static int
2083 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2084 {
2085 struct domain_context_mapping_data data;
2086 struct pasid_table *table;
2087 struct intel_iommu *iommu;
2088 u8 bus, devfn;
2089
2090 iommu = device_to_iommu(dev, &bus, &devfn);
2091 if (!iommu)
2092 return -ENODEV;
2093
2094 table = intel_pasid_get_table(dev);
2095
2096 if (!dev_is_pci(dev))
2097 return domain_context_mapping_one(domain, iommu, table,
2098 bus, devfn);
2099
2100 data.domain = domain;
2101 data.iommu = iommu;
2102 data.table = table;
2103
2104 return pci_for_each_dma_alias(to_pci_dev(dev),
2105 &domain_context_mapping_cb, &data);
2106 }
2107
2108 /* Returns a number of VTD pages, but aligned to MM page size */
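/*
 * A small worked example (assuming 4KiB MM pages and 4KiB VT-d pages):
 * host_addr = 0x1234 and size = 0x2000 touch the host range
 * [0x1000, 0x4000), so PAGE_ALIGN(0x234 + 0x2000) >> VTD_PAGE_SHIFT
 * yields 3 pages.
 */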
2109 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2110 size_t size)
2111 {
2112 host_addr &= ~PAGE_MASK;
2113 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2114 }
2115
2116 /* Return largest possible superpage level for a given mapping */
2117 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2118 unsigned long iov_pfn,
2119 unsigned long phy_pfn,
2120 unsigned long pages)
2121 {
2122 int support, level = 1;
2123 unsigned long pfnmerge;
2124
2125 support = domain->iommu_superpage;
2126
2127 /* To use a large page, the virtual *and* physical addresses
2128 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2129 of them will mean we have to use smaller pages. So just
2130 merge them and check both at once. */
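/*
 * For instance (assuming a 9-bit VTD stride): if both pfns are
 * aligned to a 512-page (2MiB) boundary, at least 512 pages are
 * being mapped and the domain supports one level of superpages,
 * the loop below returns level 2, i.e. a 2MiB superpage.
 */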
2131 pfnmerge = iov_pfn | phy_pfn;
2132
2133 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2134 pages >>= VTD_STRIDE_SHIFT;
2135 if (!pages)
2136 break;
2137 pfnmerge >>= VTD_STRIDE_SHIFT;
2138 level++;
2139 support--;
2140 }
2141 return level;
2142 }
2143
2144 /*
2145 * Ensure that old small page tables are removed to make room for superpage(s).
2146 * We're going to add new large pages, so make sure we don't remove their parent
2147 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2148 */
2149 static void switch_to_super_page(struct dmar_domain *domain,
2150 unsigned long start_pfn,
2151 unsigned long end_pfn, int level)
2152 {
2153 unsigned long lvl_pages = lvl_to_nr_pages(level);
2154 struct iommu_domain_info *info;
2155 struct dma_pte *pte = NULL;
2156 unsigned long i;
2157
2158 while (start_pfn <= end_pfn) {
2159 if (!pte)
2160 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2161 GFP_ATOMIC);
2162
2163 if (dma_pte_present(pte)) {
2164 dma_pte_free_pagetable(domain, start_pfn,
2165 start_pfn + lvl_pages - 1,
2166 level + 1);
2167
2168 xa_for_each(&domain->iommu_array, i, info)
2169 iommu_flush_iotlb_psi(info->iommu, domain,
2170 start_pfn, lvl_pages,
2171 0, 0);
2172 }
2173
2174 pte++;
2175 start_pfn += lvl_pages;
2176 if (first_pte_in_page(pte))
2177 pte = NULL;
2178 }
2179 }
2180
2181 static int
2182 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2183 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2184 gfp_t gfp)
2185 {
2186 struct dma_pte *first_pte = NULL, *pte = NULL;
2187 unsigned int largepage_lvl = 0;
2188 unsigned long lvl_pages = 0;
2189 phys_addr_t pteval;
2190 u64 attr;
2191
2192 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2193 return -EINVAL;
2194
2195 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2196 return -EINVAL;
2197
2198 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2199 attr |= DMA_FL_PTE_PRESENT;
2200 if (domain->use_first_level) {
2201 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2202 if (prot & DMA_PTE_WRITE)
2203 attr |= DMA_FL_PTE_DIRTY;
2204 }
2205
2206 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2207
2208 while (nr_pages > 0) {
2209 uint64_t tmp;
2210
2211 if (!pte) {
2212 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2213 phys_pfn, nr_pages);
2214
2215 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2216 gfp);
2217 if (!pte)
2218 return -ENOMEM;
2219 first_pte = pte;
2220
2221 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2222
2223 /* It is a large page */
2224 if (largepage_lvl > 1) {
2225 unsigned long end_pfn;
2226 unsigned long pages_to_remove;
2227
2228 pteval |= DMA_PTE_LARGE_PAGE;
2229 pages_to_remove = min_t(unsigned long, nr_pages,
2230 nr_pte_to_next_page(pte) * lvl_pages);
2231 end_pfn = iov_pfn + pages_to_remove - 1;
2232 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2233 } else {
2234 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2235 }
2236
2237 }
2238 /* We don't need a lock here; nobody else
2239 * touches the iova range.
2240 */
2241 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2242 if (tmp) {
2243 static int dumps = 5;
2244 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2245 iov_pfn, tmp, (unsigned long long)pteval);
2246 if (dumps) {
2247 dumps--;
2248 debug_dma_dump_mappings(NULL);
2249 }
2250 WARN_ON(1);
2251 }
2252
2253 nr_pages -= lvl_pages;
2254 iov_pfn += lvl_pages;
2255 phys_pfn += lvl_pages;
2256 pteval += lvl_pages * VTD_PAGE_SIZE;
2257
2258 /* If the next PTE would be the first in a new page, then we
2259 * need to flush the cache on the entries we've just written.
2260 * And then we'll need to recalculate 'pte', so clear it and
2261 * let it get set again in the if (!pte) block above.
2262 *
2263 * If we're done (!nr_pages) we need to flush the cache too.
2264 *
2265 * Also if we've been setting superpages, we may need to
2266 * recalculate 'pte' and switch back to smaller pages for the
2267 * end of the mapping, if the trailing size is not enough to
2268 * use another superpage (i.e. nr_pages < lvl_pages).
2269 */
2270 pte++;
2271 if (!nr_pages || first_pte_in_page(pte) ||
2272 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2273 domain_flush_cache(domain, first_pte,
2274 (void *)pte - (void *)first_pte);
2275 pte = NULL;
2276 }
2277 }
2278
2279 return 0;
2280 }
2281
2282 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2283 {
2284 struct intel_iommu *iommu = info->iommu;
2285 struct context_entry *context;
2286 u16 did_old;
2287
2288 if (!iommu)
2289 return;
2290
2291 spin_lock(&iommu->lock);
2292 context = iommu_context_addr(iommu, bus, devfn, 0);
2293 if (!context) {
2294 spin_unlock(&iommu->lock);
2295 return;
2296 }
2297
2298 if (sm_supported(iommu)) {
2299 if (hw_pass_through && domain_type_is_si(info->domain))
2300 did_old = FLPT_DEFAULT_DID;
2301 else
2302 did_old = domain_id_iommu(info->domain, iommu);
2303 } else {
2304 did_old = context_domain_id(context);
2305 }
2306
2307 context_clear_entry(context);
2308 __iommu_flush_cache(iommu, context, sizeof(*context));
2309 spin_unlock(&iommu->lock);
2310 iommu->flush.flush_context(iommu,
2311 did_old,
2312 (((u16)bus) << 8) | devfn,
2313 DMA_CCMD_MASK_NOBIT,
2314 DMA_CCMD_DEVICE_INVL);
2315
2316 if (sm_supported(iommu))
2317 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2318
2319 iommu->flush.flush_iotlb(iommu,
2320 did_old,
2321 0,
2322 0,
2323 DMA_TLB_DSI_FLUSH);
2324
2325 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2326 }
2327
2328 static int domain_setup_first_level(struct intel_iommu *iommu,
2329 struct dmar_domain *domain,
2330 struct device *dev,
2331 u32 pasid)
2332 {
2333 struct dma_pte *pgd = domain->pgd;
2334 int agaw, level;
2335 int flags = 0;
2336
2337 /*
2338 * Skip top levels of the page tables for an iommu which has
2339 * a smaller agaw than the default. Unnecessary for PT mode.
2340 */
2341 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2342 pgd = phys_to_virt(dma_pte_addr(pgd));
2343 if (!dma_pte_present(pgd))
2344 return -ENOMEM;
2345 }
2346
2347 level = agaw_to_level(agaw);
2348 if (level != 4 && level != 5)
2349 return -EINVAL;
2350
2351 if (level == 5)
2352 flags |= PASID_FLAG_FL5LP;
2353
2354 if (domain->force_snooping)
2355 flags |= PASID_FLAG_PAGE_SNOOP;
2356
2357 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2358 domain_id_iommu(domain, iommu),
2359 flags);
2360 }
2361
2362 static bool dev_is_real_dma_subdevice(struct device *dev)
2363 {
2364 return dev && dev_is_pci(dev) &&
2365 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2366 }
2367
2368 static int iommu_domain_identity_map(struct dmar_domain *domain,
2369 unsigned long first_vpfn,
2370 unsigned long last_vpfn)
2371 {
2372 /*
2373 * RMRR range might have overlap with physical memory range,
2374 * clear it first
2375 */
2376 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2377
2378 return __domain_mapping(domain, first_vpfn,
2379 first_vpfn, last_vpfn - first_vpfn + 1,
2380 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2381 }
2382
2383 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2384
2385 static int __init si_domain_init(int hw)
2386 {
2387 struct dmar_rmrr_unit *rmrr;
2388 struct device *dev;
2389 int i, nid, ret;
2390
2391 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2392 if (!si_domain)
2393 return -EFAULT;
2394
2395 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2396 domain_exit(si_domain);
2397 si_domain = NULL;
2398 return -EFAULT;
2399 }
2400
2401 if (hw)
2402 return 0;
2403
2404 for_each_online_node(nid) {
2405 unsigned long start_pfn, end_pfn;
2406 int i;
2407
2408 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2409 ret = iommu_domain_identity_map(si_domain,
2410 mm_to_dma_pfn_start(start_pfn),
2411 mm_to_dma_pfn_end(end_pfn));
2412 if (ret)
2413 return ret;
2414 }
2415 }
2416
2417 /*
2418 * Identity map the RMRRs so that devices with RMRRs could also use
2419 * the si_domain.
2420 */
2421 for_each_rmrr_units(rmrr) {
2422 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2423 i, dev) {
2424 unsigned long long start = rmrr->base_address;
2425 unsigned long long end = rmrr->end_address;
2426
2427 if (WARN_ON(end < start ||
2428 end >> agaw_to_width(si_domain->agaw)))
2429 continue;
2430
2431 ret = iommu_domain_identity_map(si_domain,
2432 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2433 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2434 if (ret)
2435 return ret;
2436 }
2437 }
2438
2439 return 0;
2440 }
2441
2442 static int dmar_domain_attach_device(struct dmar_domain *domain,
2443 struct device *dev)
2444 {
2445 struct device_domain_info *info = dev_iommu_priv_get(dev);
2446 struct intel_iommu *iommu;
2447 unsigned long flags;
2448 u8 bus, devfn;
2449 int ret;
2450
2451 iommu = device_to_iommu(dev, &bus, &devfn);
2452 if (!iommu)
2453 return -ENODEV;
2454
2455 ret = domain_attach_iommu(domain, iommu);
2456 if (ret)
2457 return ret;
2458 info->domain = domain;
2459 spin_lock_irqsave(&domain->lock, flags);
2460 list_add(&info->link, &domain->devices);
2461 spin_unlock_irqrestore(&domain->lock, flags);
2462
2463 /* PASID table is mandatory for a PCI device in scalable mode. */
2464 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2465 /* Setup the PASID entry for requests without PASID: */
2466 if (hw_pass_through && domain_type_is_si(domain))
2467 ret = intel_pasid_setup_pass_through(iommu, domain,
2468 dev, IOMMU_NO_PASID);
2469 else if (domain->use_first_level)
2470 ret = domain_setup_first_level(iommu, domain, dev,
2471 IOMMU_NO_PASID);
2472 else
2473 ret = intel_pasid_setup_second_level(iommu, domain,
2474 dev, IOMMU_NO_PASID);
2475 if (ret) {
2476 dev_err(dev, "Setup RID2PASID failed\n");
2477 device_block_translation(dev);
2478 return ret;
2479 }
2480 }
2481
2482 ret = domain_context_mapping(domain, dev);
2483 if (ret) {
2484 dev_err(dev, "Domain context map failed\n");
2485 device_block_translation(dev);
2486 return ret;
2487 }
2488
2489 iommu_enable_pci_caps(info);
2490
2491 return 0;
2492 }
2493
2494 /**
2495 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2496 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2497 * @dev: device handle
2498 *
2499 * We assume that PCI USB devices with RMRRs have them largely
2500 * for historical reasons and that the RMRR space is not actively used post
2501 * boot. This exclusion may change if vendors begin to abuse it.
2502 *
2503 * The same exception is made for graphics devices, with the requirement that
2504 * any use of the RMRR regions will be torn down before assigning the device
2505 * to a guest.
2506 *
2507 * Return: true if the RMRR is relaxable, false otherwise
2508 */
2509 static bool device_rmrr_is_relaxable(struct device *dev)
2510 {
2511 struct pci_dev *pdev;
2512
2513 if (!dev_is_pci(dev))
2514 return false;
2515
2516 pdev = to_pci_dev(dev);
2517 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2518 return true;
2519 else
2520 return false;
2521 }
2522
2523 /*
2524 * Return the required default domain type for a specific device.
2525 *
2526 * @dev: the device in question
2528 *
2529 * Returns:
2530 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2531 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2532 * - 0: both identity and dynamic domains work for this device
2533 */
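/*
 * For example, when graphics mapping is disabled, IDENTMAP_GFX is set
 * during init_dmars() and every graphics device is given an identity
 * default domain here.
 */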
2534 static int device_def_domain_type(struct device *dev)
2535 {
2536 if (dev_is_pci(dev)) {
2537 struct pci_dev *pdev = to_pci_dev(dev);
2538
2539 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2540 return IOMMU_DOMAIN_IDENTITY;
2541
2542 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2543 return IOMMU_DOMAIN_IDENTITY;
2544 }
2545
2546 return 0;
2547 }
2548
2549 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2550 {
2551 /*
2552 * Start from a sane iommu hardware state.
2553 * If queued invalidation was already initialized by us
2554 * (for example, while enabling interrupt remapping) then
2555 * things are already rolling from a sane state.
2556 */
2557 if (!iommu->qi) {
2558 /*
2559 * Clear any previous faults.
2560 */
2561 dmar_fault(-1, iommu);
2562 /*
2563 * Disable queued invalidation if supported and already enabled
2564 * before OS handover.
2565 */
2566 dmar_disable_qi(iommu);
2567 }
2568
2569 if (dmar_enable_qi(iommu)) {
2570 /*
2571 * Queued Invalidate not enabled, use Register Based Invalidate
2572 */
2573 iommu->flush.flush_context = __iommu_flush_context;
2574 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2575 pr_info("%s: Using Register based invalidation\n",
2576 iommu->name);
2577 } else {
2578 iommu->flush.flush_context = qi_flush_context;
2579 iommu->flush.flush_iotlb = qi_flush_iotlb;
2580 pr_info("%s: Using Queued invalidation\n", iommu->name);
2581 }
2582 }
2583
2584 static int copy_context_table(struct intel_iommu *iommu,
2585 struct root_entry *old_re,
2586 struct context_entry **tbl,
2587 int bus, bool ext)
2588 {
2589 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2590 struct context_entry *new_ce = NULL, ce;
2591 struct context_entry *old_ce = NULL;
2592 struct root_entry re;
2593 phys_addr_t old_ce_phys;
2594
2595 tbl_idx = ext ? bus * 2 : bus;
2596 memcpy(&re, old_re, sizeof(re));
2597
2598 for (devfn = 0; devfn < 256; devfn++) {
2599 /* First calculate the correct index */
2600 idx = (ext ? devfn * 2 : devfn) % 256;
2601
2602 if (idx == 0) {
2603 /* First save what we may have and clean up */
2604 if (new_ce) {
2605 tbl[tbl_idx] = new_ce;
2606 __iommu_flush_cache(iommu, new_ce,
2607 VTD_PAGE_SIZE);
2608 pos = 1;
2609 }
2610
2611 if (old_ce)
2612 memunmap(old_ce);
2613
2614 ret = 0;
2615 if (devfn < 0x80)
2616 old_ce_phys = root_entry_lctp(&re);
2617 else
2618 old_ce_phys = root_entry_uctp(&re);
2619
2620 if (!old_ce_phys) {
2621 if (ext && devfn == 0) {
2622 /* No LCTP, try UCTP */
2623 devfn = 0x7f;
2624 continue;
2625 } else {
2626 goto out;
2627 }
2628 }
2629
2630 ret = -ENOMEM;
2631 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2632 MEMREMAP_WB);
2633 if (!old_ce)
2634 goto out;
2635
2636 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2637 if (!new_ce)
2638 goto out_unmap;
2639
2640 ret = 0;
2641 }
2642
2643 /* Now copy the context entry */
2644 memcpy(&ce, old_ce + idx, sizeof(ce));
2645
2646 if (!context_present(&ce))
2647 continue;
2648
2649 did = context_domain_id(&ce);
2650 if (did >= 0 && did < cap_ndoms(iommu->cap))
2651 set_bit(did, iommu->domain_ids);
2652
2653 set_context_copied(iommu, bus, devfn);
2654 new_ce[idx] = ce;
2655 }
2656
2657 tbl[tbl_idx + pos] = new_ce;
2658
2659 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2660
2661 out_unmap:
2662 memunmap(old_ce);
2663
2664 out:
2665 return ret;
2666 }
2667
2668 static int copy_translation_tables(struct intel_iommu *iommu)
2669 {
2670 struct context_entry **ctxt_tbls;
2671 struct root_entry *old_rt;
2672 phys_addr_t old_rt_phys;
2673 int ctxt_table_entries;
2674 u64 rtaddr_reg;
2675 int bus, ret;
2676 bool new_ext, ext;
2677
2678 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2679 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2680 new_ext = !!sm_supported(iommu);
2681
2682 /*
2683 * The RTT bit can only be changed when translation is disabled,
2684 * but disabling translation would open a window for data
2685 * corruption. So bail out and don't copy anything if we would
2686 * have to change the bit.
2687 */
2688 if (new_ext != ext)
2689 return -EINVAL;
2690
2691 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2692 if (!iommu->copied_tables)
2693 return -ENOMEM;
2694
2695 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2696 if (!old_rt_phys)
2697 return -EINVAL;
2698
2699 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2700 if (!old_rt)
2701 return -ENOMEM;
2702
2703 /* This is too big for the stack - allocate it from slab */
2704 ctxt_table_entries = ext ? 512 : 256;
2705 ret = -ENOMEM;
2706 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2707 if (!ctxt_tbls)
2708 goto out_unmap;
2709
2710 for (bus = 0; bus < 256; bus++) {
2711 ret = copy_context_table(iommu, &old_rt[bus],
2712 ctxt_tbls, bus, ext);
2713 if (ret) {
2714 pr_err("%s: Failed to copy context table for bus %d\n",
2715 iommu->name, bus);
2716 continue;
2717 }
2718 }
2719
2720 spin_lock(&iommu->lock);
2721
2722 /* Context tables are copied, now write them to the root_entry table */
2723 for (bus = 0; bus < 256; bus++) {
2724 int idx = ext ? bus * 2 : bus;
2725 u64 val;
2726
2727 if (ctxt_tbls[idx]) {
2728 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2729 iommu->root_entry[bus].lo = val;
2730 }
2731
2732 if (!ext || !ctxt_tbls[idx + 1])
2733 continue;
2734
2735 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2736 iommu->root_entry[bus].hi = val;
2737 }
2738
2739 spin_unlock(&iommu->lock);
2740
2741 kfree(ctxt_tbls);
2742
2743 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2744
2745 ret = 0;
2746
2747 out_unmap:
2748 memunmap(old_rt);
2749
2750 return ret;
2751 }
2752
2753 static int __init init_dmars(void)
2754 {
2755 struct dmar_drhd_unit *drhd;
2756 struct intel_iommu *iommu;
2757 int ret;
2758
2759 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2760 if (ret)
2761 goto free_iommu;
2762
2763 for_each_iommu(iommu, drhd) {
2764 if (drhd->ignored) {
2765 iommu_disable_translation(iommu);
2766 continue;
2767 }
2768
2769 /*
2770 * Find the max pasid size of all IOMMUs in the system.
2771 * We need to ensure the system pasid table is no bigger
2772 * than the smallest supported.
2773 */
2774 if (pasid_supported(iommu)) {
2775 u32 temp = 2 << ecap_pss(iommu->ecap);
2776
2777 intel_pasid_max_id = min_t(u32, temp,
2778 intel_pasid_max_id);
2779 }
2780
2781 intel_iommu_init_qi(iommu);
2782
2783 ret = iommu_init_domains(iommu);
2784 if (ret)
2785 goto free_iommu;
2786
2787 init_translation_status(iommu);
2788
2789 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2790 iommu_disable_translation(iommu);
2791 clear_translation_pre_enabled(iommu);
2792 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2793 iommu->name);
2794 }
2795
2796 /*
2797 * TBD:
2798 * we could share the same root & context tables
2799 * among all IOMMUs. Need to split it later.
2800 */
2801 ret = iommu_alloc_root_entry(iommu);
2802 if (ret)
2803 goto free_iommu;
2804
2805 if (translation_pre_enabled(iommu)) {
2806 pr_info("Translation already enabled - trying to copy translation structures\n");
2807
2808 ret = copy_translation_tables(iommu);
2809 if (ret) {
2810 /*
2811 * We found the IOMMU with translation
2812 * enabled - but failed to copy over the
2813 * old root-entry table. Try to proceed
2814 * by disabling translation now and
2815 * allocating a clean root-entry table.
2816 * This might cause DMAR faults, but
2817 * probably the dump will still succeed.
2818 */
2819 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2820 iommu->name);
2821 iommu_disable_translation(iommu);
2822 clear_translation_pre_enabled(iommu);
2823 } else {
2824 pr_info("Copied translation tables from previous kernel for %s\n",
2825 iommu->name);
2826 }
2827 }
2828
2829 if (!ecap_pass_through(iommu->ecap))
2830 hw_pass_through = 0;
2831 intel_svm_check(iommu);
2832 }
2833
2834 /*
2835 * Now that qi is enabled on all iommus, set the root entry and flush
2836 * caches. This is required on some Intel X58 chipsets, otherwise the
2837 * flush_context function will loop forever and the boot hangs.
2838 */
2839 for_each_active_iommu(iommu, drhd) {
2840 iommu_flush_write_buffer(iommu);
2841 iommu_set_root_entry(iommu);
2842 }
2843
2844 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2845 dmar_map_gfx = 0;
2846 #endif
2847
2848 if (!dmar_map_gfx)
2849 iommu_identity_mapping |= IDENTMAP_GFX;
2850
2851 check_tylersburg_isoch();
2852
2853 ret = si_domain_init(hw_pass_through);
2854 if (ret)
2855 goto free_iommu;
2856
2857 /*
2858 * for each drhd
2859 * enable fault log
2860 * global invalidate context cache
2861 * global invalidate iotlb
2862 * enable translation
2863 */
2864 for_each_iommu(iommu, drhd) {
2865 if (drhd->ignored) {
2866 /*
2867 * we always have to disable PMRs or DMA may fail on
2868 * this device
2869 */
2870 if (force_on)
2871 iommu_disable_protect_mem_regions(iommu);
2872 continue;
2873 }
2874
2875 iommu_flush_write_buffer(iommu);
2876
2877 #ifdef CONFIG_INTEL_IOMMU_SVM
2878 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2879 /*
2880 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2881 * could cause a lock race, so drop the lock around the call.
2882 */
2883 up_write(&dmar_global_lock);
2884 ret = intel_svm_enable_prq(iommu);
2885 down_write(&dmar_global_lock);
2886 if (ret)
2887 goto free_iommu;
2888 }
2889 #endif
2890 ret = dmar_set_interrupt(iommu);
2891 if (ret)
2892 goto free_iommu;
2893 }
2894
2895 return 0;
2896
2897 free_iommu:
2898 for_each_active_iommu(iommu, drhd) {
2899 disable_dmar_iommu(iommu);
2900 free_dmar_iommu(iommu);
2901 }
2902 if (si_domain) {
2903 domain_exit(si_domain);
2904 si_domain = NULL;
2905 }
2906
2907 return ret;
2908 }
2909
2910 static void __init init_no_remapping_devices(void)
2911 {
2912 struct dmar_drhd_unit *drhd;
2913 struct device *dev;
2914 int i;
2915
2916 for_each_drhd_unit(drhd) {
2917 if (!drhd->include_all) {
2918 for_each_active_dev_scope(drhd->devices,
2919 drhd->devices_cnt, i, dev)
2920 break;
2921 /* ignore DMAR unit if no devices exist */
2922 if (i == drhd->devices_cnt)
2923 drhd->ignored = 1;
2924 }
2925 }
2926
2927 for_each_active_drhd_unit(drhd) {
2928 if (drhd->include_all)
2929 continue;
2930
2931 for_each_active_dev_scope(drhd->devices,
2932 drhd->devices_cnt, i, dev)
2933 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2934 break;
2935 if (i < drhd->devices_cnt)
2936 continue;
2937
2938 /* This IOMMU has *only* gfx devices. Either bypass it or
2939 set the gfx_dedicated flag, as appropriate. */
2940 drhd->gfx_dedicated = 1;
2941 if (!dmar_map_gfx)
2942 drhd->ignored = 1;
2943 }
2944 }
2945
2946 #ifdef CONFIG_SUSPEND
2947 static int init_iommu_hw(void)
2948 {
2949 struct dmar_drhd_unit *drhd;
2950 struct intel_iommu *iommu = NULL;
2951 int ret;
2952
2953 for_each_active_iommu(iommu, drhd) {
2954 if (iommu->qi) {
2955 ret = dmar_reenable_qi(iommu);
2956 if (ret)
2957 return ret;
2958 }
2959 }
2960
2961 for_each_iommu(iommu, drhd) {
2962 if (drhd->ignored) {
2963 /*
2964 * we always have to disable PMRs or DMA may fail on
2965 * this device
2966 */
2967 if (force_on)
2968 iommu_disable_protect_mem_regions(iommu);
2969 continue;
2970 }
2971
2972 iommu_flush_write_buffer(iommu);
2973 iommu_set_root_entry(iommu);
2974 iommu_enable_translation(iommu);
2975 iommu_disable_protect_mem_regions(iommu);
2976 }
2977
2978 return 0;
2979 }
2980
2981 static void iommu_flush_all(void)
2982 {
2983 struct dmar_drhd_unit *drhd;
2984 struct intel_iommu *iommu;
2985
2986 for_each_active_iommu(iommu, drhd) {
2987 iommu->flush.flush_context(iommu, 0, 0, 0,
2988 DMA_CCMD_GLOBAL_INVL);
2989 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2990 DMA_TLB_GLOBAL_FLUSH);
2991 }
2992 }
2993
2994 static int iommu_suspend(void)
2995 {
2996 struct dmar_drhd_unit *drhd;
2997 struct intel_iommu *iommu = NULL;
2998 unsigned long flag;
2999
3000 for_each_active_iommu(iommu, drhd) {
3001 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3002 GFP_KERNEL);
3003 if (!iommu->iommu_state)
3004 goto nomem;
3005 }
3006
3007 iommu_flush_all();
3008
3009 for_each_active_iommu(iommu, drhd) {
3010 iommu_disable_translation(iommu);
3011
3012 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3013
3014 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3015 readl(iommu->reg + DMAR_FECTL_REG);
3016 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3017 readl(iommu->reg + DMAR_FEDATA_REG);
3018 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3019 readl(iommu->reg + DMAR_FEADDR_REG);
3020 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3021 readl(iommu->reg + DMAR_FEUADDR_REG);
3022
3023 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3024 }
3025 return 0;
3026
3027 nomem:
3028 for_each_active_iommu(iommu, drhd)
3029 kfree(iommu->iommu_state);
3030
3031 return -ENOMEM;
3032 }
3033
3034 static void iommu_resume(void)
3035 {
3036 struct dmar_drhd_unit *drhd;
3037 struct intel_iommu *iommu = NULL;
3038 unsigned long flag;
3039
3040 if (init_iommu_hw()) {
3041 if (force_on)
3042 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3043 else
3044 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3045 return;
3046 }
3047
3048 for_each_active_iommu(iommu, drhd) {
3049
3050 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3051
3052 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3053 iommu->reg + DMAR_FECTL_REG);
3054 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3055 iommu->reg + DMAR_FEDATA_REG);
3056 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3057 iommu->reg + DMAR_FEADDR_REG);
3058 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3059 iommu->reg + DMAR_FEUADDR_REG);
3060
3061 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3062 }
3063
3064 for_each_active_iommu(iommu, drhd)
3065 kfree(iommu->iommu_state);
3066 }
3067
3068 static struct syscore_ops iommu_syscore_ops = {
3069 .resume = iommu_resume,
3070 .suspend = iommu_suspend,
3071 };
3072
3073 static void __init init_iommu_pm_ops(void)
3074 {
3075 register_syscore_ops(&iommu_syscore_ops);
3076 }
3077
3078 #else
3079 static inline void init_iommu_pm_ops(void) {}
3080 #endif /* CONFIG_SUSPEND */
3081
3082 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3083 {
3084 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3085 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3086 rmrr->end_address <= rmrr->base_address ||
3087 arch_rmrr_sanity_check(rmrr))
3088 return -EINVAL;
3089
3090 return 0;
3091 }
3092
3093 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3094 {
3095 struct acpi_dmar_reserved_memory *rmrr;
3096 struct dmar_rmrr_unit *rmrru;
3097
3098 rmrr = (struct acpi_dmar_reserved_memory *)header;
3099 if (rmrr_sanity_check(rmrr)) {
3100 pr_warn(FW_BUG
3101 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3103 rmrr->base_address, rmrr->end_address,
3104 dmi_get_system_info(DMI_BIOS_VENDOR),
3105 dmi_get_system_info(DMI_BIOS_VERSION),
3106 dmi_get_system_info(DMI_PRODUCT_VERSION));
3107 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3108 }
3109
3110 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3111 if (!rmrru)
3112 goto out;
3113
3114 rmrru->hdr = header;
3115
3116 rmrru->base_address = rmrr->base_address;
3117 rmrru->end_address = rmrr->end_address;
3118
3119 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3120 ((void *)rmrr) + rmrr->header.length,
3121 &rmrru->devices_cnt);
3122 if (rmrru->devices_cnt && rmrru->devices == NULL)
3123 goto free_rmrru;
3124
3125 list_add(&rmrru->list, &dmar_rmrr_units);
3126
3127 return 0;
3128 free_rmrru:
3129 kfree(rmrru);
3130 out:
3131 return -ENOMEM;
3132 }
3133
3134 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3135 {
3136 struct dmar_atsr_unit *atsru;
3137 struct acpi_dmar_atsr *tmp;
3138
3139 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3140 dmar_rcu_check()) {
3141 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3142 if (atsr->segment != tmp->segment)
3143 continue;
3144 if (atsr->header.length != tmp->header.length)
3145 continue;
3146 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3147 return atsru;
3148 }
3149
3150 return NULL;
3151 }
3152
3153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3154 {
3155 struct acpi_dmar_atsr *atsr;
3156 struct dmar_atsr_unit *atsru;
3157
3158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3159 return 0;
3160
3161 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3162 atsru = dmar_find_atsr(atsr);
3163 if (atsru)
3164 return 0;
3165
3166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3167 if (!atsru)
3168 return -ENOMEM;
3169
3170 /*
3171 * If memory is allocated from slab by ACPI _DSM method, we need to
3172 * copy the memory content because the memory buffer will be freed
3173 * on return.
3174 */
3175 atsru->hdr = (void *)(atsru + 1);
3176 memcpy(atsru->hdr, hdr, hdr->length);
3177 atsru->include_all = atsr->flags & 0x1;
3178 if (!atsru->include_all) {
3179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3180 (void *)atsr + atsr->header.length,
3181 &atsru->devices_cnt);
3182 if (atsru->devices_cnt && atsru->devices == NULL) {
3183 kfree(atsru);
3184 return -ENOMEM;
3185 }
3186 }
3187
3188 list_add_rcu(&atsru->list, &dmar_atsr_units);
3189
3190 return 0;
3191 }
3192
3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3194 {
3195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3196 kfree(atsru);
3197 }
3198
3199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3200 {
3201 struct acpi_dmar_atsr *atsr;
3202 struct dmar_atsr_unit *atsru;
3203
3204 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3205 atsru = dmar_find_atsr(atsr);
3206 if (atsru) {
3207 list_del_rcu(&atsru->list);
3208 synchronize_rcu();
3209 intel_iommu_free_atsr(atsru);
3210 }
3211
3212 return 0;
3213 }
3214
3215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3216 {
3217 int i;
3218 struct device *dev;
3219 struct acpi_dmar_atsr *atsr;
3220 struct dmar_atsr_unit *atsru;
3221
3222 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3223 atsru = dmar_find_atsr(atsr);
3224 if (!atsru)
3225 return 0;
3226
3227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3229 i, dev)
3230 return -EBUSY;
3231 }
3232
3233 return 0;
3234 }
3235
3236 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3237 {
3238 struct dmar_satc_unit *satcu;
3239 struct acpi_dmar_satc *tmp;
3240
3241 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3242 dmar_rcu_check()) {
3243 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3244 if (satc->segment != tmp->segment)
3245 continue;
3246 if (satc->header.length != tmp->header.length)
3247 continue;
3248 if (memcmp(satc, tmp, satc->header.length) == 0)
3249 return satcu;
3250 }
3251
3252 return NULL;
3253 }
3254
3255 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3256 {
3257 struct acpi_dmar_satc *satc;
3258 struct dmar_satc_unit *satcu;
3259
3260 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3261 return 0;
3262
3263 satc = container_of(hdr, struct acpi_dmar_satc, header);
3264 satcu = dmar_find_satc(satc);
3265 if (satcu)
3266 return 0;
3267
3268 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3269 if (!satcu)
3270 return -ENOMEM;
3271
3272 satcu->hdr = (void *)(satcu + 1);
3273 memcpy(satcu->hdr, hdr, hdr->length);
3274 satcu->atc_required = satc->flags & 0x1;
3275 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3276 (void *)satc + satc->header.length,
3277 &satcu->devices_cnt);
3278 if (satcu->devices_cnt && !satcu->devices) {
3279 kfree(satcu);
3280 return -ENOMEM;
3281 }
3282 list_add_rcu(&satcu->list, &dmar_satc_units);
3283
3284 return 0;
3285 }
3286
3287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3288 {
3289 int sp, ret;
3290 struct intel_iommu *iommu = dmaru->iommu;
3291
3292 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3293 if (ret)
3294 goto out;
3295
3296 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3297 pr_warn("%s: Doesn't support hardware pass through.\n",
3298 iommu->name);
3299 return -ENXIO;
3300 }
3301
3302 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3303 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3304 pr_warn("%s: Doesn't support large page.\n",
3305 iommu->name);
3306 return -ENXIO;
3307 }
3308
3309 /*
3310 * Disable translation if already enabled prior to OS handover.
3311 */
3312 if (iommu->gcmd & DMA_GCMD_TE)
3313 iommu_disable_translation(iommu);
3314
3315 ret = iommu_init_domains(iommu);
3316 if (ret == 0)
3317 ret = iommu_alloc_root_entry(iommu);
3318 if (ret)
3319 goto out;
3320
3321 intel_svm_check(iommu);
3322
3323 if (dmaru->ignored) {
3324 /*
3325 * we always have to disable PMRs or DMA may fail on this device
3326 */
3327 if (force_on)
3328 iommu_disable_protect_mem_regions(iommu);
3329 return 0;
3330 }
3331
3332 intel_iommu_init_qi(iommu);
3333 iommu_flush_write_buffer(iommu);
3334
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337 ret = intel_svm_enable_prq(iommu);
3338 if (ret)
3339 goto disable_iommu;
3340 }
3341 #endif
3342 ret = dmar_set_interrupt(iommu);
3343 if (ret)
3344 goto disable_iommu;
3345
3346 iommu_set_root_entry(iommu);
3347 iommu_enable_translation(iommu);
3348
3349 iommu_disable_protect_mem_regions(iommu);
3350 return 0;
3351
3352 disable_iommu:
3353 disable_dmar_iommu(iommu);
3354 out:
3355 free_dmar_iommu(iommu);
3356 return ret;
3357 }
3358
3359 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3360 {
3361 int ret = 0;
3362 struct intel_iommu *iommu = dmaru->iommu;
3363
3364 if (!intel_iommu_enabled)
3365 return 0;
3366 if (iommu == NULL)
3367 return -EINVAL;
3368
3369 if (insert) {
3370 ret = intel_iommu_add(dmaru);
3371 } else {
3372 disable_dmar_iommu(iommu);
3373 free_dmar_iommu(iommu);
3374 }
3375
3376 return ret;
3377 }
3378
3379 static void intel_iommu_free_dmars(void)
3380 {
3381 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3382 struct dmar_atsr_unit *atsru, *atsr_n;
3383 struct dmar_satc_unit *satcu, *satc_n;
3384
3385 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3386 list_del(&rmrru->list);
3387 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3388 kfree(rmrru);
3389 }
3390
3391 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3392 list_del(&atsru->list);
3393 intel_iommu_free_atsr(atsru);
3394 }
3395 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3396 list_del(&satcu->list);
3397 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3398 kfree(satcu);
3399 }
3400 }
3401
3402 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3403 {
3404 struct dmar_satc_unit *satcu;
3405 struct acpi_dmar_satc *satc;
3406 struct device *tmp;
3407 int i;
3408
3409 dev = pci_physfn(dev);
3410 rcu_read_lock();
3411
3412 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3413 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3414 if (satc->segment != pci_domain_nr(dev->bus))
3415 continue;
3416 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3417 if (to_pci_dev(tmp) == dev)
3418 goto out;
3419 }
3420 satcu = NULL;
3421 out:
3422 rcu_read_unlock();
3423 return satcu;
3424 }
3425
3426 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3427 {
3428 int i, ret = 1;
3429 struct pci_bus *bus;
3430 struct pci_dev *bridge = NULL;
3431 struct device *tmp;
3432 struct acpi_dmar_atsr *atsr;
3433 struct dmar_atsr_unit *atsru;
3434 struct dmar_satc_unit *satcu;
3435
3436 dev = pci_physfn(dev);
3437 satcu = dmar_find_matched_satc_unit(dev);
3438 if (satcu)
3439 /*
3440 * This device supports ATS as it is in the SATC table.
3441 * When the IOMMU is in legacy mode, enabling ATS is done
3442 * automatically by HW for the device that requires
3443 * ATS, hence the OS should not enable ATS on this device
3444 * to avoid duplicated TLB invalidation.
3445 */
3446 return !(satcu->atc_required && !sm_supported(iommu));
3447
3448 for (bus = dev->bus; bus; bus = bus->parent) {
3449 bridge = bus->self;
3450 /* If it's an integrated device, allow ATS */
3451 if (!bridge)
3452 return 1;
3453 /* Connected via non-PCIe: no ATS */
3454 if (!pci_is_pcie(bridge) ||
3455 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3456 return 0;
3457 /* If we found the root port, look it up in the ATSR */
3458 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3459 break;
3460 }
3461
3462 rcu_read_lock();
3463 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3464 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3465 if (atsr->segment != pci_domain_nr(dev->bus))
3466 continue;
3467
3468 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3469 if (tmp == &bridge->dev)
3470 goto out;
3471
3472 if (atsru->include_all)
3473 goto out;
3474 }
3475 ret = 0;
3476 out:
3477 rcu_read_unlock();
3478
3479 return ret;
3480 }
3481
3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3483 {
3484 int ret;
3485 struct dmar_rmrr_unit *rmrru;
3486 struct dmar_atsr_unit *atsru;
3487 struct dmar_satc_unit *satcu;
3488 struct acpi_dmar_atsr *atsr;
3489 struct acpi_dmar_reserved_memory *rmrr;
3490 struct acpi_dmar_satc *satc;
3491
3492 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3493 return 0;
3494
3495 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3496 rmrr = container_of(rmrru->hdr,
3497 struct acpi_dmar_reserved_memory, header);
3498 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3499 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3500 ((void *)rmrr) + rmrr->header.length,
3501 rmrr->segment, rmrru->devices,
3502 rmrru->devices_cnt);
3503 if (ret < 0)
3504 return ret;
3505 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3506 dmar_remove_dev_scope(info, rmrr->segment,
3507 rmrru->devices, rmrru->devices_cnt);
3508 }
3509 }
3510
3511 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3512 if (atsru->include_all)
3513 continue;
3514
3515 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3516 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3517 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3518 (void *)atsr + atsr->header.length,
3519 atsr->segment, atsru->devices,
3520 atsru->devices_cnt);
3521 if (ret > 0)
3522 break;
3523 else if (ret < 0)
3524 return ret;
3525 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3526 if (dmar_remove_dev_scope(info, atsr->segment,
3527 atsru->devices, atsru->devices_cnt))
3528 break;
3529 }
3530 }
3531 list_for_each_entry(satcu, &dmar_satc_units, list) {
3532 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3533 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3534 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3535 (void *)satc + satc->header.length,
3536 satc->segment, satcu->devices,
3537 satcu->devices_cnt);
3538 if (ret > 0)
3539 break;
3540 else if (ret < 0)
3541 return ret;
3542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3543 if (dmar_remove_dev_scope(info, satc->segment,
3544 satcu->devices, satcu->devices_cnt))
3545 break;
3546 }
3547 }
3548
3549 return 0;
3550 }
3551
3552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3553 unsigned long val, void *v)
3554 {
3555 struct memory_notify *mhp = v;
3556 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3557 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3558 mhp->nr_pages - 1);
3559
3560 switch (val) {
3561 case MEM_GOING_ONLINE:
3562 if (iommu_domain_identity_map(si_domain,
3563 start_vpfn, last_vpfn)) {
3564 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3565 start_vpfn, last_vpfn);
3566 return NOTIFY_BAD;
3567 }
3568 break;
3569
3570 case MEM_OFFLINE:
3571 case MEM_CANCEL_ONLINE:
3572 {
3573 struct dmar_drhd_unit *drhd;
3574 struct intel_iommu *iommu;
3575 LIST_HEAD(freelist);
3576
3577 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3578
3579 rcu_read_lock();
3580 for_each_active_iommu(iommu, drhd)
3581 iommu_flush_iotlb_psi(iommu, si_domain,
3582 start_vpfn, mhp->nr_pages,
3583 list_empty(&freelist), 0);
3584 rcu_read_unlock();
3585 put_pages_list(&freelist);
3586 }
3587 break;
3588 }
3589
3590 return NOTIFY_OK;
3591 }
3592
3593 static struct notifier_block intel_iommu_memory_nb = {
3594 .notifier_call = intel_iommu_memory_notifier,
3595 .priority = 0
3596 };
3597
3598 static void intel_disable_iommus(void)
3599 {
3600 struct intel_iommu *iommu = NULL;
3601 struct dmar_drhd_unit *drhd;
3602
3603 for_each_iommu(iommu, drhd)
3604 iommu_disable_translation(iommu);
3605 }
3606
3607 void intel_iommu_shutdown(void)
3608 {
3609 struct dmar_drhd_unit *drhd;
3610 struct intel_iommu *iommu = NULL;
3611
3612 if (no_iommu || dmar_disabled)
3613 return;
3614
3615 down_write(&dmar_global_lock);
3616
3617 /* Disable PMRs explicitly here. */
3618 for_each_iommu(iommu, drhd)
3619 iommu_disable_protect_mem_regions(iommu);
3620
3621 /* Make sure the IOMMUs are switched off */
3622 intel_disable_iommus();
3623
3624 up_write(&dmar_global_lock);
3625 }
3626
3627 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3628 {
3629 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3630
3631 return container_of(iommu_dev, struct intel_iommu, iommu);
3632 }
3633
3634 static ssize_t version_show(struct device *dev,
3635 struct device_attribute *attr, char *buf)
3636 {
3637 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3638 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3639 return sysfs_emit(buf, "%d:%d\n",
3640 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3641 }
3642 static DEVICE_ATTR_RO(version);
3643
3644 static ssize_t address_show(struct device *dev,
3645 struct device_attribute *attr, char *buf)
3646 {
3647 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3648 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3649 }
3650 static DEVICE_ATTR_RO(address);
3651
3652 static ssize_t cap_show(struct device *dev,
3653 struct device_attribute *attr, char *buf)
3654 {
3655 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3656 return sysfs_emit(buf, "%llx\n", iommu->cap);
3657 }
3658 static DEVICE_ATTR_RO(cap);
3659
3660 static ssize_t ecap_show(struct device *dev,
3661 struct device_attribute *attr, char *buf)
3662 {
3663 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3664 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3665 }
3666 static DEVICE_ATTR_RO(ecap);
3667
3668 static ssize_t domains_supported_show(struct device *dev,
3669 struct device_attribute *attr, char *buf)
3670 {
3671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3672 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3673 }
3674 static DEVICE_ATTR_RO(domains_supported);
3675
3676 static ssize_t domains_used_show(struct device *dev,
3677 struct device_attribute *attr, char *buf)
3678 {
3679 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3680 return sysfs_emit(buf, "%d\n",
3681 bitmap_weight(iommu->domain_ids,
3682 cap_ndoms(iommu->cap)));
3683 }
3684 static DEVICE_ATTR_RO(domains_used);
3685
3686 static struct attribute *intel_iommu_attrs[] = {
3687 &dev_attr_version.attr,
3688 &dev_attr_address.attr,
3689 &dev_attr_cap.attr,
3690 &dev_attr_ecap.attr,
3691 &dev_attr_domains_supported.attr,
3692 &dev_attr_domains_used.attr,
3693 NULL,
3694 };
3695
3696 static struct attribute_group intel_iommu_group = {
3697 .name = "intel-iommu",
3698 .attrs = intel_iommu_attrs,
3699 };
3700
3701 const struct attribute_group *intel_iommu_groups[] = {
3702 &intel_iommu_group,
3703 NULL,
3704 };
3705
3706 static inline bool has_external_pci(void)
3707 {
3708 struct pci_dev *pdev = NULL;
3709
3710 for_each_pci_dev(pdev)
3711 if (pdev->external_facing) {
3712 pci_dev_put(pdev);
3713 return true;
3714 }
3715
3716 return false;
3717 }
3718
3719 static int __init platform_optin_force_iommu(void)
3720 {
3721 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3722 return 0;
3723
3724 if (no_iommu || dmar_disabled)
3725 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3726
3727 /*
3728 * If Intel-IOMMU is disabled by default, we will apply identity
3729 * map for all devices except those marked as being untrusted.
3730 */
3731 if (dmar_disabled)
3732 iommu_set_default_passthrough(false);
3733
3734 dmar_disabled = 0;
3735 no_iommu = 0;
3736
3737 return 1;
3738 }
3739
3740 static int __init probe_acpi_namespace_devices(void)
3741 {
3742 struct dmar_drhd_unit *drhd;
3743 /* To avoid a -Wunused-but-set-variable warning. */
3744 struct intel_iommu *iommu __maybe_unused;
3745 struct device *dev;
3746 int i, ret = 0;
3747
3748 for_each_active_iommu(iommu, drhd) {
3749 for_each_active_dev_scope(drhd->devices,
3750 drhd->devices_cnt, i, dev) {
3751 struct acpi_device_physical_node *pn;
3752 struct acpi_device *adev;
3753
3754 if (dev->bus != &acpi_bus_type)
3755 continue;
3756
3757 adev = to_acpi_device(dev);
3758 mutex_lock(&adev->physical_node_lock);
3759 list_for_each_entry(pn,
3760 &adev->physical_node_list, node) {
3761 ret = iommu_probe_device(pn->dev);
3762 if (ret)
3763 break;
3764 }
3765 mutex_unlock(&adev->physical_node_lock);
3766
3767 if (ret)
3768 return ret;
3769 }
3770 }
3771
3772 return 0;
3773 }
3774
3775 static __init int tboot_force_iommu(void)
3776 {
3777 if (!tboot_enabled())
3778 return 0;
3779
3780 if (no_iommu || dmar_disabled)
3781 pr_warn("Forcing Intel-IOMMU to enabled\n");
3782
3783 dmar_disabled = 0;
3784 no_iommu = 0;
3785
3786 return 1;
3787 }
3788
3789 int __init intel_iommu_init(void)
3790 {
3791 int ret = -ENODEV;
3792 struct dmar_drhd_unit *drhd;
3793 struct intel_iommu *iommu;
3794
3795 /*
3796 * Intel IOMMU is required for a TXT/tboot launch or platform
3797 * opt in, so enforce that.
3798 */
3799 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3800 platform_optin_force_iommu();
3801
3802 down_write(&dmar_global_lock);
3803 if (dmar_table_init()) {
3804 if (force_on)
3805 panic("tboot: Failed to initialize DMAR table\n");
3806 goto out_free_dmar;
3807 }
3808
3809 if (dmar_dev_scope_init() < 0) {
3810 if (force_on)
3811 panic("tboot: Failed to initialize DMAR device scope\n");
3812 goto out_free_dmar;
3813 }
3814
3815 up_write(&dmar_global_lock);
3816
3817 /*
3818 * The bus notifier takes the dmar_global_lock, so lockdep will
3819 * complain later when we register it under the lock.
3820 */
3821 dmar_register_bus_notifier();
3822
3823 down_write(&dmar_global_lock);
3824
3825 if (!no_iommu)
3826 intel_iommu_debugfs_init();
3827
3828 if (no_iommu || dmar_disabled) {
3829 /*
3830 * We exit the function here to ensure the IOMMU's remapping and
3831 * mempool aren't set up, which means that the IOMMU's PMRs
3832 * won't be disabled via the call to init_dmars(). So disable
3833 * it explicitly here. The PMRs were setup by tboot prior to
3834 * calling SENTER, but the kernel is expected to reset/tear
3835 * down the PMRs.
3836 */
3837 if (intel_iommu_tboot_noforce) {
3838 for_each_iommu(iommu, drhd)
3839 iommu_disable_protect_mem_regions(iommu);
3840 }
3841
3842 /*
3843 * Make sure the IOMMUs are switched off, even when we
3844 * boot into a kexec kernel and the previous kernel left
3845 * them enabled
3846 */
3847 intel_disable_iommus();
3848 goto out_free_dmar;
3849 }
3850
3851 if (list_empty(&dmar_rmrr_units))
3852 pr_info("No RMRR found\n");
3853
3854 if (list_empty(&dmar_atsr_units))
3855 pr_info("No ATSR found\n");
3856
3857 if (list_empty(&dmar_satc_units))
3858 pr_info("No SATC found\n");
3859
3860 init_no_remapping_devices();
3861
3862 ret = init_dmars();
3863 if (ret) {
3864 if (force_on)
3865 panic("tboot: Failed to initialize DMARs\n");
3866 pr_err("Initialization failed\n");
3867 goto out_free_dmar;
3868 }
3869 up_write(&dmar_global_lock);
3870
3871 init_iommu_pm_ops();
3872
3873 down_read(&dmar_global_lock);
3874 for_each_active_iommu(iommu, drhd) {
3875 /*
3876 * The flush queue implementation does not perform
3877 * page-selective invalidations that are required for efficient
3878 * TLB flushes in virtual environments. The benefit of batching
3879 * is likely to be much lower than the overhead of synchronizing
3880 * the virtual and physical IOMMU page-tables.
3881 */
3882 if (cap_caching_mode(iommu->cap) &&
3883 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3884 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3885 iommu_set_dma_strict();
3886 }
3887 iommu_device_sysfs_add(&iommu->iommu, NULL,
3888 intel_iommu_groups,
3889 "%s", iommu->name);
3890 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3891
3892 iommu_pmu_register(iommu);
3893 }
3894 up_read(&dmar_global_lock);
3895
3896 if (si_domain && !hw_pass_through)
3897 register_memory_notifier(&intel_iommu_memory_nb);
3898
3899 down_read(&dmar_global_lock);
3900 if (probe_acpi_namespace_devices())
3901 pr_warn("ACPI name space devices didn't probe correctly\n");
3902
3903 /* Finally, we enable the DMA remapping hardware. */
3904 for_each_iommu(iommu, drhd) {
3905 if (!drhd->ignored && !translation_pre_enabled(iommu))
3906 iommu_enable_translation(iommu);
3907
3908 iommu_disable_protect_mem_regions(iommu);
3909 }
3910 up_read(&dmar_global_lock);
3911
3912 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3913
3914 intel_iommu_enabled = 1;
3915
3916 return 0;
3917
3918 out_free_dmar:
3919 intel_iommu_free_dmars();
3920 up_write(&dmar_global_lock);
3921 return ret;
3922 }
3923
3924 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3925 {
3926 struct device_domain_info *info = opaque;
3927
3928 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3929 return 0;
3930 }
3931
3932 /*
3933 * NB - intel-iommu lacks any sort of reference counting for the users of
3934 * dependent devices. If multiple endpoints have intersecting dependent
3935 * devices, unbinding the driver from any one of them will possibly leave
3936 * the others unable to operate.
3937 */
3938 static void domain_context_clear(struct device_domain_info *info)
3939 {
3940 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3941 return;
3942
3943 pci_for_each_dma_alias(to_pci_dev(info->dev),
3944 &domain_context_clear_one_cb, info);
3945 }
3946
3947 static void dmar_remove_one_dev_info(struct device *dev)
3948 {
3949 struct device_domain_info *info = dev_iommu_priv_get(dev);
3950 struct dmar_domain *domain = info->domain;
3951 struct intel_iommu *iommu = info->iommu;
3952 unsigned long flags;
3953
3954 if (!dev_is_real_dma_subdevice(info->dev)) {
3955 if (dev_is_pci(info->dev) && sm_supported(iommu))
3956 intel_pasid_tear_down_entry(iommu, info->dev,
3957 IOMMU_NO_PASID, false);
3958
3959 iommu_disable_pci_caps(info);
3960 domain_context_clear(info);
3961 }
3962
3963 spin_lock_irqsave(&domain->lock, flags);
3964 list_del(&info->link);
3965 spin_unlock_irqrestore(&domain->lock, flags);
3966
3967 domain_detach_iommu(domain, iommu);
3968 info->domain = NULL;
3969 }
3970
3971 /*
3972 * Clear the page table pointer in context or pasid table entries so that
3973 * all DMA requests without PASID from the device are blocked. If the page
3974 * table has been set, clean up the data structures.
3975 */
3976 static void device_block_translation(struct device *dev)
3977 {
3978 struct device_domain_info *info = dev_iommu_priv_get(dev);
3979 struct intel_iommu *iommu = info->iommu;
3980 unsigned long flags;
3981
3982 iommu_disable_pci_caps(info);
3983 if (!dev_is_real_dma_subdevice(dev)) {
3984 if (sm_supported(iommu))
3985 intel_pasid_tear_down_entry(iommu, dev,
3986 IOMMU_NO_PASID, false);
3987 else
3988 domain_context_clear(info);
3989 }
3990
3991 if (!info->domain)
3992 return;
3993
3994 spin_lock_irqsave(&info->domain->lock, flags);
3995 list_del(&info->link);
3996 spin_unlock_irqrestore(&info->domain->lock, flags);
3997
3998 domain_detach_iommu(info->domain, iommu);
3999 info->domain = NULL;
4000 }
4001
4002 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4003 {
4004 int adjust_width;
4005
4006 /* calculate AGAW */
4007 domain->gaw = guest_width;
4008 adjust_width = guestwidth_to_adjustwidth(guest_width);
4009 domain->agaw = width_to_agaw(adjust_width);
4010
4011 domain->iommu_coherency = false;
4012 domain->iommu_superpage = 0;
4013 domain->max_addr = 0;
4014
4015 /* always allocate the top pgd */
4016 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4017 if (!domain->pgd)
4018 return -ENOMEM;
4019 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4020 return 0;
4021 }
4022
4023 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4024 struct device *dev)
4025 {
4026 device_block_translation(dev);
4027 return 0;
4028 }
4029
4030 static struct iommu_domain blocking_domain = {
4031 .ops = &(const struct iommu_domain_ops) {
4032 .attach_dev = blocking_domain_attach_dev,
4033 .free = intel_iommu_domain_free
4034 }
4035 };
4036
4037 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4038 {
4039 struct dmar_domain *dmar_domain;
4040 struct iommu_domain *domain;
4041
4042 switch (type) {
4043 case IOMMU_DOMAIN_BLOCKED:
4044 return &blocking_domain;
4045 case IOMMU_DOMAIN_DMA:
4046 case IOMMU_DOMAIN_UNMANAGED:
4047 dmar_domain = alloc_domain(type);
4048 if (!dmar_domain) {
4049 pr_err("Can't allocate dmar_domain\n");
4050 return NULL;
4051 }
4052 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4053 pr_err("Domain initialization failed\n");
4054 domain_exit(dmar_domain);
4055 return NULL;
4056 }
4057
4058 domain = &dmar_domain->domain;
4059 domain->geometry.aperture_start = 0;
4060 domain->geometry.aperture_end =
4061 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4062 domain->geometry.force_aperture = true;
4063
4064 return domain;
4065 case IOMMU_DOMAIN_IDENTITY:
4066 return &si_domain->domain;
4067 case IOMMU_DOMAIN_SVA:
4068 return intel_svm_domain_alloc();
4069 default:
4070 return NULL;
4071 }
4072
4073 return NULL;
4074 }
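/*
 * Worked example (illustrative only, not driver code): for an
 * IOMMU_DOMAIN_UNMANAGED request, md_domain_init() runs with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH (57), so the returned domain reports:
 *
 *	domain->geometry.aperture_start == 0
 *	domain->geometry.aperture_end   == (1ULL << 57) - 1
 *	domain->geometry.force_aperture == true
 *
 * Callers normally reach this op through iommu_domain_alloc() rather
 * than invoking it directly; IOMMU_DOMAIN_BLOCKED and
 * IOMMU_DOMAIN_IDENTITY return shared singletons that are never freed
 * by intel_iommu_domain_free().
 */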
4075
4076 static void intel_iommu_domain_free(struct iommu_domain *domain)
4077 {
4078 if (domain != &si_domain->domain && domain != &blocking_domain)
4079 domain_exit(to_dmar_domain(domain));
4080 }
4081
4082 static int prepare_domain_attach_device(struct iommu_domain *domain,
4083 struct device *dev)
4084 {
4085 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4086 struct intel_iommu *iommu;
4087 int addr_width;
4088
4089 iommu = device_to_iommu(dev, NULL, NULL);
4090 if (!iommu)
4091 return -ENODEV;
4092
4093 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4094 return -EINVAL;
4095
4096 /* check if this iommu agaw is sufficient for max mapped address */
4097 addr_width = agaw_to_width(iommu->agaw);
4098 if (addr_width > cap_mgaw(iommu->cap))
4099 addr_width = cap_mgaw(iommu->cap);
4100
4101 if (dmar_domain->max_addr > (1LL << addr_width))
4102 return -EINVAL;
4103 dmar_domain->gaw = addr_width;
4104
4105 /*
4106 * Knock out extra levels of page tables if necessary
4107 */
4108 while (iommu->agaw < dmar_domain->agaw) {
4109 struct dma_pte *pte;
4110
4111 pte = dmar_domain->pgd;
4112 if (dma_pte_present(pte)) {
4113 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4114 free_pgtable_page(pte);
4115 }
4116 dmar_domain->agaw--;
4117 }
4118
4119 return 0;
4120 }
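/*
 * Worked example of the agaw "knock down" above (illustrative, values
 * assumed): a domain created with the default 57-bit width uses agaw 3
 * (five page-table levels). Attaching it to an IOMMU whose own agaw is
 * 2 (48-bit, four levels) with MGAW 48 clamps addr_width to 48; the
 * attach fails with -EINVAL if the domain already has mappings
 * extending beyond 1ULL << 48, otherwise the loop pops the unused top
 * table level until dmar_domain->agaw matches iommu->agaw.
 */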
4121
4122 static int intel_iommu_attach_device(struct iommu_domain *domain,
4123 struct device *dev)
4124 {
4125 struct device_domain_info *info = dev_iommu_priv_get(dev);
4126 int ret;
4127
4128 if (info->domain)
4129 device_block_translation(dev);
4130
4131 ret = prepare_domain_attach_device(domain, dev);
4132 if (ret)
4133 return ret;
4134
4135 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4136 }
4137
4138 static int intel_iommu_map(struct iommu_domain *domain,
4139 unsigned long iova, phys_addr_t hpa,
4140 size_t size, int iommu_prot, gfp_t gfp)
4141 {
4142 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4143 u64 max_addr;
4144 int prot = 0;
4145
4146 if (iommu_prot & IOMMU_READ)
4147 prot |= DMA_PTE_READ;
4148 if (iommu_prot & IOMMU_WRITE)
4149 prot |= DMA_PTE_WRITE;
4150 if (dmar_domain->set_pte_snp)
4151 prot |= DMA_PTE_SNP;
4152
4153 max_addr = iova + size;
4154 if (dmar_domain->max_addr < max_addr) {
4155 u64 end;
4156
4157 /* check if minimum agaw is sufficient for mapped address */
4158 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4159 if (end < max_addr) {
4160 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4161 __func__, dmar_domain->gaw, max_addr);
4163 return -EFAULT;
4164 }
4165 dmar_domain->max_addr = max_addr;
4166 }
4167 /* Round up size to next multiple of PAGE_SIZE, if it and
4168 the low bits of hpa would take us onto the next page */
4169 size = aligned_nrpages(hpa, size);
4170 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4171 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4172 }
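/*
 * Hedged sketch of the protection-bit translation above (not driver
 * code): a read/write mapping on a domain with set_pte_snp ends up with
 *
 *	prot == DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP
 *
 * and the size is rounded to whole pages by aligned_nrpages() before
 * __domain_mapping() installs the PTEs at VTD_PAGE_SHIFT granularity.
 */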
4173
4174 static int intel_iommu_map_pages(struct iommu_domain *domain,
4175 unsigned long iova, phys_addr_t paddr,
4176 size_t pgsize, size_t pgcount,
4177 int prot, gfp_t gfp, size_t *mapped)
4178 {
4179 unsigned long pgshift = __ffs(pgsize);
4180 size_t size = pgcount << pgshift;
4181 int ret;
4182
4183 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4184 return -EINVAL;
4185
4186 if (!IS_ALIGNED(iova | paddr, pgsize))
4187 return -EINVAL;
4188
4189 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4190 if (!ret && mapped)
4191 *mapped = size;
4192
4193 return ret;
4194 }
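/*
 * Usage sketch (illustrative only; values are assumptions): the core
 * hands this op a uniform page size and count, so mapping 64 KiB at
 * 4 KiB granularity looks roughly like
 *
 *	size_t mapped = 0;
 *	int ret = intel_iommu_map_pages(domain, iova, paddr, SZ_4K, 16,
 *					IOMMU_READ | IOMMU_WRITE,
 *					GFP_KERNEL, &mapped);
 *
 * with iova and paddr 4 KiB aligned and, on success, mapped == SZ_64K.
 * The op also accepts SZ_2M and SZ_1G, although the advertised
 * pgsize_bitmap below is SZ_4K; superpages can still be created
 * internally by __domain_mapping() when a range allows it.
 */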
4195
4196 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4197 unsigned long iova, size_t size,
4198 struct iommu_iotlb_gather *gather)
4199 {
4200 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4201 unsigned long start_pfn, last_pfn;
4202 int level = 0;
4203
4204 /* Cope with the horrid API, which requires us to unmap more than
4205 * the size argument if it happens to be a large-page mapping. */
4206 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4207 &level, GFP_ATOMIC)))
4208 return 0;
4209
4210 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4211 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4212
4213 start_pfn = iova >> VTD_PAGE_SHIFT;
4214 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4215
4216 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4217
4218 if (dmar_domain->max_addr == iova + size)
4219 dmar_domain->max_addr = iova;
4220
4221 /*
4222 * We do not use page-selective IOTLB invalidation in the flush
4223 * queue, so there is no need to track pages and sync the iotlb.
4224 */
4225 if (!iommu_iotlb_gather_queued(gather))
4226 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4227
4228 return size;
4229 }
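/*
 * Example of the large-page behaviour noted above (illustrative): if a
 * 2 MiB superpage is mapped at iova and the caller asks to unmap only
 * 4 KiB of it, pfn_to_dma_pte() reports level 2, so size is bumped to
 * VTD_PAGE_SIZE << 9 (2 MiB) and the whole superpage is torn down; the
 * returned size tells the core how much was actually unmapped.
 */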
4230
4231 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4232 unsigned long iova,
4233 size_t pgsize, size_t pgcount,
4234 struct iommu_iotlb_gather *gather)
4235 {
4236 unsigned long pgshift = __ffs(pgsize);
4237 size_t size = pgcount << pgshift;
4238
4239 return intel_iommu_unmap(domain, iova, size, gather);
4240 }
4241
4242 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4243 struct iommu_iotlb_gather *gather)
4244 {
4245 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4246 unsigned long iova_pfn = IOVA_PFN(gather->start);
4247 size_t size = gather->end - gather->start;
4248 struct iommu_domain_info *info;
4249 unsigned long start_pfn;
4250 unsigned long nrpages;
4251 unsigned long i;
4252
4253 nrpages = aligned_nrpages(gather->start, size);
4254 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4255
4256 xa_for_each(&dmar_domain->iommu_array, i, info)
4257 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4258 start_pfn, nrpages,
4259 list_empty(&gather->freelist), 0);
4260
4261 put_pages_list(&gather->freelist);
4262 }
4263
4264 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4265 dma_addr_t iova)
4266 {
4267 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4268 struct dma_pte *pte;
4269 int level = 0;
4270 u64 phys = 0;
4271
4272 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4273 GFP_ATOMIC);
4274 if (pte && dma_pte_present(pte))
4275 phys = dma_pte_addr(pte) +
4276 (iova & (BIT_MASK(level_to_offset_bits(level) +
4277 VTD_PAGE_SHIFT) - 1));
4278
4279 return phys;
4280 }
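/*
 * Worked example (illustrative): a translation that resolves to a
 * 4 KiB leaf returns level 1, so the offset mask is
 * BIT_MASK(0 + VTD_PAGE_SHIFT) - 1 == 0xfff and
 * phys == dma_pte_addr(pte) + (iova & 0xfff). For a 2 MiB superpage
 * (level 2) the mask grows to 0x1fffff, the offset within the
 * superpage.
 */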
4281
4282 static bool domain_support_force_snooping(struct dmar_domain *domain)
4283 {
4284 struct device_domain_info *info;
4285 bool support = true;
4286
4287 assert_spin_locked(&domain->lock);
4288 list_for_each_entry(info, &domain->devices, link) {
4289 if (!ecap_sc_support(info->iommu->ecap)) {
4290 support = false;
4291 break;
4292 }
4293 }
4294
4295 return support;
4296 }
4297
4298 static void domain_set_force_snooping(struct dmar_domain *domain)
4299 {
4300 struct device_domain_info *info;
4301
4302 assert_spin_locked(&domain->lock);
4303 /*
4304 * Second level page table supports per-PTE snoop control. The
4305 * iommu_map() interface will handle this by setting SNP bit.
4306 */
4307 if (!domain->use_first_level) {
4308 domain->set_pte_snp = true;
4309 return;
4310 }
4311
4312 list_for_each_entry(info, &domain->devices, link)
4313 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4314 IOMMU_NO_PASID);
4315 }
4316
4317 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4318 {
4319 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4320 unsigned long flags;
4321
4322 if (dmar_domain->force_snooping)
4323 return true;
4324
4325 spin_lock_irqsave(&dmar_domain->lock, flags);
4326 if (!domain_support_force_snooping(dmar_domain)) {
4327 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4328 return false;
4329 }
4330
4331 domain_set_force_snooping(dmar_domain);
4332 dmar_domain->force_snooping = true;
4333 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4334
4335 return true;
4336 }
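/*
 * Hedged note (illustration only): callers such as VFIO or iommufd
 * reach this through the iommu core's enforce_cache_coherency op
 * before relying on coherent DMA. For second-level domains the effect
 * is deferred: set_pte_snp makes later intel_iommu_map() calls set
 * DMA_PTE_SNP. First-level domains instead update the PASID entries of
 * already-attached devices immediately via
 * intel_pasid_setup_page_snoop_control().
 */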
4337
4338 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4339 {
4340 struct device_domain_info *info = dev_iommu_priv_get(dev);
4341
4342 switch (cap) {
4343 case IOMMU_CAP_CACHE_COHERENCY:
4344 case IOMMU_CAP_DEFERRED_FLUSH:
4345 return true;
4346 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4347 return dmar_platform_optin();
4348 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4349 return ecap_sc_support(info->iommu->ecap);
4350 default:
4351 return false;
4352 }
4353 }
4354
4355 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4356 {
4357 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4358 struct device_domain_info *info;
4359 struct intel_iommu *iommu;
4360 u8 bus, devfn;
4361 int ret;
4362
4363 iommu = device_to_iommu(dev, &bus, &devfn);
4364 if (!iommu || !iommu->iommu.ops)
4365 return ERR_PTR(-ENODEV);
4366
4367 info = kzalloc(sizeof(*info), GFP_KERNEL);
4368 if (!info)
4369 return ERR_PTR(-ENOMEM);
4370
4371 if (dev_is_real_dma_subdevice(dev)) {
4372 info->bus = pdev->bus->number;
4373 info->devfn = pdev->devfn;
4374 info->segment = pci_domain_nr(pdev->bus);
4375 } else {
4376 info->bus = bus;
4377 info->devfn = devfn;
4378 info->segment = iommu->segment;
4379 }
4380
4381 info->dev = dev;
4382 info->iommu = iommu;
4383 if (dev_is_pci(dev)) {
4384 if (ecap_dev_iotlb_support(iommu->ecap) &&
4385 pci_ats_supported(pdev) &&
4386 dmar_ats_supported(pdev, iommu)) {
4387 info->ats_supported = 1;
4388 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4389
4390 /*
4391 * For an IOMMU that supports device IOTLB throttling
4392 * (DIT), we assign the PFSID to the invalidation desc
4393 * of a VF so that the IOMMU HW can gauge queue depth
4394 * at the PF level. If DIT is not supported, the PFSID
4395 * field is treated as reserved and must be set to 0.
4396 */
4397 if (ecap_dit(iommu->ecap))
4398 info->pfsid = pci_dev_id(pci_physfn(pdev));
4399 info->ats_qdep = pci_ats_queue_depth(pdev);
4400 }
4401 if (sm_supported(iommu)) {
4402 if (pasid_supported(iommu)) {
4403 int features = pci_pasid_features(pdev);
4404
4405 if (features >= 0)
4406 info->pasid_supported = features | 1;
4407 }
4408
4409 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4410 pci_pri_supported(pdev))
4411 info->pri_supported = 1;
4412 }
4413 }
4414
4415 dev_iommu_priv_set(dev, info);
4416
4417 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4418 ret = intel_pasid_alloc_table(dev);
4419 if (ret) {
4420 dev_err(dev, "PASID table allocation failed\n");
4421 dev_iommu_priv_set(dev, NULL);
4422 kfree(info);
4423 return ERR_PTR(ret);
4424 }
4425 }
4426
4427 return &iommu->iommu;
4428 }
4429
4430 static void intel_iommu_release_device(struct device *dev)
4431 {
4432 struct device_domain_info *info = dev_iommu_priv_get(dev);
4433
4434 dmar_remove_one_dev_info(dev);
4435 intel_pasid_free_table(dev);
4436 dev_iommu_priv_set(dev, NULL);
4437 kfree(info);
4438 set_dma_ops(dev, NULL);
4439 }
4440
4441 static void intel_iommu_probe_finalize(struct device *dev)
4442 {
4443 set_dma_ops(dev, NULL);
4444 iommu_setup_dma_ops(dev, 0, U64_MAX);
4445 }
4446
4447 static void intel_iommu_get_resv_regions(struct device *device,
4448 struct list_head *head)
4449 {
4450 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4451 struct iommu_resv_region *reg;
4452 struct dmar_rmrr_unit *rmrr;
4453 struct device *i_dev;
4454 int i;
4455
4456 rcu_read_lock();
4457 for_each_rmrr_units(rmrr) {
4458 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4459 i, i_dev) {
4460 struct iommu_resv_region *resv;
4461 enum iommu_resv_type type;
4462 size_t length;
4463
4464 if (i_dev != device &&
4465 !is_downstream_to_pci_bridge(device, i_dev))
4466 continue;
4467
4468 length = rmrr->end_address - rmrr->base_address + 1;
4469
4470 type = device_rmrr_is_relaxable(device) ?
4471 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4472
4473 resv = iommu_alloc_resv_region(rmrr->base_address,
4474 length, prot, type,
4475 GFP_ATOMIC);
4476 if (!resv)
4477 break;
4478
4479 list_add_tail(&resv->list, head);
4480 }
4481 }
4482 rcu_read_unlock();
4483
4484 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4485 if (dev_is_pci(device)) {
4486 struct pci_dev *pdev = to_pci_dev(device);
4487
4488 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4489 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4490 IOMMU_RESV_DIRECT_RELAXABLE,
4491 GFP_KERNEL);
4492 if (reg)
4493 list_add_tail(&reg->list, head);
4494 }
4495 }
4496 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4497
4498 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4499 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4500 0, IOMMU_RESV_MSI, GFP_KERNEL);
4501 if (!reg)
4502 return;
4503 list_add_tail(&reg->list, head);
4504 }
4505
4506 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4507 {
4508 if (dev_is_pci(dev))
4509 return pci_device_group(dev);
4510 return generic_device_group(dev);
4511 }
4512
4513 static int intel_iommu_enable_sva(struct device *dev)
4514 {
4515 struct device_domain_info *info = dev_iommu_priv_get(dev);
4516 struct intel_iommu *iommu;
4517
4518 if (!info || dmar_disabled)
4519 return -EINVAL;
4520
4521 iommu = info->iommu;
4522 if (!iommu)
4523 return -EINVAL;
4524
4525 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4526 return -ENODEV;
4527
4528 if (!info->pasid_enabled || !info->ats_enabled)
4529 return -EINVAL;
4530
4531 /*
4532 * Devices with device-specific I/O fault handling should not
4533 * support PCI/PRI. The IOMMU side has no means to check the
4534 * capability of device-specific IOPF, so it can only assume
4535 * that if the device driver enables SVA on a non-PRI device,
4536 * the driver will handle IOPF in its own way.
4537 */
4538 if (!info->pri_supported)
4539 return 0;
4540
4541 /* Devices supporting PRI should have it enabled. */
4542 if (!info->pri_enabled)
4543 return -EINVAL;
4544
4545 return 0;
4546 }
4547
4548 static int intel_iommu_enable_iopf(struct device *dev)
4549 {
4550 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4551 struct device_domain_info *info = dev_iommu_priv_get(dev);
4552 struct intel_iommu *iommu;
4553 int ret;
4554
4555 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4556 return -ENODEV;
4557
4558 if (info->pri_enabled)
4559 return -EBUSY;
4560
4561 iommu = info->iommu;
4562 if (!iommu)
4563 return -EINVAL;
4564
4565 /* PASID is required in PRG Response Message. */
4566 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4567 return -EINVAL;
4568
4569 ret = pci_reset_pri(pdev);
4570 if (ret)
4571 return ret;
4572
4573 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4574 if (ret)
4575 return ret;
4576
4577 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4578 if (ret)
4579 goto iopf_remove_device;
4580
4581 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4582 if (ret)
4583 goto iopf_unregister_handler;
4584 info->pri_enabled = 1;
4585
4586 return 0;
4587
4588 iopf_unregister_handler:
4589 iommu_unregister_device_fault_handler(dev);
4590 iopf_remove_device:
4591 iopf_queue_remove_device(iommu->iopf_queue, dev);
4592
4593 return ret;
4594 }
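/*
 * Descriptive note on the sequence above: PRI is reset first, the
 * device is added to the IOMMU's IOPF queue, a fault handler is
 * registered, and only then is PRI enabled with PRQ_DEPTH credits.
 * On failure the steps are unwound in reverse order so the device is
 * never left with PRI enabled but no registered handler.
 */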
4595
4596 static int intel_iommu_disable_iopf(struct device *dev)
4597 {
4598 struct device_domain_info *info = dev_iommu_priv_get(dev);
4599 struct intel_iommu *iommu = info->iommu;
4600
4601 if (!info->pri_enabled)
4602 return -EINVAL;
4603
4604 /*
4605 * The PCIe spec states that once the PRI enable bit is cleared,
4606 * the Page Request Interface issues no new page requests, but
4607 * requests already transmitted or queued for transmission may
4608 * still be outstanding. This is supposed to be called after
4609 * the device driver has stopped DMA, all PASIDs have been
4610 * unbound and the outstanding PRQs have been drained.
4611 */
4612 pci_disable_pri(to_pci_dev(dev));
4613 info->pri_enabled = 0;
4614
4615 /*
4616 * With PRI disabled and outstanding PRQs drained, unregistering
4617 * fault handler and removing device from iopf queue should never
4618 * fail.
4619 */
4620 WARN_ON(iommu_unregister_device_fault_handler(dev));
4621 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4622
4623 return 0;
4624 }
4625
4626 static int
4627 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4628 {
4629 switch (feat) {
4630 case IOMMU_DEV_FEAT_IOPF:
4631 return intel_iommu_enable_iopf(dev);
4632
4633 case IOMMU_DEV_FEAT_SVA:
4634 return intel_iommu_enable_sva(dev);
4635
4636 default:
4637 return -ENODEV;
4638 }
4639 }
4640
4641 static int
4642 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4643 {
4644 switch (feat) {
4645 case IOMMU_DEV_FEAT_IOPF:
4646 return intel_iommu_disable_iopf(dev);
4647
4648 case IOMMU_DEV_FEAT_SVA:
4649 return 0;
4650
4651 default:
4652 return -ENODEV;
4653 }
4654 }
4655
4656 static bool intel_iommu_is_attach_deferred(struct device *dev)
4657 {
4658 struct device_domain_info *info = dev_iommu_priv_get(dev);
4659
4660 return translation_pre_enabled(info->iommu) && !info->domain;
4661 }
4662
4663 /*
4664 * Check that the device does not live on an external-facing PCI port that is
4665 * marked as untrusted. Such devices should not be allowed to apply quirks and
4666 * thus bypass the IOMMU restrictions.
4667 */
4668 static bool risky_device(struct pci_dev *pdev)
4669 {
4670 if (pdev->untrusted) {
4671 pci_info(pdev,
4672 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4673 pdev->vendor, pdev->device);
4674 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4675 return true;
4676 }
4677 return false;
4678 }
4679
4680 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4681 unsigned long iova, size_t size)
4682 {
4683 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4684 unsigned long pages = aligned_nrpages(iova, size);
4685 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4686 struct iommu_domain_info *info;
4687 unsigned long i;
4688
4689 xa_for_each(&dmar_domain->iommu_array, i, info)
4690 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4691 }
4692
4693 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4694 {
4695 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4696 struct dev_pasid_info *curr, *dev_pasid = NULL;
4697 struct dmar_domain *dmar_domain;
4698 struct iommu_domain *domain;
4699 unsigned long flags;
4700
4701 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4702 if (WARN_ON_ONCE(!domain))
4703 goto out_tear_down;
4704
4705 /*
4706 * The SVA implementation needs to handle its own details, such as
4707 * the mm notification. Until that code is consolidated into the
4708 * iommu core, let the intel sva code handle it.
4709 */
4710 if (domain->type == IOMMU_DOMAIN_SVA) {
4711 intel_svm_remove_dev_pasid(dev, pasid);
4712 goto out_tear_down;
4713 }
4714
4715 dmar_domain = to_dmar_domain(domain);
4716 spin_lock_irqsave(&dmar_domain->lock, flags);
4717 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4718 if (curr->dev == dev && curr->pasid == pasid) {
4719 list_del(&curr->link_domain);
4720 dev_pasid = curr;
4721 break;
4722 }
4723 }
4724 WARN_ON_ONCE(!dev_pasid);
4725 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4726
4727 domain_detach_iommu(dmar_domain, iommu);
4728 kfree(dev_pasid);
4729 out_tear_down:
4730 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4731 intel_drain_pasid_prq(dev, pasid);
4732 }
4733
4734 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4735 struct device *dev, ioasid_t pasid)
4736 {
4737 struct device_domain_info *info = dev_iommu_priv_get(dev);
4738 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4739 struct intel_iommu *iommu = info->iommu;
4740 struct dev_pasid_info *dev_pasid;
4741 unsigned long flags;
4742 int ret;
4743
4744 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4745 return -EOPNOTSUPP;
4746
4747 if (context_copied(iommu, info->bus, info->devfn))
4748 return -EBUSY;
4749
4750 ret = prepare_domain_attach_device(domain, dev);
4751 if (ret)
4752 return ret;
4753
4754 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4755 if (!dev_pasid)
4756 return -ENOMEM;
4757
4758 ret = domain_attach_iommu(dmar_domain, iommu);
4759 if (ret)
4760 goto out_free;
4761
4762 if (domain_type_is_si(dmar_domain))
4763 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4764 dev, pasid);
4765 else if (dmar_domain->use_first_level)
4766 ret = domain_setup_first_level(iommu, dmar_domain,
4767 dev, pasid);
4768 else
4769 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4770 dev, pasid);
4771 if (ret)
4772 goto out_detach_iommu;
4773
4774 dev_pasid->dev = dev;
4775 dev_pasid->pasid = pasid;
4776 spin_lock_irqsave(&dmar_domain->lock, flags);
4777 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4778 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4779
4780 return 0;
4781 out_detach_iommu:
4782 domain_detach_iommu(dmar_domain, iommu);
4783 out_free:
4784 kfree(dev_pasid);
4785 return ret;
4786 }
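/*
 * Descriptive note on the three setup paths above: an identity
 * (si_domain) attachment programs the PASID entry in pass-through
 * mode, a first-level domain points the entry at the first-stage page
 * table, and any other domain uses second-stage translation. In every
 * case the dev_pasid entry added to dmar_domain->dev_pasids is what
 * later lets intel_iommu_remove_dev_pasid() find and tear the mapping
 * down.
 */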
4787
4788 const struct iommu_ops intel_iommu_ops = {
4789 .capable = intel_iommu_capable,
4790 .domain_alloc = intel_iommu_domain_alloc,
4791 .probe_device = intel_iommu_probe_device,
4792 .probe_finalize = intel_iommu_probe_finalize,
4793 .release_device = intel_iommu_release_device,
4794 .get_resv_regions = intel_iommu_get_resv_regions,
4795 .device_group = intel_iommu_device_group,
4796 .dev_enable_feat = intel_iommu_dev_enable_feat,
4797 .dev_disable_feat = intel_iommu_dev_disable_feat,
4798 .is_attach_deferred = intel_iommu_is_attach_deferred,
4799 .def_domain_type = device_def_domain_type,
4800 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4801 .pgsize_bitmap = SZ_4K,
4802 #ifdef CONFIG_INTEL_IOMMU_SVM
4803 .page_response = intel_svm_page_response,
4804 #endif
4805 .default_domain_ops = &(const struct iommu_domain_ops) {
4806 .attach_dev = intel_iommu_attach_device,
4807 .set_dev_pasid = intel_iommu_set_dev_pasid,
4808 .map_pages = intel_iommu_map_pages,
4809 .unmap_pages = intel_iommu_unmap_pages,
4810 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4811 .flush_iotlb_all = intel_flush_iotlb_all,
4812 .iotlb_sync = intel_iommu_tlb_sync,
4813 .iova_to_phys = intel_iommu_iova_to_phys,
4814 .free = intel_iommu_domain_free,
4815 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4816 }
4817 };
4818
4819 static void quirk_iommu_igfx(struct pci_dev *dev)
4820 {
4821 if (risky_device(dev))
4822 return;
4823
4824 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4825 dmar_map_gfx = 0;
4826 }
4827
4828 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4836
4837 /* Broadwell igfx malfunctions with dmar */
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4862
4863 static void quirk_iommu_rwbf(struct pci_dev *dev)
4864 {
4865 if (risky_device(dev))
4866 return;
4867
4868 /*
4869 * Mobile 4 Series Chipset neglects to set RWBF capability,
4870 * but needs it. Same seems to hold for the desktop versions.
4871 */
4872 pci_info(dev, "Forcing write-buffer flush capability\n");
4873 rwbf_quirk = 1;
4874 }
4875
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4883
4884 #define GGC 0x52
4885 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4886 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4887 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4888 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4889 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4890 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4891 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4892 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4893
4894 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4895 {
4896 unsigned short ggc;
4897
4898 if (risky_device(dev))
4899 return;
4900
4901 if (pci_read_config_word(dev, GGC, &ggc))
4902 return;
4903
4904 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4905 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4906 dmar_map_gfx = 0;
4907 } else if (dmar_map_gfx) {
4908 /* we have to ensure the gfx device is idle before we flush */
4909 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4910 iommu_set_dma_strict();
4911 }
4912 }
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
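/*
 * Worked example of the GGC check above (values are illustrative):
 * reading GGC (offset 0x52) as 0x0b00 decodes to GGC_MEMORY_SIZE_4M_VT,
 * which has GGC_MEMORY_VT_ENABLED set, so graphics translation stays on
 * and only strict IOTLB flushing is forced. A value of 0x0100
 * (GGC_MEMORY_SIZE_1M, VT bit clear) would instead disable the IOMMU
 * for graphics by clearing dmar_map_gfx.
 */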
4917
4918 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4919 {
4920 unsigned short ver;
4921
4922 if (!IS_GFX_DEVICE(dev))
4923 return;
4924
4925 ver = (dev->device >> 8) & 0xff;
4926 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4927 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4928 ver != 0x9a && ver != 0xa7)
4929 return;
4930
4931 if (risky_device(dev))
4932 return;
4933
4934 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4935 iommu_skip_te_disable = 1;
4936 }
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4938
4939 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4940 * ISOCH DMAR unit for the Azalia sound device, but not give it any
4941 * TLB entries, which causes it to deadlock. Check for that. We do
4942 * this in a function called from init_dmars(), instead of in a PCI
4943 * quirk, because we don't want to print the obnoxious "BIOS broken"
4944 * message if VT-d is actually disabled.
4945 */
4946 static void __init check_tylersburg_isoch(void)
4947 {
4948 struct pci_dev *pdev;
4949 uint32_t vtisochctrl;
4950
4951 /* If there's no Azalia in the system anyway, forget it. */
4952 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4953 if (!pdev)
4954 return;
4955
4956 if (risky_device(pdev)) {
4957 pci_dev_put(pdev);
4958 return;
4959 }
4960
4961 pci_dev_put(pdev);
4962
4963 /* System Management Registers. Might be hidden, in which case
4964 * we can't do the sanity check. But that's OK, because the
4965 * known-broken BIOSes _don't_ actually hide it, so far. */
4966 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4967 if (!pdev)
4968 return;
4969
4970 if (risky_device(pdev)) {
4971 pci_dev_put(pdev);
4972 return;
4973 }
4974
4975 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4976 pci_dev_put(pdev);
4977 return;
4978 }
4979
4980 pci_dev_put(pdev);
4981
4982 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4983 if (vtisochctrl & 1)
4984 return;
4985
4986 /* Drop all bits other than the number of TLB entries */
4987 vtisochctrl &= 0x1c;
4988
4989 /* If we have the recommended number of TLB entries (16), fine. */
4990 if (vtisochctrl == 0x10)
4991 return;
4992
4993 /* Zero TLB entries? That is a hopelessly broken BIOS configuration. */
4994 if (!vtisochctrl) {
4995 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4996 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4997 dmi_get_system_info(DMI_BIOS_VENDOR),
4998 dmi_get_system_info(DMI_BIOS_VERSION),
4999 dmi_get_system_info(DMI_PRODUCT_VERSION));
5000 iommu_identity_mapping |= IDENTMAP_AZALIA;
5001 return;
5002 }
5003
5004 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5005 vtisochctrl);
5006 }
5007
5008 /*
5009 * Here we deal with a device TLB defect where the device may inadvertently
5010 * issue an ATS invalidation completion before posted writes that were
5011 * initiated with a translated address and that used translations matching
5012 * the invalidation address range, violating the invalidation completion
5013 * ordering. Therefore, any use case that cannot guarantee DMA is stopped
5014 * before unmap is vulnerable to this defect. In other words, any dTLB
5015 * invalidation not initiated under the control of the trusted/privileged
5016 * host device driver must use this quirk.
5017 * Device TLBs are invalidated under the following six conditions:
5018 * 1. Device driver does DMA API unmap IOVA
5019 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5020 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5021 * exit_mmap() due to crash
5022 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5023 * VM has to free pages that were unmapped
5024 * 5. Userspace driver unmaps a DMA buffer
5025 * 6. Cache invalidation in vSVA usage (upcoming)
5026 *
5027 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5028 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
5029 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5030 * The dTLB invalidation after PASID cache flush does not need this quirk.
5031 *
5032 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5033 */
5034 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5035 unsigned long address, unsigned long mask,
5036 u32 pasid, u16 qdep)
5037 {
5038 u16 sid;
5039
5040 if (likely(!info->dtlb_extra_inval))
5041 return;
5042
5043 sid = PCI_DEVID(info->bus, info->devfn);
5044 if (pasid == IOMMU_NO_PASID) {
5045 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5046 qdep, address, mask);
5047 } else {
5048 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5049 pasid, qdep, address, mask);
5050 }
5051 }
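/*
 * Descriptive note: this is a no-op for unaffected devices, since
 * info->dtlb_extra_inval is only set at probe time when
 * dev_needs_extra_dtlb_flush() matches. For affected devices it issues
 * one additional device-IOTLB invalidation, PASID-qualified or not, on
 * top of the normal one, as the workaround for the ordering defect
 * described above.
 */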
5052
5053 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5054
5055 /*
5056 * Function to submit a command to the enhanced command interface. The
5057 * valid enhanced command descriptions are defined in Table 47 of the
5058 * VT-d spec. The VT-d hardware implementation may support some but not
5059 * all commands, which can be determined by checking the Enhanced
5060 * Command Capability Register.
5061 *
5062 * Return values:
5063 * - 0: Command successful without any error;
5064 * - Negative: software error value;
5065 * - Nonzero positive: failure status code defined in Table 48.
5066 */
5067 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5068 {
5069 unsigned long flags;
5070 u64 res;
5071 int ret;
5072
5073 if (!cap_ecmds(iommu->cap))
5074 return -ENODEV;
5075
5076 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5077
5078 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5079 if (res & DMA_ECMD_ECRSP_IP) {
5080 ret = -EBUSY;
5081 goto err;
5082 }
5083
5084 /*
5085 * Unconditionally write the operand B, because
5086 * - There is no side effect if an ecmd doesn't require an
5087 * operand B, but we set the register to some value.
5088 * - It's not invoked in any critical path. The extra MMIO
5089 * write doesn't bring any performance concerns.
5090 */
5091 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5092 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5093
5094 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5095 !(res & DMA_ECMD_ECRSP_IP), res);
5096
5097 if (res & DMA_ECMD_ECRSP_IP) {
5098 ret = -ETIMEDOUT;
5099 goto err;
5100 }
5101
5102 ret = ecmd_get_status_code(res);
5103 err:
5104 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5105
5106 return ret;
5107 }
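/*
 * Usage sketch (illustrative only; the ecmd encoding is an assumption,
 * see the VT-d spec tables referenced above for real values):
 *
 *	int ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *
 *	if (ret < 0)
 *		return ret;	// software error, e.g. -ENODEV or -EBUSY
 *	else if (ret > 0)
 *		return -EIO;	// hardware failure status code (Table 48)
 *	// ret == 0: command completed successfully
 */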