// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
31 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69 static inline int agaw_to_level(int agaw
)
74 static inline int agaw_to_width(int agaw
)
76 return min_t(int, 30 + agaw
* LEVEL_STRIDE
, MAX_AGAW_WIDTH
);
79 static inline int width_to_agaw(int width
)
81 return DIV_ROUND_UP(width
- 30, LEVEL_STRIDE
);
84 static inline unsigned int level_to_offset_bits(int level
)
86 return (level
- 1) * LEVEL_STRIDE
;
89 static inline int pfn_level_offset(u64 pfn
, int level
)
91 return (pfn
>> level_to_offset_bits(level
)) & LEVEL_MASK
;
94 static inline u64
level_mask(int level
)
96 return -1ULL << level_to_offset_bits(level
);
99 static inline u64
level_size(int level
)
101 return 1ULL << level_to_offset_bits(level
);
104 static inline u64
align_to_level(u64 pfn
, int level
)
106 return (pfn
+ level_size(level
) - 1) & level_mask(level
);
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl
)
111 return 1UL << min_t(int, (lvl
- 1) * LEVEL_STRIDE
, MAX_AGAW_PFN_WIDTH
);
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn
)
118 return mm_pfn
<< (PAGE_SHIFT
- VTD_PAGE_SHIFT
);
120 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn
)
122 return ((mm_pfn
+ 1) << (PAGE_SHIFT
- VTD_PAGE_SHIFT
)) - 1;
/* First VT-d pfn of struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn_start(page_to_pfn(pg));
}
/* First VT-d pfn of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
133 static void __init
check_tylersburg_isoch(void);
134 static int rwbf_quirk
;
137 * set to 1 to panic kernel if can't successfully enable VT-d
138 * (used when kernel is launched w/ TXT)
140 static int force_on
= 0;
141 static int intel_iommu_tboot_noforce
;
142 static int no_platform_optin
;
144 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
147 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
150 static phys_addr_t
root_entry_lctp(struct root_entry
*re
)
155 return re
->lo
& VTD_PAGE_MASK
;
159 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
162 static phys_addr_t
root_entry_uctp(struct root_entry
*re
)
167 return re
->hi
& VTD_PAGE_MASK
;
170 static inline void context_set_present(struct context_entry
*context
)
175 static inline void context_set_fault_enable(struct context_entry
*context
)
177 context
->lo
&= (((u64
)-1) << 2) | 1;
180 static inline void context_set_translation_type(struct context_entry
*context
,
183 context
->lo
&= (((u64
)-1) << 4) | 3;
184 context
->lo
|= (value
& 3) << 2;
187 static inline void context_set_address_root(struct context_entry
*context
,
190 context
->lo
&= ~VTD_PAGE_MASK
;
191 context
->lo
|= value
& VTD_PAGE_MASK
;
194 static inline void context_set_address_width(struct context_entry
*context
,
197 context
->hi
|= value
& 7;
200 static inline void context_set_domain_id(struct context_entry
*context
,
203 context
->hi
|= (value
& ((1 << 16) - 1)) << 8;
206 static inline void context_set_pasid(struct context_entry
*context
)
208 context
->lo
|= CONTEXT_PASIDE
;
211 static inline int context_domain_id(struct context_entry
*c
)
213 return((c
->hi
>> 8) & 0xffff);
216 static inline void context_clear_entry(struct context_entry
*context
)
222 static inline bool context_copied(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
224 if (!iommu
->copied_tables
)
227 return test_bit(((long)bus
<< 8) | devfn
, iommu
->copied_tables
);
231 set_context_copied(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
233 set_bit(((long)bus
<< 8) | devfn
, iommu
->copied_tables
);
237 clear_context_copied(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
239 clear_bit(((long)bus
<< 8) | devfn
, iommu
->copied_tables
);
243 * This domain is a statically identity mapping domain.
244 * 1. This domain creats a static 1:1 mapping to all usable memory.
245 * 2. It maps to each iommu if successful.
246 * 3. Each iommu mapps to this domain if successful.
248 static struct dmar_domain
*si_domain
;
249 static int hw_pass_through
= 1;
251 struct dmar_rmrr_unit
{
252 struct list_head list
; /* list of rmrr units */
253 struct acpi_dmar_header
*hdr
; /* ACPI header */
254 u64 base_address
; /* reserved base address*/
255 u64 end_address
; /* reserved end address */
256 struct dmar_dev_scope
*devices
; /* target devices */
257 int devices_cnt
; /* target device count */
260 struct dmar_atsr_unit
{
261 struct list_head list
; /* list of ATSR units */
262 struct acpi_dmar_header
*hdr
; /* ACPI header */
263 struct dmar_dev_scope
*devices
; /* target devices */
264 int devices_cnt
; /* target device count */
265 u8 include_all
:1; /* include all ports */
268 struct dmar_satc_unit
{
269 struct list_head list
; /* list of SATC units */
270 struct acpi_dmar_header
*hdr
; /* ACPI header */
271 struct dmar_dev_scope
*devices
; /* target devices */
272 struct intel_iommu
*iommu
; /* the corresponding iommu */
273 int devices_cnt
; /* target device count */
274 u8 atc_required
:1; /* ATS is required */
277 static LIST_HEAD(dmar_atsr_units
);
278 static LIST_HEAD(dmar_rmrr_units
);
279 static LIST_HEAD(dmar_satc_units
);
281 #define for_each_rmrr_units(rmrr) \
282 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284 static void device_block_translation(struct device
*dev
);
285 static void intel_iommu_domain_free(struct iommu_domain
*domain
);
287 int dmar_disabled
= !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON
);
288 int intel_iommu_sm
= IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
);
290 int intel_iommu_enabled
= 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled
);
293 static int dmar_map_gfx
= 1;
294 static int intel_iommu_superpage
= 1;
295 static int iommu_identity_mapping
;
296 static int iommu_skip_te_disable
;
298 #define IDENTMAP_GFX 2
299 #define IDENTMAP_AZALIA 4
301 const struct iommu_ops intel_iommu_ops
;
303 static bool translation_pre_enabled(struct intel_iommu
*iommu
)
305 return (iommu
->flags
& VTD_FLAG_TRANS_PRE_ENABLED
);
308 static void clear_translation_pre_enabled(struct intel_iommu
*iommu
)
310 iommu
->flags
&= ~VTD_FLAG_TRANS_PRE_ENABLED
;
313 static void init_translation_status(struct intel_iommu
*iommu
)
317 gsts
= readl(iommu
->reg
+ DMAR_GSTS_REG
);
318 if (gsts
& DMA_GSTS_TES
)
319 iommu
->flags
|= VTD_FLAG_TRANS_PRE_ENABLED
;
322 static int __init
intel_iommu_setup(char *str
)
328 if (!strncmp(str
, "on", 2)) {
330 pr_info("IOMMU enabled\n");
331 } else if (!strncmp(str
, "off", 3)) {
333 no_platform_optin
= 1;
334 pr_info("IOMMU disabled\n");
335 } else if (!strncmp(str
, "igfx_off", 8)) {
337 pr_info("Disable GFX device mapping\n");
338 } else if (!strncmp(str
, "forcedac", 8)) {
339 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
340 iommu_dma_forcedac
= true;
341 } else if (!strncmp(str
, "strict", 6)) {
342 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
343 iommu_set_dma_strict();
344 } else if (!strncmp(str
, "sp_off", 6)) {
345 pr_info("Disable supported super page\n");
346 intel_iommu_superpage
= 0;
347 } else if (!strncmp(str
, "sm_on", 5)) {
348 pr_info("Enable scalable mode if hardware supports\n");
350 } else if (!strncmp(str
, "sm_off", 6)) {
351 pr_info("Scalable mode is disallowed\n");
353 } else if (!strncmp(str
, "tboot_noforce", 13)) {
354 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
355 intel_iommu_tboot_noforce
= 1;
357 pr_notice("Unknown option - '%s'\n", str
);
360 str
+= strcspn(str
, ",");
367 __setup("intel_iommu=", intel_iommu_setup
);
369 void *alloc_pgtable_page(int node
, gfp_t gfp
)
374 page
= alloc_pages_node(node
, gfp
| __GFP_ZERO
, 0);
376 vaddr
= page_address(page
);
/* Free one page-table page previously obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
385 static inline int domain_type_is_si(struct dmar_domain
*domain
)
387 return domain
->domain
.type
== IOMMU_DOMAIN_IDENTITY
;
390 static inline int domain_pfn_supported(struct dmar_domain
*domain
,
393 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
395 return !(addr_width
< BITS_PER_LONG
&& pfn
>> addr_width
);
399 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401 * the returned SAGAW.
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu
*iommu
)
405 unsigned long fl_sagaw
, sl_sagaw
;
407 fl_sagaw
= BIT(2) | (cap_fl5lp_support(iommu
->cap
) ? BIT(3) : 0);
408 sl_sagaw
= cap_sagaw(iommu
->cap
);
410 /* Second level only. */
411 if (!sm_supported(iommu
) || !ecap_flts(iommu
->ecap
))
414 /* First level only. */
415 if (!ecap_slts(iommu
->ecap
))
418 return fl_sagaw
& sl_sagaw
;
421 static int __iommu_calculate_agaw(struct intel_iommu
*iommu
, int max_gaw
)
426 sagaw
= __iommu_calculate_sagaw(iommu
);
427 for (agaw
= width_to_agaw(max_gaw
); agaw
>= 0; agaw
--) {
428 if (test_bit(agaw
, &sagaw
))
436 * Calculate max SAGAW for each iommu.
438 int iommu_calculate_max_sagaw(struct intel_iommu
*iommu
)
440 return __iommu_calculate_agaw(iommu
, MAX_AGAW_WIDTH
);
444 * calculate agaw for each iommu.
445 * "SAGAW" may be different across iommus, use a default agaw, and
446 * get a supported less agaw for iommus that don't support the default agaw.
448 int iommu_calculate_agaw(struct intel_iommu
*iommu
)
450 return __iommu_calculate_agaw(iommu
, DEFAULT_DOMAIN_ADDRESS_WIDTH
);
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu
*iommu
)
455 return sm_supported(iommu
) ?
456 ecap_smpwc(iommu
->ecap
) : ecap_coherent(iommu
->ecap
);
459 static void domain_update_iommu_coherency(struct dmar_domain
*domain
)
461 struct iommu_domain_info
*info
;
462 struct dmar_drhd_unit
*drhd
;
463 struct intel_iommu
*iommu
;
467 domain
->iommu_coherency
= true;
468 xa_for_each(&domain
->iommu_array
, i
, info
) {
470 if (!iommu_paging_structure_coherency(info
->iommu
)) {
471 domain
->iommu_coherency
= false;
478 /* No hardware attached; use lowest common denominator */
480 for_each_active_iommu(iommu
, drhd
) {
481 if (!iommu_paging_structure_coherency(iommu
)) {
482 domain
->iommu_coherency
= false;
489 static int domain_update_iommu_superpage(struct dmar_domain
*domain
,
490 struct intel_iommu
*skip
)
492 struct dmar_drhd_unit
*drhd
;
493 struct intel_iommu
*iommu
;
496 if (!intel_iommu_superpage
)
499 /* set iommu_superpage to the smallest common denominator */
501 for_each_active_iommu(iommu
, drhd
) {
503 if (domain
&& domain
->use_first_level
) {
504 if (!cap_fl1gp_support(iommu
->cap
))
507 mask
&= cap_super_page_val(iommu
->cap
);
519 static int domain_update_device_node(struct dmar_domain
*domain
)
521 struct device_domain_info
*info
;
522 int nid
= NUMA_NO_NODE
;
525 spin_lock_irqsave(&domain
->lock
, flags
);
526 list_for_each_entry(info
, &domain
->devices
, link
) {
528 * There could possibly be multiple device numa nodes as devices
529 * within the same domain may sit behind different IOMMUs. There
530 * isn't perfect answer in such situation, so we select first
531 * come first served policy.
533 nid
= dev_to_node(info
->dev
);
534 if (nid
!= NUMA_NO_NODE
)
537 spin_unlock_irqrestore(&domain
->lock
, flags
);
542 static void domain_update_iotlb(struct dmar_domain
*domain
);
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain
*domain
)
547 unsigned long bitmap
= 0;
550 * 1-level super page supports page size of 2MiB, 2-level super page
551 * supports page size of both 2MiB and 1GiB.
553 if (domain
->iommu_superpage
== 1)
555 else if (domain
->iommu_superpage
== 2)
556 bitmap
|= SZ_2M
| SZ_1G
;
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain
*domain
)
564 domain_update_iommu_coherency(domain
);
565 domain
->iommu_superpage
= domain_update_iommu_superpage(domain
, NULL
);
568 * If RHSA is missing, we should default to the device numa domain
571 if (domain
->nid
== NUMA_NO_NODE
)
572 domain
->nid
= domain_update_device_node(domain
);
575 * First-level translation restricts the input-address to a
576 * canonical address (i.e., address bits 63:N have the same
577 * value as address bit [N-1], where N is 48-bits with 4-level
578 * paging and 57-bits with 5-level paging). Hence, skip bit
581 if (domain
->use_first_level
)
582 domain
->domain
.geometry
.aperture_end
= __DOMAIN_MAX_ADDR(domain
->gaw
- 1);
584 domain
->domain
.geometry
.aperture_end
= __DOMAIN_MAX_ADDR(domain
->gaw
);
586 domain
->domain
.pgsize_bitmap
|= domain_super_pgsize_bitmap(domain
);
587 domain_update_iotlb(domain
);
590 struct context_entry
*iommu_context_addr(struct intel_iommu
*iommu
, u8 bus
,
593 struct root_entry
*root
= &iommu
->root_entry
[bus
];
594 struct context_entry
*context
;
598 * Except that the caller requested to allocate a new entry,
599 * returning a copied context entry makes no sense.
601 if (!alloc
&& context_copied(iommu
, bus
, devfn
))
605 if (sm_supported(iommu
)) {
613 context
= phys_to_virt(*entry
& VTD_PAGE_MASK
);
615 unsigned long phy_addr
;
619 context
= alloc_pgtable_page(iommu
->node
, GFP_ATOMIC
);
623 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
624 phy_addr
= virt_to_phys((void *)context
);
625 *entry
= phy_addr
| 1;
626 __iommu_flush_cache(iommu
, entry
, sizeof(*entry
));
628 return &context
[devfn
];
632 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633 * sub-hierarchy of a candidate PCI-PCI bridge
634 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635 * @bridge: the candidate PCI-PCI bridge
637 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
640 is_downstream_to_pci_bridge(struct device
*dev
, struct device
*bridge
)
642 struct pci_dev
*pdev
, *pbridge
;
644 if (!dev_is_pci(dev
) || !dev_is_pci(bridge
))
647 pdev
= to_pci_dev(dev
);
648 pbridge
= to_pci_dev(bridge
);
650 if (pbridge
->subordinate
&&
651 pbridge
->subordinate
->number
<= pdev
->bus
->number
&&
652 pbridge
->subordinate
->busn_res
.end
>= pdev
->bus
->number
)
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev
*pdev
)
660 struct dmar_drhd_unit
*drhd
;
664 /* We know that this device on this chipset has its own IOMMU.
665 * If we find it under a different IOMMU, then the BIOS is lying
666 * to us. Hope that the IOMMU for this device is actually
667 * disabled, and it needs no translation...
669 rc
= pci_bus_read_config_dword(pdev
->bus
, PCI_DEVFN(0, 0), 0xb0, &vtbar
);
672 dev_info(&pdev
->dev
, "failed to run vt-d quirk\n");
677 /* we know that the this iommu should be at offset 0xa000 from vtbar */
678 drhd
= dmar_find_matched_drhd_unit(pdev
);
679 if (!drhd
|| drhd
->reg_base_addr
- vtbar
!= 0xa000) {
680 pr_warn_once(FW_BUG
"BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 add_taint(TAINT_FIRMWARE_WORKAROUND
, LOCKDEP_STILL_OK
);
688 static bool iommu_is_dummy(struct intel_iommu
*iommu
, struct device
*dev
)
690 if (!iommu
|| iommu
->drhd
->ignored
)
693 if (dev_is_pci(dev
)) {
694 struct pci_dev
*pdev
= to_pci_dev(dev
);
696 if (pdev
->vendor
== PCI_VENDOR_ID_INTEL
&&
697 pdev
->device
== PCI_DEVICE_ID_INTEL_IOAT_SNB
&&
698 quirk_ioat_snb_local_iommu(pdev
))
705 struct intel_iommu
*device_to_iommu(struct device
*dev
, u8
*bus
, u8
*devfn
)
707 struct dmar_drhd_unit
*drhd
= NULL
;
708 struct pci_dev
*pdev
= NULL
;
709 struct intel_iommu
*iommu
;
717 if (dev_is_pci(dev
)) {
718 struct pci_dev
*pf_pdev
;
720 pdev
= pci_real_dma_dev(to_pci_dev(dev
));
722 /* VFs aren't listed in scope tables; we need to look up
723 * the PF instead to find the IOMMU. */
724 pf_pdev
= pci_physfn(pdev
);
726 segment
= pci_domain_nr(pdev
->bus
);
727 } else if (has_acpi_companion(dev
))
728 dev
= &ACPI_COMPANION(dev
)->dev
;
731 for_each_iommu(iommu
, drhd
) {
732 if (pdev
&& segment
!= drhd
->segment
)
735 for_each_active_dev_scope(drhd
->devices
,
736 drhd
->devices_cnt
, i
, tmp
) {
738 /* For a VF use its original BDF# not that of the PF
739 * which we used for the IOMMU lookup. Strictly speaking
740 * we could do this for all PCI devices; we only need to
741 * get the BDF# from the scope table for ACPI matches. */
742 if (pdev
&& pdev
->is_virtfn
)
746 *bus
= drhd
->devices
[i
].bus
;
747 *devfn
= drhd
->devices
[i
].devfn
;
752 if (is_downstream_to_pci_bridge(dev
, tmp
))
756 if (pdev
&& drhd
->include_all
) {
759 *bus
= pdev
->bus
->number
;
760 *devfn
= pdev
->devfn
;
767 if (iommu_is_dummy(iommu
, dev
))
775 static void domain_flush_cache(struct dmar_domain
*domain
,
776 void *addr
, int size
)
778 if (!domain
->iommu_coherency
)
779 clflush_cache_range(addr
, size
);
782 static void free_context_table(struct intel_iommu
*iommu
)
784 struct context_entry
*context
;
787 if (!iommu
->root_entry
)
790 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
791 context
= iommu_context_addr(iommu
, i
, 0, 0);
793 free_pgtable_page(context
);
795 if (!sm_supported(iommu
))
798 context
= iommu_context_addr(iommu
, i
, 0x80, 0);
800 free_pgtable_page(context
);
803 free_pgtable_page(iommu
->root_entry
);
804 iommu
->root_entry
= NULL
;
807 #ifdef CONFIG_DMAR_DEBUG
808 static void pgtable_walk(struct intel_iommu
*iommu
, unsigned long pfn
,
809 u8 bus
, u8 devfn
, struct dma_pte
*parent
, int level
)
815 offset
= pfn_level_offset(pfn
, level
);
816 pte
= &parent
[offset
];
817 if (!pte
|| (dma_pte_superpage(pte
) || !dma_pte_present(pte
))) {
818 pr_info("PTE not present at level %d\n", level
);
822 pr_info("pte level: %d, pte value: 0x%016llx\n", level
, pte
->val
);
827 parent
= phys_to_virt(dma_pte_addr(pte
));
832 void dmar_fault_dump_ptes(struct intel_iommu
*iommu
, u16 source_id
,
833 unsigned long long addr
, u32 pasid
)
835 struct pasid_dir_entry
*dir
, *pde
;
836 struct pasid_entry
*entries
, *pte
;
837 struct context_entry
*ctx_entry
;
838 struct root_entry
*rt_entry
;
839 int i
, dir_index
, index
, level
;
840 u8 devfn
= source_id
& 0xff;
841 u8 bus
= source_id
>> 8;
842 struct dma_pte
*pgtable
;
844 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu
->name
, addr
);
846 /* root entry dump */
847 rt_entry
= &iommu
->root_entry
[bus
];
849 pr_info("root table entry is not present\n");
853 if (sm_supported(iommu
))
854 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
855 rt_entry
->hi
, rt_entry
->lo
);
857 pr_info("root entry: 0x%016llx", rt_entry
->lo
);
859 /* context entry dump */
860 ctx_entry
= iommu_context_addr(iommu
, bus
, devfn
, 0);
862 pr_info("context table entry is not present\n");
866 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
867 ctx_entry
->hi
, ctx_entry
->lo
);
869 /* legacy mode does not require PASID entries */
870 if (!sm_supported(iommu
)) {
871 level
= agaw_to_level(ctx_entry
->hi
& 7);
872 pgtable
= phys_to_virt(ctx_entry
->lo
& VTD_PAGE_MASK
);
876 /* get the pointer to pasid directory entry */
877 dir
= phys_to_virt(ctx_entry
->lo
& VTD_PAGE_MASK
);
879 pr_info("pasid directory entry is not present\n");
882 /* For request-without-pasid, get the pasid from context entry */
883 if (intel_iommu_sm
&& pasid
== IOMMU_PASID_INVALID
)
884 pasid
= IOMMU_NO_PASID
;
886 dir_index
= pasid
>> PASID_PDE_SHIFT
;
887 pde
= &dir
[dir_index
];
888 pr_info("pasid dir entry: 0x%016llx\n", pde
->val
);
890 /* get the pointer to the pasid table entry */
891 entries
= get_pasid_table_from_pde(pde
);
893 pr_info("pasid table entry is not present\n");
896 index
= pasid
& PASID_PTE_MASK
;
897 pte
= &entries
[index
];
898 for (i
= 0; i
< ARRAY_SIZE(pte
->val
); i
++)
899 pr_info("pasid table entry[%d]: 0x%016llx\n", i
, pte
->val
[i
]);
901 if (pasid_pte_get_pgtt(pte
) == PASID_ENTRY_PGTT_FL_ONLY
) {
902 level
= pte
->val
[2] & BIT_ULL(2) ? 5 : 4;
903 pgtable
= phys_to_virt(pte
->val
[2] & VTD_PAGE_MASK
);
905 level
= agaw_to_level((pte
->val
[0] >> 2) & 0x7);
906 pgtable
= phys_to_virt(pte
->val
[0] & VTD_PAGE_MASK
);
910 pgtable_walk(iommu
, addr
>> VTD_PAGE_SHIFT
, bus
, devfn
, pgtable
, level
);
914 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
915 unsigned long pfn
, int *target_level
,
918 struct dma_pte
*parent
, *pte
;
919 int level
= agaw_to_level(domain
->agaw
);
922 if (!domain_pfn_supported(domain
, pfn
))
923 /* Address beyond IOMMU's addressing capabilities. */
926 parent
= domain
->pgd
;
931 offset
= pfn_level_offset(pfn
, level
);
932 pte
= &parent
[offset
];
933 if (!*target_level
&& (dma_pte_superpage(pte
) || !dma_pte_present(pte
)))
935 if (level
== *target_level
)
938 if (!dma_pte_present(pte
)) {
941 tmp_page
= alloc_pgtable_page(domain
->nid
, gfp
);
946 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
947 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
948 if (domain
->use_first_level
)
949 pteval
|= DMA_FL_PTE_XD
| DMA_FL_PTE_US
| DMA_FL_PTE_ACCESS
;
951 if (cmpxchg64(&pte
->val
, 0ULL, pteval
))
952 /* Someone else set it while we were thinking; use theirs. */
953 free_pgtable_page(tmp_page
);
955 domain_flush_cache(domain
, pte
, sizeof(*pte
));
960 parent
= phys_to_virt(dma_pte_addr(pte
));
965 *target_level
= level
;
970 /* return address's pte at specific level */
971 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
973 int level
, int *large_page
)
975 struct dma_pte
*parent
, *pte
;
976 int total
= agaw_to_level(domain
->agaw
);
979 parent
= domain
->pgd
;
980 while (level
<= total
) {
981 offset
= pfn_level_offset(pfn
, total
);
982 pte
= &parent
[offset
];
986 if (!dma_pte_present(pte
)) {
991 if (dma_pte_superpage(pte
)) {
996 parent
= phys_to_virt(dma_pte_addr(pte
));
1002 /* clear last level pte, a tlb flush should be followed */
1003 static void dma_pte_clear_range(struct dmar_domain
*domain
,
1004 unsigned long start_pfn
,
1005 unsigned long last_pfn
)
1007 unsigned int large_page
;
1008 struct dma_pte
*first_pte
, *pte
;
1010 if (WARN_ON(!domain_pfn_supported(domain
, last_pfn
)) ||
1011 WARN_ON(start_pfn
> last_pfn
))
1014 /* we don't need lock here; nobody else touches the iova range */
1017 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1, &large_page
);
1019 start_pfn
= align_to_level(start_pfn
+ 1, large_page
+ 1);
1024 start_pfn
+= lvl_to_nr_pages(large_page
);
1026 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
1028 domain_flush_cache(domain
, first_pte
,
1029 (void *)pte
- (void *)first_pte
);
1031 } while (start_pfn
&& start_pfn
<= last_pfn
);
1034 static void dma_pte_free_level(struct dmar_domain
*domain
, int level
,
1035 int retain_level
, struct dma_pte
*pte
,
1036 unsigned long pfn
, unsigned long start_pfn
,
1037 unsigned long last_pfn
)
1039 pfn
= max(start_pfn
, pfn
);
1040 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1043 unsigned long level_pfn
;
1044 struct dma_pte
*level_pte
;
1046 if (!dma_pte_present(pte
) || dma_pte_superpage(pte
))
1049 level_pfn
= pfn
& level_mask(level
);
1050 level_pte
= phys_to_virt(dma_pte_addr(pte
));
1053 dma_pte_free_level(domain
, level
- 1, retain_level
,
1054 level_pte
, level_pfn
, start_pfn
,
1059 * Free the page table if we're below the level we want to
1060 * retain and the range covers the entire table.
1062 if (level
< retain_level
&& !(start_pfn
> level_pfn
||
1063 last_pfn
< level_pfn
+ level_size(level
) - 1)) {
1065 domain_flush_cache(domain
, pte
, sizeof(*pte
));
1066 free_pgtable_page(level_pte
);
1069 pfn
+= level_size(level
);
1070 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1074 * clear last level (leaf) ptes and free page table pages below the
1075 * level we wish to keep intact.
1077 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
1078 unsigned long start_pfn
,
1079 unsigned long last_pfn
,
1082 dma_pte_clear_range(domain
, start_pfn
, last_pfn
);
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain
, agaw_to_level(domain
->agaw
), retain_level
,
1086 domain
->pgd
, 0, start_pfn
, last_pfn
);
1089 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1090 free_pgtable_page(domain
->pgd
);
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101 static void dma_pte_list_pagetables(struct dmar_domain
*domain
,
1102 int level
, struct dma_pte
*pte
,
1103 struct list_head
*freelist
)
1107 pg
= pfn_to_page(dma_pte_addr(pte
) >> PAGE_SHIFT
);
1108 list_add_tail(&pg
->lru
, freelist
);
1113 pte
= page_address(pg
);
1115 if (dma_pte_present(pte
) && !dma_pte_superpage(pte
))
1116 dma_pte_list_pagetables(domain
, level
- 1, pte
, freelist
);
1118 } while (!first_pte_in_page(pte
));
1121 static void dma_pte_clear_level(struct dmar_domain
*domain
, int level
,
1122 struct dma_pte
*pte
, unsigned long pfn
,
1123 unsigned long start_pfn
, unsigned long last_pfn
,
1124 struct list_head
*freelist
)
1126 struct dma_pte
*first_pte
= NULL
, *last_pte
= NULL
;
1128 pfn
= max(start_pfn
, pfn
);
1129 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1132 unsigned long level_pfn
= pfn
& level_mask(level
);
1134 if (!dma_pte_present(pte
))
1137 /* If range covers entire pagetable, free it */
1138 if (start_pfn
<= level_pfn
&&
1139 last_pfn
>= level_pfn
+ level_size(level
) - 1) {
1140 /* These suborbinate page tables are going away entirely. Don't
1141 bother to clear them; we're just going to *free* them. */
1142 if (level
> 1 && !dma_pte_superpage(pte
))
1143 dma_pte_list_pagetables(domain
, level
- 1, pte
, freelist
);
1149 } else if (level
> 1) {
1150 /* Recurse down into a level that isn't *entirely* obsolete */
1151 dma_pte_clear_level(domain
, level
- 1,
1152 phys_to_virt(dma_pte_addr(pte
)),
1153 level_pfn
, start_pfn
, last_pfn
,
1157 pfn
= level_pfn
+ level_size(level
);
1158 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1161 domain_flush_cache(domain
, first_pte
,
1162 (void *)++last_pte
- (void *)first_pte
);
1165 /* We can't just free the pages because the IOMMU may still be walking
1166 the page tables, and may have cached the intermediate levels. The
1167 pages can only be freed after the IOTLB flush has been done. */
1168 static void domain_unmap(struct dmar_domain
*domain
, unsigned long start_pfn
,
1169 unsigned long last_pfn
, struct list_head
*freelist
)
1171 if (WARN_ON(!domain_pfn_supported(domain
, last_pfn
)) ||
1172 WARN_ON(start_pfn
> last_pfn
))
1175 /* we don't need lock here; nobody else touches the iova range */
1176 dma_pte_clear_level(domain
, agaw_to_level(domain
->agaw
),
1177 domain
->pgd
, 0, start_pfn
, last_pfn
, freelist
);
1180 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1181 struct page
*pgd_page
= virt_to_page(domain
->pgd
);
1182 list_add_tail(&pgd_page
->lru
, freelist
);
1187 /* iommu handling */
1188 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
1190 struct root_entry
*root
;
1192 root
= alloc_pgtable_page(iommu
->node
, GFP_ATOMIC
);
1194 pr_err("Allocating root entry for %s failed\n",
1199 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
1200 iommu
->root_entry
= root
;
1205 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
1211 addr
= virt_to_phys(iommu
->root_entry
);
1212 if (sm_supported(iommu
))
1213 addr
|= DMA_RTADDR_SMT
;
1215 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1216 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, addr
);
1218 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
1220 /* Make sure hardware complete it */
1221 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1222 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
1224 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1227 * Hardware invalidates all DMA remapping hardware translation
1228 * caches as part of SRTP flow.
1230 if (cap_esrtps(iommu
->cap
))
1233 iommu
->flush
.flush_context(iommu
, 0, 0, 0, DMA_CCMD_GLOBAL_INVL
);
1234 if (sm_supported(iommu
))
1235 qi_flush_pasid_cache(iommu
, 0, QI_PC_GLOBAL
, 0);
1236 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
1239 void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
1244 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
1247 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1248 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
1250 /* Make sure hardware complete it */
1251 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1252 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
1254 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1257 /* return value determine if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu
*iommu
,
1259 u16 did
, u16 source_id
, u8 function_mask
,
1266 case DMA_CCMD_GLOBAL_INVL
:
1267 val
= DMA_CCMD_GLOBAL_INVL
;
1269 case DMA_CCMD_DOMAIN_INVL
:
1270 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
1272 case DMA_CCMD_DEVICE_INVL
:
1273 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
1274 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
1277 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1281 val
|= DMA_CCMD_ICC
;
1283 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1284 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
1286 /* Make sure hardware complete it */
1287 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
1288 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
1290 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1293 /* return value determine if we need a write buffer flush */
1294 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
1295 u64 addr
, unsigned int size_order
, u64 type
)
1297 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
1298 u64 val
= 0, val_iva
= 0;
1302 case DMA_TLB_GLOBAL_FLUSH
:
1303 /* global flush doesn't need set IVA_REG */
1304 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
1306 case DMA_TLB_DSI_FLUSH
:
1307 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1309 case DMA_TLB_PSI_FLUSH
:
1310 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1311 /* IH bit is passed in as part of address */
1312 val_iva
= size_order
| addr
;
1315 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1320 if (cap_write_drain(iommu
->cap
))
1321 val
|= DMA_TLB_WRITE_DRAIN
;
1323 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1324 /* Note: Only uses first TLB reg currently */
1326 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
1327 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
1329 /* Make sure hardware complete it */
1330 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
1331 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
1333 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1335 /* check IOTLB invalidation granularity */
1336 if (DMA_TLB_IAIG(val
) == 0)
1337 pr_err("Flush IOTLB failed\n");
1338 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
1339 pr_debug("TLB flush request %Lx, actual %Lx\n",
1340 (unsigned long long)DMA_TLB_IIRG(type
),
1341 (unsigned long long)DMA_TLB_IAIG(val
));
1344 static struct device_domain_info
*
1345 domain_lookup_dev_info(struct dmar_domain
*domain
,
1346 struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
1348 struct device_domain_info
*info
;
1349 unsigned long flags
;
1351 spin_lock_irqsave(&domain
->lock
, flags
);
1352 list_for_each_entry(info
, &domain
->devices
, link
) {
1353 if (info
->iommu
== iommu
&& info
->bus
== bus
&&
1354 info
->devfn
== devfn
) {
1355 spin_unlock_irqrestore(&domain
->lock
, flags
);
1359 spin_unlock_irqrestore(&domain
->lock
, flags
);
1364 static void domain_update_iotlb(struct dmar_domain
*domain
)
1366 struct dev_pasid_info
*dev_pasid
;
1367 struct device_domain_info
*info
;
1368 bool has_iotlb_device
= false;
1369 unsigned long flags
;
1371 spin_lock_irqsave(&domain
->lock
, flags
);
1372 list_for_each_entry(info
, &domain
->devices
, link
) {
1373 if (info
->ats_enabled
) {
1374 has_iotlb_device
= true;
1379 list_for_each_entry(dev_pasid
, &domain
->dev_pasids
, link_domain
) {
1380 info
= dev_iommu_priv_get(dev_pasid
->dev
);
1381 if (info
->ats_enabled
) {
1382 has_iotlb_device
= true;
1386 domain
->has_iotlb_device
= has_iotlb_device
;
1387 spin_unlock_irqrestore(&domain
->lock
, flags
);
1391 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1392 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1393 * check because it applies only to the built-in QAT devices and it doesn't
1394 * grant additional privileges.
1396 #define BUGGY_QAT_DEVID_MASK 0x4940
1397 static bool dev_needs_extra_dtlb_flush(struct pci_dev
*pdev
)
1399 if (pdev
->vendor
!= PCI_VENDOR_ID_INTEL
)
1402 if ((pdev
->device
& 0xfffc) != BUGGY_QAT_DEVID_MASK
)
1408 static void iommu_enable_pci_caps(struct device_domain_info
*info
)
1410 struct pci_dev
*pdev
;
1412 if (!dev_is_pci(info
->dev
))
1415 pdev
= to_pci_dev(info
->dev
);
1417 /* The PCIe spec, in its wisdom, declares that the behaviour of
1418 the device if you enable PASID support after ATS support is
1419 undefined. So always enable PASID support on devices which
1420 have it, even if we can't yet know if we're ever going to
1422 if (info
->pasid_supported
&& !pci_enable_pasid(pdev
, info
->pasid_supported
& ~1))
1423 info
->pasid_enabled
= 1;
1425 if (info
->ats_supported
&& pci_ats_page_aligned(pdev
) &&
1426 !pci_enable_ats(pdev
, VTD_PAGE_SHIFT
)) {
1427 info
->ats_enabled
= 1;
1428 domain_update_iotlb(info
->domain
);
1432 static void iommu_disable_pci_caps(struct device_domain_info
*info
)
1434 struct pci_dev
*pdev
;
1436 if (!dev_is_pci(info
->dev
))
1439 pdev
= to_pci_dev(info
->dev
);
1441 if (info
->ats_enabled
) {
1442 pci_disable_ats(pdev
);
1443 info
->ats_enabled
= 0;
1444 domain_update_iotlb(info
->domain
);
1447 if (info
->pasid_enabled
) {
1448 pci_disable_pasid(pdev
);
1449 info
->pasid_enabled
= 0;
1453 static void __iommu_flush_dev_iotlb(struct device_domain_info
*info
,
1454 u64 addr
, unsigned int mask
)
1458 if (!info
|| !info
->ats_enabled
)
1461 sid
= info
->bus
<< 8 | info
->devfn
;
1462 qdep
= info
->ats_qdep
;
1463 qi_flush_dev_iotlb(info
->iommu
, sid
, info
->pfsid
,
1465 quirk_extra_dev_tlb_flush(info
, addr
, mask
, IOMMU_NO_PASID
, qdep
);
1468 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1469 u64 addr
, unsigned mask
)
1471 struct dev_pasid_info
*dev_pasid
;
1472 struct device_domain_info
*info
;
1473 unsigned long flags
;
1475 if (!domain
->has_iotlb_device
)
1478 spin_lock_irqsave(&domain
->lock
, flags
);
1479 list_for_each_entry(info
, &domain
->devices
, link
)
1480 __iommu_flush_dev_iotlb(info
, addr
, mask
);
1482 list_for_each_entry(dev_pasid
, &domain
->dev_pasids
, link_domain
) {
1483 info
= dev_iommu_priv_get(dev_pasid
->dev
);
1485 if (!info
->ats_enabled
)
1488 qi_flush_dev_iotlb_pasid(info
->iommu
,
1489 PCI_DEVID(info
->bus
, info
->devfn
),
1490 info
->pfsid
, dev_pasid
->pasid
,
1491 info
->ats_qdep
, addr
,
1494 spin_unlock_irqrestore(&domain
->lock
, flags
);
1497 static void domain_flush_pasid_iotlb(struct intel_iommu
*iommu
,
1498 struct dmar_domain
*domain
, u64 addr
,
1499 unsigned long npages
, bool ih
)
1501 u16 did
= domain_id_iommu(domain
, iommu
);
1502 struct dev_pasid_info
*dev_pasid
;
1503 unsigned long flags
;
1505 spin_lock_irqsave(&domain
->lock
, flags
);
1506 list_for_each_entry(dev_pasid
, &domain
->dev_pasids
, link_domain
)
1507 qi_flush_piotlb(iommu
, did
, dev_pasid
->pasid
, addr
, npages
, ih
);
1509 if (!list_empty(&domain
->devices
))
1510 qi_flush_piotlb(iommu
, did
, IOMMU_NO_PASID
, addr
, npages
, ih
);
1511 spin_unlock_irqrestore(&domain
->lock
, flags
);
1514 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
,
1515 struct dmar_domain
*domain
,
1516 unsigned long pfn
, unsigned int pages
,
1519 unsigned int aligned_pages
= __roundup_pow_of_two(pages
);
1520 unsigned int mask
= ilog2(aligned_pages
);
1521 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
1522 u16 did
= domain_id_iommu(domain
, iommu
);
1524 if (WARN_ON(!pages
))
1530 if (domain
->use_first_level
) {
1531 domain_flush_pasid_iotlb(iommu
, domain
, addr
, pages
, ih
);
1533 unsigned long bitmask
= aligned_pages
- 1;
1536 * PSI masks the low order bits of the base address. If the
1537 * address isn't aligned to the mask, then compute a mask value
1538 * needed to ensure the target range is flushed.
1540 if (unlikely(bitmask
& pfn
)) {
1541 unsigned long end_pfn
= pfn
+ pages
- 1, shared_bits
;
1544 * Since end_pfn <= pfn + bitmask, the only way bits
1545 * higher than bitmask can differ in pfn and end_pfn is
1546 * by carrying. This means after masking out bitmask,
1547 * high bits starting with the first set bit in
1548 * shared_bits are all equal in both pfn and end_pfn.
1550 shared_bits
= ~(pfn
^ end_pfn
) & ~bitmask
;
1551 mask
= shared_bits
? __ffs(shared_bits
) : BITS_PER_LONG
;
1555 * Fallback to domain selective flush if no PSI support or
1556 * the size is too big.
1558 if (!cap_pgsel_inv(iommu
->cap
) ||
1559 mask
> cap_max_amask_val(iommu
->cap
))
1560 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1563 iommu
->flush
.flush_iotlb(iommu
, did
, addr
| ih
, mask
,
1568 * In caching mode, changes of pages from non-present to present require
1569 * flush. However, device IOTLB doesn't need to be flushed in this case.
1571 if (!cap_caching_mode(iommu
->cap
) || !map
)
1572 iommu_flush_dev_iotlb(domain
, addr
, mask
);
1575 /* Notification for newly created mappings */
1576 static inline void __mapping_notify_one(struct intel_iommu
*iommu
,
1577 struct dmar_domain
*domain
,
1578 unsigned long pfn
, unsigned int pages
)
1581 * It's a non-present to present mapping. Only flush if caching mode
1584 if (cap_caching_mode(iommu
->cap
) && !domain
->use_first_level
)
1585 iommu_flush_iotlb_psi(iommu
, domain
, pfn
, pages
, 0, 1);
1587 iommu_flush_write_buffer(iommu
);
1590 static void intel_flush_iotlb_all(struct iommu_domain
*domain
)
1592 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
1593 struct iommu_domain_info
*info
;
1596 xa_for_each(&dmar_domain
->iommu_array
, idx
, info
) {
1597 struct intel_iommu
*iommu
= info
->iommu
;
1598 u16 did
= domain_id_iommu(dmar_domain
, iommu
);
1600 if (dmar_domain
->use_first_level
)
1601 domain_flush_pasid_iotlb(iommu
, dmar_domain
, 0, -1, 0);
1603 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1606 if (!cap_caching_mode(iommu
->cap
))
1607 iommu_flush_dev_iotlb(dmar_domain
, 0, MAX_AGAW_PFN_WIDTH
);
1611 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1614 unsigned long flags
;
1616 if (!cap_plmr(iommu
->cap
) && !cap_phmr(iommu
->cap
))
1619 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1620 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1621 pmen
&= ~DMA_PMEN_EPM
;
1622 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1624 /* wait for the protected region status bit to clear */
1625 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1626 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1628 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1631 static void iommu_enable_translation(struct intel_iommu
*iommu
)
1634 unsigned long flags
;
1636 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1637 iommu
->gcmd
|= DMA_GCMD_TE
;
1638 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1640 /* Make sure hardware complete it */
1641 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1642 readl
, (sts
& DMA_GSTS_TES
), sts
);
1644 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1647 static void iommu_disable_translation(struct intel_iommu
*iommu
)
1652 if (iommu_skip_te_disable
&& iommu
->drhd
->gfx_dedicated
&&
1653 (cap_read_drain(iommu
->cap
) || cap_write_drain(iommu
->cap
)))
1656 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1657 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1658 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1660 /* Make sure hardware complete it */
1661 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1662 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1664 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1667 static int iommu_init_domains(struct intel_iommu
*iommu
)
1671 ndomains
= cap_ndoms(iommu
->cap
);
1672 pr_debug("%s: Number of Domains supported <%d>\n",
1673 iommu
->name
, ndomains
);
1675 spin_lock_init(&iommu
->lock
);
1677 iommu
->domain_ids
= bitmap_zalloc(ndomains
, GFP_KERNEL
);
1678 if (!iommu
->domain_ids
)
1682 * If Caching mode is set, then invalid translations are tagged
1683 * with domain-id 0, hence we need to pre-allocate it. We also
1684 * use domain-id 0 as a marker for non-allocated domain-id, so
1685 * make sure it is not used for a real domain.
1687 set_bit(0, iommu
->domain_ids
);
1690 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1691 * entry for first-level or pass-through translation modes should
1692 * be programmed with a domain id different from those used for
1693 * second-level or nested translation. We reserve a domain id for
1696 if (sm_supported(iommu
))
1697 set_bit(FLPT_DEFAULT_DID
, iommu
->domain_ids
);
1702 static void disable_dmar_iommu(struct intel_iommu
*iommu
)
1704 if (!iommu
->domain_ids
)
1708 * All iommu domains must have been detached from the devices,
1709 * hence there should be no domain IDs in use.
1711 if (WARN_ON(bitmap_weight(iommu
->domain_ids
, cap_ndoms(iommu
->cap
))
1712 > NUM_RESERVED_DID
))
1715 if (iommu
->gcmd
& DMA_GCMD_TE
)
1716 iommu_disable_translation(iommu
);
1719 static void free_dmar_iommu(struct intel_iommu
*iommu
)
1721 if (iommu
->domain_ids
) {
1722 bitmap_free(iommu
->domain_ids
);
1723 iommu
->domain_ids
= NULL
;
1726 if (iommu
->copied_tables
) {
1727 bitmap_free(iommu
->copied_tables
);
1728 iommu
->copied_tables
= NULL
;
1731 /* free context mapping */
1732 free_context_table(iommu
);
1734 #ifdef CONFIG_INTEL_IOMMU_SVM
1735 if (pasid_supported(iommu
)) {
1736 if (ecap_prs(iommu
->ecap
))
1737 intel_svm_finish_prq(iommu
);
1743 * Check and return whether first level is used by default for
1746 static bool first_level_by_default(unsigned int type
)
1748 /* Only SL is available in legacy mode */
1749 if (!scalable_mode_support())
1752 /* Only level (either FL or SL) is available, just use it */
1753 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1754 return intel_cap_flts_sanity();
1756 /* Both levels are available, decide it based on domain type */
1757 return type
!= IOMMU_DOMAIN_UNMANAGED
;
1760 static struct dmar_domain
*alloc_domain(unsigned int type
)
1762 struct dmar_domain
*domain
;
1764 domain
= kzalloc(sizeof(*domain
), GFP_KERNEL
);
1768 domain
->nid
= NUMA_NO_NODE
;
1769 if (first_level_by_default(type
))
1770 domain
->use_first_level
= true;
1771 domain
->has_iotlb_device
= false;
1772 INIT_LIST_HEAD(&domain
->devices
);
1773 INIT_LIST_HEAD(&domain
->dev_pasids
);
1774 spin_lock_init(&domain
->lock
);
1775 xa_init(&domain
->iommu_array
);
1780 static int domain_attach_iommu(struct dmar_domain
*domain
,
1781 struct intel_iommu
*iommu
)
1783 struct iommu_domain_info
*info
, *curr
;
1784 unsigned long ndomains
;
1785 int num
, ret
= -ENOSPC
;
1787 info
= kzalloc(sizeof(*info
), GFP_KERNEL
);
1791 spin_lock(&iommu
->lock
);
1792 curr
= xa_load(&domain
->iommu_array
, iommu
->seq_id
);
1795 spin_unlock(&iommu
->lock
);
1800 ndomains
= cap_ndoms(iommu
->cap
);
1801 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1802 if (num
>= ndomains
) {
1803 pr_err("%s: No free domain ids\n", iommu
->name
);
1807 set_bit(num
, iommu
->domain_ids
);
1810 info
->iommu
= iommu
;
1811 curr
= xa_cmpxchg(&domain
->iommu_array
, iommu
->seq_id
,
1812 NULL
, info
, GFP_ATOMIC
);
1814 ret
= xa_err(curr
) ? : -EBUSY
;
1817 domain_update_iommu_cap(domain
);
1819 spin_unlock(&iommu
->lock
);
1823 clear_bit(info
->did
, iommu
->domain_ids
);
1825 spin_unlock(&iommu
->lock
);
1830 static void domain_detach_iommu(struct dmar_domain
*domain
,
1831 struct intel_iommu
*iommu
)
1833 struct iommu_domain_info
*info
;
1835 spin_lock(&iommu
->lock
);
1836 info
= xa_load(&domain
->iommu_array
, iommu
->seq_id
);
1837 if (--info
->refcnt
== 0) {
1838 clear_bit(info
->did
, iommu
->domain_ids
);
1839 xa_erase(&domain
->iommu_array
, iommu
->seq_id
);
1840 domain
->nid
= NUMA_NO_NODE
;
1841 domain_update_iommu_cap(domain
);
1844 spin_unlock(&iommu
->lock
);
/*
 * Round a guest address width up to the nearest valid adjusted guest
 * address width (12 + multiple of 9, i.e. a whole number of page-table
 * levels), capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
1861 static void domain_exit(struct dmar_domain
*domain
)
1864 LIST_HEAD(freelist
);
1866 domain_unmap(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
), &freelist
);
1867 put_pages_list(&freelist
);
1870 if (WARN_ON(!list_empty(&domain
->devices
)))
1877 * Get the PASID directory size for scalable mode context entry.
1878 * Value of X in the PDTS field of a scalable mode context entry
1879 * indicates PASID directory with 2^(X + 7) entries.
1881 static inline unsigned long context_get_sm_pds(struct pasid_table
*table
)
1883 unsigned long pds
, max_pde
;
1885 max_pde
= table
->max_pasid
>> PASID_PDE_SHIFT
;
1886 pds
= find_first_bit(&max_pde
, MAX_NR_PASID_BITS
);
1894 * Set the RID_PASID field of a scalable mode context entry. The
1895 * IOMMU hardware will use the PASID value set in this field for
1896 * DMA translations of DMA requests without PASID.
1899 context_set_sm_rid2pasid(struct context_entry
*context
, unsigned long pasid
)
1901 context
->hi
|= pasid
& ((1 << 20) - 1);
1905 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1908 static inline void context_set_sm_dte(struct context_entry
*context
)
1910 context
->lo
|= BIT_ULL(2);
1914 * Set the PRE(Page Request Enable) field of a scalable mode context
1917 static inline void context_set_sm_pre(struct context_entry
*context
)
1919 context
->lo
|= BIT_ULL(4);
1922 /* Convert value to context PASID directory size field coding. */
1923 #define context_pdts(pds) (((pds) & 0x7) << 9)
1925 static int domain_context_mapping_one(struct dmar_domain
*domain
,
1926 struct intel_iommu
*iommu
,
1927 struct pasid_table
*table
,
1930 struct device_domain_info
*info
=
1931 domain_lookup_dev_info(domain
, iommu
, bus
, devfn
);
1932 u16 did
= domain_id_iommu(domain
, iommu
);
1933 int translation
= CONTEXT_TT_MULTI_LEVEL
;
1934 struct context_entry
*context
;
1937 if (hw_pass_through
&& domain_type_is_si(domain
))
1938 translation
= CONTEXT_TT_PASS_THROUGH
;
1940 pr_debug("Set context mapping for %02x:%02x.%d\n",
1941 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1943 spin_lock(&iommu
->lock
);
1945 context
= iommu_context_addr(iommu
, bus
, devfn
, 1);
1950 if (context_present(context
) && !context_copied(iommu
, bus
, devfn
))
1954 * For kdump cases, old valid entries may be cached due to the
1955 * in-flight DMA and copied pgtable, but there is no unmapping
1956 * behaviour for them, thus we need an explicit cache flush for
1957 * the newly-mapped device. For kdump, at this point, the device
1958 * is supposed to finish reset at its driver probe stage, so no
1959 * in-flight DMA will exist, and we don't need to worry anymore
1962 if (context_copied(iommu
, bus
, devfn
)) {
1963 u16 did_old
= context_domain_id(context
);
1965 if (did_old
< cap_ndoms(iommu
->cap
)) {
1966 iommu
->flush
.flush_context(iommu
, did_old
,
1967 (((u16
)bus
) << 8) | devfn
,
1968 DMA_CCMD_MASK_NOBIT
,
1969 DMA_CCMD_DEVICE_INVL
);
1970 iommu
->flush
.flush_iotlb(iommu
, did_old
, 0, 0,
1974 clear_context_copied(iommu
, bus
, devfn
);
1977 context_clear_entry(context
);
1979 if (sm_supported(iommu
)) {
1982 /* Setup the PASID DIR pointer: */
1983 pds
= context_get_sm_pds(table
);
1984 context
->lo
= (u64
)virt_to_phys(table
->table
) |
1987 /* Setup the RID_PASID field: */
1988 context_set_sm_rid2pasid(context
, IOMMU_NO_PASID
);
1991 * Setup the Device-TLB enable bit and Page request
1994 if (info
&& info
->ats_supported
)
1995 context_set_sm_dte(context
);
1996 if (info
&& info
->pri_supported
)
1997 context_set_sm_pre(context
);
1998 if (info
&& info
->pasid_supported
)
1999 context_set_pasid(context
);
2001 struct dma_pte
*pgd
= domain
->pgd
;
2004 context_set_domain_id(context
, did
);
2006 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
2008 * Skip top levels of page tables for iommu which has
2009 * less agaw than default. Unnecessary for PT mode.
2011 for (agaw
= domain
->agaw
; agaw
> iommu
->agaw
; agaw
--) {
2013 pgd
= phys_to_virt(dma_pte_addr(pgd
));
2014 if (!dma_pte_present(pgd
))
2018 if (info
&& info
->ats_supported
)
2019 translation
= CONTEXT_TT_DEV_IOTLB
;
2021 translation
= CONTEXT_TT_MULTI_LEVEL
;
2023 context_set_address_root(context
, virt_to_phys(pgd
));
2024 context_set_address_width(context
, agaw
);
2027 * In pass through mode, AW must be programmed to
2028 * indicate the largest AGAW value supported by
2029 * hardware. And ASR is ignored by hardware.
2031 context_set_address_width(context
, iommu
->msagaw
);
2034 context_set_translation_type(context
, translation
);
2037 context_set_fault_enable(context
);
2038 context_set_present(context
);
2039 if (!ecap_coherent(iommu
->ecap
))
2040 clflush_cache_range(context
, sizeof(*context
));
2043 * It's a non-present to present mapping. If hardware doesn't cache
2044 * non-present entry we only need to flush the write-buffer. If the
2045 * _does_ cache non-present entries, then it does so in the special
2046 * domain #0, which we have to flush:
2048 if (cap_caching_mode(iommu
->cap
)) {
2049 iommu
->flush
.flush_context(iommu
, 0,
2050 (((u16
)bus
) << 8) | devfn
,
2051 DMA_CCMD_MASK_NOBIT
,
2052 DMA_CCMD_DEVICE_INVL
);
2053 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0, DMA_TLB_DSI_FLUSH
);
2055 iommu_flush_write_buffer(iommu
);
2061 spin_unlock(&iommu
->lock
);
/* Bundle of arguments passed through pci_for_each_dma_alias(). */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};
2072 static int domain_context_mapping_cb(struct pci_dev
*pdev
,
2073 u16 alias
, void *opaque
)
2075 struct domain_context_mapping_data
*data
= opaque
;
2077 return domain_context_mapping_one(data
->domain
, data
->iommu
,
2078 data
->table
, PCI_BUS_NUM(alias
),
2083 domain_context_mapping(struct dmar_domain
*domain
, struct device
*dev
)
2085 struct domain_context_mapping_data data
;
2086 struct pasid_table
*table
;
2087 struct intel_iommu
*iommu
;
2090 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2094 table
= intel_pasid_get_table(dev
);
2096 if (!dev_is_pci(dev
))
2097 return domain_context_mapping_one(domain
, iommu
, table
,
2100 data
.domain
= domain
;
2104 return pci_for_each_dma_alias(to_pci_dev(dev
),
2105 &domain_context_mapping_cb
, &data
);
2108 /* Returns a number of VTD pages, but aligned to MM page size */
2109 static inline unsigned long aligned_nrpages(unsigned long host_addr
,
2112 host_addr
&= ~PAGE_MASK
;
2113 return PAGE_ALIGN(host_addr
+ size
) >> VTD_PAGE_SHIFT
;
2116 /* Return largest possible superpage level for a given mapping */
2117 static inline int hardware_largepage_caps(struct dmar_domain
*domain
,
2118 unsigned long iov_pfn
,
2119 unsigned long phy_pfn
,
2120 unsigned long pages
)
2122 int support
, level
= 1;
2123 unsigned long pfnmerge
;
2125 support
= domain
->iommu_superpage
;
2127 /* To use a large page, the virtual *and* physical addresses
2128 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2129 of them will mean we have to use smaller pages. So just
2130 merge them and check both at once. */
2131 pfnmerge
= iov_pfn
| phy_pfn
;
2133 while (support
&& !(pfnmerge
& ~VTD_STRIDE_MASK
)) {
2134 pages
>>= VTD_STRIDE_SHIFT
;
2137 pfnmerge
>>= VTD_STRIDE_SHIFT
;
2145 * Ensure that old small page tables are removed to make room for superpage(s).
2146 * We're going to add new large pages, so make sure we don't remove their parent
2147 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149 static void switch_to_super_page(struct dmar_domain
*domain
,
2150 unsigned long start_pfn
,
2151 unsigned long end_pfn
, int level
)
2153 unsigned long lvl_pages
= lvl_to_nr_pages(level
);
2154 struct iommu_domain_info
*info
;
2155 struct dma_pte
*pte
= NULL
;
2158 while (start_pfn
<= end_pfn
) {
2160 pte
= pfn_to_dma_pte(domain
, start_pfn
, &level
,
2163 if (dma_pte_present(pte
)) {
2164 dma_pte_free_pagetable(domain
, start_pfn
,
2165 start_pfn
+ lvl_pages
- 1,
2168 xa_for_each(&domain
->iommu_array
, i
, info
)
2169 iommu_flush_iotlb_psi(info
->iommu
, domain
,
2170 start_pfn
, lvl_pages
,
2175 start_pfn
+= lvl_pages
;
2176 if (first_pte_in_page(pte
))
2182 __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2183 unsigned long phys_pfn
, unsigned long nr_pages
, int prot
,
2186 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
2187 unsigned int largepage_lvl
= 0;
2188 unsigned long lvl_pages
= 0;
2192 if (unlikely(!domain_pfn_supported(domain
, iov_pfn
+ nr_pages
- 1)))
2195 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
2198 attr
= prot
& (DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
);
2199 attr
|= DMA_FL_PTE_PRESENT
;
2200 if (domain
->use_first_level
) {
2201 attr
|= DMA_FL_PTE_XD
| DMA_FL_PTE_US
| DMA_FL_PTE_ACCESS
;
2202 if (prot
& DMA_PTE_WRITE
)
2203 attr
|= DMA_FL_PTE_DIRTY
;
2206 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | attr
;
2208 while (nr_pages
> 0) {
2212 largepage_lvl
= hardware_largepage_caps(domain
, iov_pfn
,
2213 phys_pfn
, nr_pages
);
2215 pte
= pfn_to_dma_pte(domain
, iov_pfn
, &largepage_lvl
,
2221 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
2223 /* It is large page*/
2224 if (largepage_lvl
> 1) {
2225 unsigned long end_pfn
;
2226 unsigned long pages_to_remove
;
2228 pteval
|= DMA_PTE_LARGE_PAGE
;
2229 pages_to_remove
= min_t(unsigned long, nr_pages
,
2230 nr_pte_to_next_page(pte
) * lvl_pages
);
2231 end_pfn
= iov_pfn
+ pages_to_remove
- 1;
2232 switch_to_super_page(domain
, iov_pfn
, end_pfn
, largepage_lvl
);
2234 pteval
&= ~(uint64_t)DMA_PTE_LARGE_PAGE
;
2238 /* We don't need lock here, nobody else
2239 * touches the iova range
2241 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
2243 static int dumps
= 5;
2244 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2245 iov_pfn
, tmp
, (unsigned long long)pteval
);
2248 debug_dma_dump_mappings(NULL
);
2253 nr_pages
-= lvl_pages
;
2254 iov_pfn
+= lvl_pages
;
2255 phys_pfn
+= lvl_pages
;
2256 pteval
+= lvl_pages
* VTD_PAGE_SIZE
;
2258 /* If the next PTE would be the first in a new page, then we
2259 * need to flush the cache on the entries we've just written.
2260 * And then we'll need to recalculate 'pte', so clear it and
2261 * let it get set again in the if (!pte) block above.
2263 * If we're done (!nr_pages) we need to flush the cache too.
2265 * Also if we've been setting superpages, we may need to
2266 * recalculate 'pte' and switch back to smaller pages for the
2267 * end of the mapping, if the trailing size is not enough to
2268 * use another superpage (i.e. nr_pages < lvl_pages).
2271 if (!nr_pages
|| first_pte_in_page(pte
) ||
2272 (largepage_lvl
> 1 && nr_pages
< lvl_pages
)) {
2273 domain_flush_cache(domain
, first_pte
,
2274 (void *)pte
- (void *)first_pte
);
2282 static void domain_context_clear_one(struct device_domain_info
*info
, u8 bus
, u8 devfn
)
2284 struct intel_iommu
*iommu
= info
->iommu
;
2285 struct context_entry
*context
;
2291 spin_lock(&iommu
->lock
);
2292 context
= iommu_context_addr(iommu
, bus
, devfn
, 0);
2294 spin_unlock(&iommu
->lock
);
2298 if (sm_supported(iommu
)) {
2299 if (hw_pass_through
&& domain_type_is_si(info
->domain
))
2300 did_old
= FLPT_DEFAULT_DID
;
2302 did_old
= domain_id_iommu(info
->domain
, iommu
);
2304 did_old
= context_domain_id(context
);
2307 context_clear_entry(context
);
2308 __iommu_flush_cache(iommu
, context
, sizeof(*context
));
2309 spin_unlock(&iommu
->lock
);
2310 iommu
->flush
.flush_context(iommu
,
2312 (((u16
)bus
) << 8) | devfn
,
2313 DMA_CCMD_MASK_NOBIT
,
2314 DMA_CCMD_DEVICE_INVL
);
2316 if (sm_supported(iommu
))
2317 qi_flush_pasid_cache(iommu
, did_old
, QI_PC_ALL_PASIDS
, 0);
2319 iommu
->flush
.flush_iotlb(iommu
,
2325 __iommu_flush_dev_iotlb(info
, 0, MAX_AGAW_PFN_WIDTH
);
2328 static int domain_setup_first_level(struct intel_iommu
*iommu
,
2329 struct dmar_domain
*domain
,
2333 struct dma_pte
*pgd
= domain
->pgd
;
2338 * Skip top levels of page tables for iommu which has
2339 * less agaw than default. Unnecessary for PT mode.
2341 for (agaw
= domain
->agaw
; agaw
> iommu
->agaw
; agaw
--) {
2342 pgd
= phys_to_virt(dma_pte_addr(pgd
));
2343 if (!dma_pte_present(pgd
))
2347 level
= agaw_to_level(agaw
);
2348 if (level
!= 4 && level
!= 5)
2352 flags
|= PASID_FLAG_FL5LP
;
2354 if (domain
->force_snooping
)
2355 flags
|= PASID_FLAG_PAGE_SNOOP
;
2357 return intel_pasid_setup_first_level(iommu
, dev
, (pgd_t
*)pgd
, pasid
,
2358 domain_id_iommu(domain
, iommu
),
2362 static bool dev_is_real_dma_subdevice(struct device
*dev
)
2364 return dev
&& dev_is_pci(dev
) &&
2365 pci_real_dma_dev(to_pci_dev(dev
)) != to_pci_dev(dev
);
2368 static int iommu_domain_identity_map(struct dmar_domain
*domain
,
2369 unsigned long first_vpfn
,
2370 unsigned long last_vpfn
)
2373 * RMRR range might have overlap with physical memory range,
2376 dma_pte_clear_range(domain
, first_vpfn
, last_vpfn
);
2378 return __domain_mapping(domain
, first_vpfn
,
2379 first_vpfn
, last_vpfn
- first_vpfn
+ 1,
2380 DMA_PTE_READ
|DMA_PTE_WRITE
, GFP_KERNEL
);
2383 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
);
2385 static int __init
si_domain_init(int hw
)
2387 struct dmar_rmrr_unit
*rmrr
;
2391 si_domain
= alloc_domain(IOMMU_DOMAIN_IDENTITY
);
2395 if (md_domain_init(si_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
2396 domain_exit(si_domain
);
2404 for_each_online_node(nid
) {
2405 unsigned long start_pfn
, end_pfn
;
2408 for_each_mem_pfn_range(i
, nid
, &start_pfn
, &end_pfn
, NULL
) {
2409 ret
= iommu_domain_identity_map(si_domain
,
2410 mm_to_dma_pfn_start(start_pfn
),
2411 mm_to_dma_pfn_end(end_pfn
));
2418 * Identity map the RMRRs so that devices with RMRRs could also use
2421 for_each_rmrr_units(rmrr
) {
2422 for_each_active_dev_scope(rmrr
->devices
, rmrr
->devices_cnt
,
2424 unsigned long long start
= rmrr
->base_address
;
2425 unsigned long long end
= rmrr
->end_address
;
2427 if (WARN_ON(end
< start
||
2428 end
>> agaw_to_width(si_domain
->agaw
)))
2431 ret
= iommu_domain_identity_map(si_domain
,
2432 mm_to_dma_pfn_start(start
>> PAGE_SHIFT
),
2433 mm_to_dma_pfn_end(end
>> PAGE_SHIFT
));
2442 static int dmar_domain_attach_device(struct dmar_domain
*domain
,
2445 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
2446 struct intel_iommu
*iommu
;
2447 unsigned long flags
;
2451 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2455 ret
= domain_attach_iommu(domain
, iommu
);
2458 info
->domain
= domain
;
2459 spin_lock_irqsave(&domain
->lock
, flags
);
2460 list_add(&info
->link
, &domain
->devices
);
2461 spin_unlock_irqrestore(&domain
->lock
, flags
);
2463 /* PASID table is mandatory for a PCI device in scalable mode. */
2464 if (sm_supported(iommu
) && !dev_is_real_dma_subdevice(dev
)) {
2465 /* Setup the PASID entry for requests without PASID: */
2466 if (hw_pass_through
&& domain_type_is_si(domain
))
2467 ret
= intel_pasid_setup_pass_through(iommu
, domain
,
2468 dev
, IOMMU_NO_PASID
);
2469 else if (domain
->use_first_level
)
2470 ret
= domain_setup_first_level(iommu
, domain
, dev
,
2473 ret
= intel_pasid_setup_second_level(iommu
, domain
,
2474 dev
, IOMMU_NO_PASID
);
2476 dev_err(dev
, "Setup RID2PASID failed\n");
2477 device_block_translation(dev
);
2482 ret
= domain_context_mapping(domain
, dev
);
2484 dev_err(dev
, "Domain context map failed\n");
2485 device_block_translation(dev
);
2489 iommu_enable_pci_caps(info
);
2495 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2496 * is relaxable (ie. is allowed to be not enforced under some conditions)
2497 * @dev: device handle
2499 * We assume that PCI USB devices with RMRRs have them largely
2500 * for historical reasons and that the RMRR space is not actively used post
2501 * boot. This exclusion may change if vendors begin to abuse it.
2503 * The same exception is made for graphics devices, with the requirement that
2504 * any use of the RMRR regions will be torn down before assigning the device
2507 * Return: true if the RMRR is relaxable, false otherwise
2509 static bool device_rmrr_is_relaxable(struct device
*dev
)
2511 struct pci_dev
*pdev
;
2513 if (!dev_is_pci(dev
))
2516 pdev
= to_pci_dev(dev
);
2517 if (IS_USB_DEVICE(pdev
) || IS_GFX_DEVICE(pdev
))
2524 * Return the required default domain type for a specific device.
2526 * @dev: the device in query
2527 * @startup: true if this is during early boot
2530 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2531 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2532 * - 0: both identity and dynamic domains work for this device
2534 static int device_def_domain_type(struct device
*dev
)
2536 if (dev_is_pci(dev
)) {
2537 struct pci_dev
*pdev
= to_pci_dev(dev
);
2539 if ((iommu_identity_mapping
& IDENTMAP_AZALIA
) && IS_AZALIA(pdev
))
2540 return IOMMU_DOMAIN_IDENTITY
;
2542 if ((iommu_identity_mapping
& IDENTMAP_GFX
) && IS_GFX_DEVICE(pdev
))
2543 return IOMMU_DOMAIN_IDENTITY
;
2549 static void intel_iommu_init_qi(struct intel_iommu
*iommu
)
2552 * Start from the sane iommu hardware state.
2553 * If the queued invalidation is already initialized by us
2554 * (for example, while enabling interrupt-remapping) then
2555 * we got the things already rolling from a sane state.
2559 * Clear any previous faults.
2561 dmar_fault(-1, iommu
);
2563 * Disable queued invalidation if supported and already enabled
2564 * before OS handover.
2566 dmar_disable_qi(iommu
);
2569 if (dmar_enable_qi(iommu
)) {
2571 * Queued Invalidate not enabled, use Register Based Invalidate
2573 iommu
->flush
.flush_context
= __iommu_flush_context
;
2574 iommu
->flush
.flush_iotlb
= __iommu_flush_iotlb
;
2575 pr_info("%s: Using Register based invalidation\n",
2578 iommu
->flush
.flush_context
= qi_flush_context
;
2579 iommu
->flush
.flush_iotlb
= qi_flush_iotlb
;
2580 pr_info("%s: Using Queued invalidation\n", iommu
->name
);
2584 static int copy_context_table(struct intel_iommu
*iommu
,
2585 struct root_entry
*old_re
,
2586 struct context_entry
**tbl
,
2589 int tbl_idx
, pos
= 0, idx
, devfn
, ret
= 0, did
;
2590 struct context_entry
*new_ce
= NULL
, ce
;
2591 struct context_entry
*old_ce
= NULL
;
2592 struct root_entry re
;
2593 phys_addr_t old_ce_phys
;
2595 tbl_idx
= ext
? bus
* 2 : bus
;
2596 memcpy(&re
, old_re
, sizeof(re
));
2598 for (devfn
= 0; devfn
< 256; devfn
++) {
2599 /* First calculate the correct index */
2600 idx
= (ext
? devfn
* 2 : devfn
) % 256;
2603 /* First save what we may have and clean up */
2605 tbl
[tbl_idx
] = new_ce
;
2606 __iommu_flush_cache(iommu
, new_ce
,
2616 old_ce_phys
= root_entry_lctp(&re
);
2618 old_ce_phys
= root_entry_uctp(&re
);
2621 if (ext
&& devfn
== 0) {
2622 /* No LCTP, try UCTP */
2631 old_ce
= memremap(old_ce_phys
, PAGE_SIZE
,
2636 new_ce
= alloc_pgtable_page(iommu
->node
, GFP_KERNEL
);
2643 /* Now copy the context entry */
2644 memcpy(&ce
, old_ce
+ idx
, sizeof(ce
));
2646 if (!context_present(&ce
))
2649 did
= context_domain_id(&ce
);
2650 if (did
>= 0 && did
< cap_ndoms(iommu
->cap
))
2651 set_bit(did
, iommu
->domain_ids
);
2653 set_context_copied(iommu
, bus
, devfn
);
2657 tbl
[tbl_idx
+ pos
] = new_ce
;
2659 __iommu_flush_cache(iommu
, new_ce
, VTD_PAGE_SIZE
);
2668 static int copy_translation_tables(struct intel_iommu
*iommu
)
2670 struct context_entry
**ctxt_tbls
;
2671 struct root_entry
*old_rt
;
2672 phys_addr_t old_rt_phys
;
2673 int ctxt_table_entries
;
2678 rtaddr_reg
= dmar_readq(iommu
->reg
+ DMAR_RTADDR_REG
);
2679 ext
= !!(rtaddr_reg
& DMA_RTADDR_SMT
);
2680 new_ext
= !!sm_supported(iommu
);
2683 * The RTT bit can only be changed when translation is disabled,
2684 * but disabling translation means to open a window for data
2685 * corruption. So bail out and don't copy anything if we would
2686 * have to change the bit.
2691 iommu
->copied_tables
= bitmap_zalloc(BIT_ULL(16), GFP_KERNEL
);
2692 if (!iommu
->copied_tables
)
2695 old_rt_phys
= rtaddr_reg
& VTD_PAGE_MASK
;
2699 old_rt
= memremap(old_rt_phys
, PAGE_SIZE
, MEMREMAP_WB
);
2703 /* This is too big for the stack - allocate it from slab */
2704 ctxt_table_entries
= ext
? 512 : 256;
2706 ctxt_tbls
= kcalloc(ctxt_table_entries
, sizeof(void *), GFP_KERNEL
);
2710 for (bus
= 0; bus
< 256; bus
++) {
2711 ret
= copy_context_table(iommu
, &old_rt
[bus
],
2712 ctxt_tbls
, bus
, ext
);
2714 pr_err("%s: Failed to copy context table for bus %d\n",
2720 spin_lock(&iommu
->lock
);
2722 /* Context tables are copied, now write them to the root_entry table */
2723 for (bus
= 0; bus
< 256; bus
++) {
2724 int idx
= ext
? bus
* 2 : bus
;
2727 if (ctxt_tbls
[idx
]) {
2728 val
= virt_to_phys(ctxt_tbls
[idx
]) | 1;
2729 iommu
->root_entry
[bus
].lo
= val
;
2732 if (!ext
|| !ctxt_tbls
[idx
+ 1])
2735 val
= virt_to_phys(ctxt_tbls
[idx
+ 1]) | 1;
2736 iommu
->root_entry
[bus
].hi
= val
;
2739 spin_unlock(&iommu
->lock
);
2743 __iommu_flush_cache(iommu
, iommu
->root_entry
, PAGE_SIZE
);
2753 static int __init
init_dmars(void)
2755 struct dmar_drhd_unit
*drhd
;
2756 struct intel_iommu
*iommu
;
2759 ret
= intel_cap_audit(CAP_AUDIT_STATIC_DMAR
, NULL
);
2763 for_each_iommu(iommu
, drhd
) {
2764 if (drhd
->ignored
) {
2765 iommu_disable_translation(iommu
);
2770 * Find the max pasid size of all IOMMU's in the system.
2771 * We need to ensure the system pasid table is no bigger
2772 * than the smallest supported.
2774 if (pasid_supported(iommu
)) {
2775 u32 temp
= 2 << ecap_pss(iommu
->ecap
);
2777 intel_pasid_max_id
= min_t(u32
, temp
,
2778 intel_pasid_max_id
);
2781 intel_iommu_init_qi(iommu
);
2783 ret
= iommu_init_domains(iommu
);
2787 init_translation_status(iommu
);
2789 if (translation_pre_enabled(iommu
) && !is_kdump_kernel()) {
2790 iommu_disable_translation(iommu
);
2791 clear_translation_pre_enabled(iommu
);
2792 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2798 * we could share the same root & context tables
2799 * among all IOMMU's. Need to Split it later.
2801 ret
= iommu_alloc_root_entry(iommu
);
2805 if (translation_pre_enabled(iommu
)) {
2806 pr_info("Translation already enabled - trying to copy translation structures\n");
2808 ret
= copy_translation_tables(iommu
);
2811 * We found the IOMMU with translation
2812 * enabled - but failed to copy over the
2813 * old root-entry table. Try to proceed
2814 * by disabling translation now and
2815 * allocating a clean root-entry table.
2816 * This might cause DMAR faults, but
2817 * probably the dump will still succeed.
2819 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2821 iommu_disable_translation(iommu
);
2822 clear_translation_pre_enabled(iommu
);
2824 pr_info("Copied translation tables from previous kernel for %s\n",
2829 if (!ecap_pass_through(iommu
->ecap
))
2830 hw_pass_through
= 0;
2831 intel_svm_check(iommu
);
2835 * Now that qi is enabled on all iommus, set the root entry and flush
2836 * caches. This is required on some Intel X58 chipsets, otherwise the
2837 * flush_context function will loop forever and the boot hangs.
2839 for_each_active_iommu(iommu
, drhd
) {
2840 iommu_flush_write_buffer(iommu
);
2841 iommu_set_root_entry(iommu
);
2844 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849 iommu_identity_mapping
|= IDENTMAP_GFX
;
2851 check_tylersburg_isoch();
2853 ret
= si_domain_init(hw_pass_through
);
2860 * global invalidate context cache
2861 * global invalidate iotlb
2862 * enable translation
2864 for_each_iommu(iommu
, drhd
) {
2865 if (drhd
->ignored
) {
2867 * we always have to disable PMRs or DMA may fail on
2871 iommu_disable_protect_mem_regions(iommu
);
2875 iommu_flush_write_buffer(iommu
);
2877 #ifdef CONFIG_INTEL_IOMMU_SVM
2878 if (pasid_supported(iommu
) && ecap_prs(iommu
->ecap
)) {
2880 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2881 * could cause possible lock race condition.
2883 up_write(&dmar_global_lock
);
2884 ret
= intel_svm_enable_prq(iommu
);
2885 down_write(&dmar_global_lock
);
2890 ret
= dmar_set_interrupt(iommu
);
2898 for_each_active_iommu(iommu
, drhd
) {
2899 disable_dmar_iommu(iommu
);
2900 free_dmar_iommu(iommu
);
2903 domain_exit(si_domain
);
2910 static void __init
init_no_remapping_devices(void)
2912 struct dmar_drhd_unit
*drhd
;
2916 for_each_drhd_unit(drhd
) {
2917 if (!drhd
->include_all
) {
2918 for_each_active_dev_scope(drhd
->devices
,
2919 drhd
->devices_cnt
, i
, dev
)
2921 /* ignore DMAR unit if no devices exist */
2922 if (i
== drhd
->devices_cnt
)
2927 for_each_active_drhd_unit(drhd
) {
2928 if (drhd
->include_all
)
2931 for_each_active_dev_scope(drhd
->devices
,
2932 drhd
->devices_cnt
, i
, dev
)
2933 if (!dev_is_pci(dev
) || !IS_GFX_DEVICE(to_pci_dev(dev
)))
2935 if (i
< drhd
->devices_cnt
)
2938 /* This IOMMU has *only* gfx devices. Either bypass it or
2939 set the gfx_mapped flag, as appropriate */
2940 drhd
->gfx_dedicated
= 1;
2946 #ifdef CONFIG_SUSPEND
2947 static int init_iommu_hw(void)
2949 struct dmar_drhd_unit
*drhd
;
2950 struct intel_iommu
*iommu
= NULL
;
2953 for_each_active_iommu(iommu
, drhd
) {
2955 ret
= dmar_reenable_qi(iommu
);
2961 for_each_iommu(iommu
, drhd
) {
2962 if (drhd
->ignored
) {
2964 * we always have to disable PMRs or DMA may fail on
2968 iommu_disable_protect_mem_regions(iommu
);
2972 iommu_flush_write_buffer(iommu
);
2973 iommu_set_root_entry(iommu
);
2974 iommu_enable_translation(iommu
);
2975 iommu_disable_protect_mem_regions(iommu
);
2981 static void iommu_flush_all(void)
2983 struct dmar_drhd_unit
*drhd
;
2984 struct intel_iommu
*iommu
;
2986 for_each_active_iommu(iommu
, drhd
) {
2987 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
2988 DMA_CCMD_GLOBAL_INVL
);
2989 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0,
2990 DMA_TLB_GLOBAL_FLUSH
);
2994 static int iommu_suspend(void)
2996 struct dmar_drhd_unit
*drhd
;
2997 struct intel_iommu
*iommu
= NULL
;
3000 for_each_active_iommu(iommu
, drhd
) {
3001 iommu
->iommu_state
= kcalloc(MAX_SR_DMAR_REGS
, sizeof(u32
),
3003 if (!iommu
->iommu_state
)
3009 for_each_active_iommu(iommu
, drhd
) {
3010 iommu_disable_translation(iommu
);
3012 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
3014 iommu
->iommu_state
[SR_DMAR_FECTL_REG
] =
3015 readl(iommu
->reg
+ DMAR_FECTL_REG
);
3016 iommu
->iommu_state
[SR_DMAR_FEDATA_REG
] =
3017 readl(iommu
->reg
+ DMAR_FEDATA_REG
);
3018 iommu
->iommu_state
[SR_DMAR_FEADDR_REG
] =
3019 readl(iommu
->reg
+ DMAR_FEADDR_REG
);
3020 iommu
->iommu_state
[SR_DMAR_FEUADDR_REG
] =
3021 readl(iommu
->reg
+ DMAR_FEUADDR_REG
);
3023 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
3028 for_each_active_iommu(iommu
, drhd
)
3029 kfree(iommu
->iommu_state
);
3034 static void iommu_resume(void)
3036 struct dmar_drhd_unit
*drhd
;
3037 struct intel_iommu
*iommu
= NULL
;
3040 if (init_iommu_hw()) {
3042 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3044 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3048 for_each_active_iommu(iommu
, drhd
) {
3050 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
3052 writel(iommu
->iommu_state
[SR_DMAR_FECTL_REG
],
3053 iommu
->reg
+ DMAR_FECTL_REG
);
3054 writel(iommu
->iommu_state
[SR_DMAR_FEDATA_REG
],
3055 iommu
->reg
+ DMAR_FEDATA_REG
);
3056 writel(iommu
->iommu_state
[SR_DMAR_FEADDR_REG
],
3057 iommu
->reg
+ DMAR_FEADDR_REG
);
3058 writel(iommu
->iommu_state
[SR_DMAR_FEUADDR_REG
],
3059 iommu
->reg
+ DMAR_FEUADDR_REG
);
3061 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
3064 for_each_active_iommu(iommu
, drhd
)
3065 kfree(iommu
->iommu_state
);
3068 static struct syscore_ops iommu_syscore_ops
= {
3069 .resume
= iommu_resume
,
3070 .suspend
= iommu_suspend
,
3073 static void __init
init_iommu_pm_ops(void)
3075 register_syscore_ops(&iommu_syscore_ops
);
3079 static inline void init_iommu_pm_ops(void) {}
3080 #endif /* CONFIG_PM */
3082 static int __init
rmrr_sanity_check(struct acpi_dmar_reserved_memory
*rmrr
)
3084 if (!IS_ALIGNED(rmrr
->base_address
, PAGE_SIZE
) ||
3085 !IS_ALIGNED(rmrr
->end_address
+ 1, PAGE_SIZE
) ||
3086 rmrr
->end_address
<= rmrr
->base_address
||
3087 arch_rmrr_sanity_check(rmrr
))
3093 int __init
dmar_parse_one_rmrr(struct acpi_dmar_header
*header
, void *arg
)
3095 struct acpi_dmar_reserved_memory
*rmrr
;
3096 struct dmar_rmrr_unit
*rmrru
;
3098 rmrr
= (struct acpi_dmar_reserved_memory
*)header
;
3099 if (rmrr_sanity_check(rmrr
)) {
3101 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3103 rmrr
->base_address
, rmrr
->end_address
,
3104 dmi_get_system_info(DMI_BIOS_VENDOR
),
3105 dmi_get_system_info(DMI_BIOS_VERSION
),
3106 dmi_get_system_info(DMI_PRODUCT_VERSION
));
3107 add_taint(TAINT_FIRMWARE_WORKAROUND
, LOCKDEP_STILL_OK
);
3110 rmrru
= kzalloc(sizeof(*rmrru
), GFP_KERNEL
);
3114 rmrru
->hdr
= header
;
3116 rmrru
->base_address
= rmrr
->base_address
;
3117 rmrru
->end_address
= rmrr
->end_address
;
3119 rmrru
->devices
= dmar_alloc_dev_scope((void *)(rmrr
+ 1),
3120 ((void *)rmrr
) + rmrr
->header
.length
,
3121 &rmrru
->devices_cnt
);
3122 if (rmrru
->devices_cnt
&& rmrru
->devices
== NULL
)
3125 list_add(&rmrru
->list
, &dmar_rmrr_units
);
3134 static struct dmar_atsr_unit
*dmar_find_atsr(struct acpi_dmar_atsr
*atsr
)
3136 struct dmar_atsr_unit
*atsru
;
3137 struct acpi_dmar_atsr
*tmp
;
3139 list_for_each_entry_rcu(atsru
, &dmar_atsr_units
, list
,
3141 tmp
= (struct acpi_dmar_atsr
*)atsru
->hdr
;
3142 if (atsr
->segment
!= tmp
->segment
)
3144 if (atsr
->header
.length
!= tmp
->header
.length
)
3146 if (memcmp(atsr
, tmp
, atsr
->header
.length
) == 0)
3153 int dmar_parse_one_atsr(struct acpi_dmar_header
*hdr
, void *arg
)
3155 struct acpi_dmar_atsr
*atsr
;
3156 struct dmar_atsr_unit
*atsru
;
3158 if (system_state
>= SYSTEM_RUNNING
&& !intel_iommu_enabled
)
3161 atsr
= container_of(hdr
, struct acpi_dmar_atsr
, header
);
3162 atsru
= dmar_find_atsr(atsr
);
3166 atsru
= kzalloc(sizeof(*atsru
) + hdr
->length
, GFP_KERNEL
);
3171 * If memory is allocated from slab by ACPI _DSM method, we need to
3172 * copy the memory content because the memory buffer will be freed
3175 atsru
->hdr
= (void *)(atsru
+ 1);
3176 memcpy(atsru
->hdr
, hdr
, hdr
->length
);
3177 atsru
->include_all
= atsr
->flags
& 0x1;
3178 if (!atsru
->include_all
) {
3179 atsru
->devices
= dmar_alloc_dev_scope((void *)(atsr
+ 1),
3180 (void *)atsr
+ atsr
->header
.length
,
3181 &atsru
->devices_cnt
);
3182 if (atsru
->devices_cnt
&& atsru
->devices
== NULL
) {
3188 list_add_rcu(&atsru
->list
, &dmar_atsr_units
);
3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit
*atsru
)
3195 dmar_free_dev_scope(&atsru
->devices
, &atsru
->devices_cnt
);
3199 int dmar_release_one_atsr(struct acpi_dmar_header
*hdr
, void *arg
)
3201 struct acpi_dmar_atsr
*atsr
;
3202 struct dmar_atsr_unit
*atsru
;
3204 atsr
= container_of(hdr
, struct acpi_dmar_atsr
, header
);
3205 atsru
= dmar_find_atsr(atsr
);
3207 list_del_rcu(&atsru
->list
);
3209 intel_iommu_free_atsr(atsru
);
3215 int dmar_check_one_atsr(struct acpi_dmar_header
*hdr
, void *arg
)
3219 struct acpi_dmar_atsr
*atsr
;
3220 struct dmar_atsr_unit
*atsru
;
3222 atsr
= container_of(hdr
, struct acpi_dmar_atsr
, header
);
3223 atsru
= dmar_find_atsr(atsr
);
3227 if (!atsru
->include_all
&& atsru
->devices
&& atsru
->devices_cnt
) {
3228 for_each_active_dev_scope(atsru
->devices
, atsru
->devices_cnt
,
3236 static struct dmar_satc_unit
*dmar_find_satc(struct acpi_dmar_satc
*satc
)
3238 struct dmar_satc_unit
*satcu
;
3239 struct acpi_dmar_satc
*tmp
;
3241 list_for_each_entry_rcu(satcu
, &dmar_satc_units
, list
,
3243 tmp
= (struct acpi_dmar_satc
*)satcu
->hdr
;
3244 if (satc
->segment
!= tmp
->segment
)
3246 if (satc
->header
.length
!= tmp
->header
.length
)
3248 if (memcmp(satc
, tmp
, satc
->header
.length
) == 0)
3255 int dmar_parse_one_satc(struct acpi_dmar_header
*hdr
, void *arg
)
3257 struct acpi_dmar_satc
*satc
;
3258 struct dmar_satc_unit
*satcu
;
3260 if (system_state
>= SYSTEM_RUNNING
&& !intel_iommu_enabled
)
3263 satc
= container_of(hdr
, struct acpi_dmar_satc
, header
);
3264 satcu
= dmar_find_satc(satc
);
3268 satcu
= kzalloc(sizeof(*satcu
) + hdr
->length
, GFP_KERNEL
);
3272 satcu
->hdr
= (void *)(satcu
+ 1);
3273 memcpy(satcu
->hdr
, hdr
, hdr
->length
);
3274 satcu
->atc_required
= satc
->flags
& 0x1;
3275 satcu
->devices
= dmar_alloc_dev_scope((void *)(satc
+ 1),
3276 (void *)satc
+ satc
->header
.length
,
3277 &satcu
->devices_cnt
);
3278 if (satcu
->devices_cnt
&& !satcu
->devices
) {
3282 list_add_rcu(&satcu
->list
, &dmar_satc_units
);
3287 static int intel_iommu_add(struct dmar_drhd_unit
*dmaru
)
3290 struct intel_iommu
*iommu
= dmaru
->iommu
;
3292 ret
= intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR
, iommu
);
3296 if (hw_pass_through
&& !ecap_pass_through(iommu
->ecap
)) {
3297 pr_warn("%s: Doesn't support hardware pass through.\n",
3302 sp
= domain_update_iommu_superpage(NULL
, iommu
) - 1;
3303 if (sp
>= 0 && !(cap_super_page_val(iommu
->cap
) & (1 << sp
))) {
3304 pr_warn("%s: Doesn't support large page.\n",
3310 * Disable translation if already enabled prior to OS handover.
3312 if (iommu
->gcmd
& DMA_GCMD_TE
)
3313 iommu_disable_translation(iommu
);
3315 ret
= iommu_init_domains(iommu
);
3317 ret
= iommu_alloc_root_entry(iommu
);
3321 intel_svm_check(iommu
);
3323 if (dmaru
->ignored
) {
3325 * we always have to disable PMRs or DMA may fail on this device
3328 iommu_disable_protect_mem_regions(iommu
);
3332 intel_iommu_init_qi(iommu
);
3333 iommu_flush_write_buffer(iommu
);
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu
) && ecap_prs(iommu
->ecap
)) {
3337 ret
= intel_svm_enable_prq(iommu
);
3342 ret
= dmar_set_interrupt(iommu
);
3346 iommu_set_root_entry(iommu
);
3347 iommu_enable_translation(iommu
);
3349 iommu_disable_protect_mem_regions(iommu
);
3353 disable_dmar_iommu(iommu
);
3355 free_dmar_iommu(iommu
);
3359 int dmar_iommu_hotplug(struct dmar_drhd_unit
*dmaru
, bool insert
)
3362 struct intel_iommu
*iommu
= dmaru
->iommu
;
3364 if (!intel_iommu_enabled
)
3370 ret
= intel_iommu_add(dmaru
);
3372 disable_dmar_iommu(iommu
);
3373 free_dmar_iommu(iommu
);
3379 static void intel_iommu_free_dmars(void)
3381 struct dmar_rmrr_unit
*rmrru
, *rmrr_n
;
3382 struct dmar_atsr_unit
*atsru
, *atsr_n
;
3383 struct dmar_satc_unit
*satcu
, *satc_n
;
3385 list_for_each_entry_safe(rmrru
, rmrr_n
, &dmar_rmrr_units
, list
) {
3386 list_del(&rmrru
->list
);
3387 dmar_free_dev_scope(&rmrru
->devices
, &rmrru
->devices_cnt
);
3391 list_for_each_entry_safe(atsru
, atsr_n
, &dmar_atsr_units
, list
) {
3392 list_del(&atsru
->list
);
3393 intel_iommu_free_atsr(atsru
);
3395 list_for_each_entry_safe(satcu
, satc_n
, &dmar_satc_units
, list
) {
3396 list_del(&satcu
->list
);
3397 dmar_free_dev_scope(&satcu
->devices
, &satcu
->devices_cnt
);
3402 static struct dmar_satc_unit
*dmar_find_matched_satc_unit(struct pci_dev
*dev
)
3404 struct dmar_satc_unit
*satcu
;
3405 struct acpi_dmar_satc
*satc
;
3409 dev
= pci_physfn(dev
);
3412 list_for_each_entry_rcu(satcu
, &dmar_satc_units
, list
) {
3413 satc
= container_of(satcu
->hdr
, struct acpi_dmar_satc
, header
);
3414 if (satc
->segment
!= pci_domain_nr(dev
->bus
))
3416 for_each_dev_scope(satcu
->devices
, satcu
->devices_cnt
, i
, tmp
)
3417 if (to_pci_dev(tmp
) == dev
)
3426 static int dmar_ats_supported(struct pci_dev
*dev
, struct intel_iommu
*iommu
)
3429 struct pci_bus
*bus
;
3430 struct pci_dev
*bridge
= NULL
;
3432 struct acpi_dmar_atsr
*atsr
;
3433 struct dmar_atsr_unit
*atsru
;
3434 struct dmar_satc_unit
*satcu
;
3436 dev
= pci_physfn(dev
);
3437 satcu
= dmar_find_matched_satc_unit(dev
);
3440 * This device supports ATS as it is in SATC table.
3441 * When IOMMU is in legacy mode, enabling ATS is done
3442 * automatically by HW for the device that requires
3443 * ATS, hence OS should not enable this device ATS
3444 * to avoid duplicated TLB invalidation.
3446 return !(satcu
->atc_required
&& !sm_supported(iommu
));
3448 for (bus
= dev
->bus
; bus
; bus
= bus
->parent
) {
3450 /* If it's an integrated device, allow ATS */
3453 /* Connected via non-PCIe: no ATS */
3454 if (!pci_is_pcie(bridge
) ||
3455 pci_pcie_type(bridge
) == PCI_EXP_TYPE_PCI_BRIDGE
)
3457 /* If we found the root port, look it up in the ATSR */
3458 if (pci_pcie_type(bridge
) == PCI_EXP_TYPE_ROOT_PORT
)
3463 list_for_each_entry_rcu(atsru
, &dmar_atsr_units
, list
) {
3464 atsr
= container_of(atsru
->hdr
, struct acpi_dmar_atsr
, header
);
3465 if (atsr
->segment
!= pci_domain_nr(dev
->bus
))
3468 for_each_dev_scope(atsru
->devices
, atsru
->devices_cnt
, i
, tmp
)
3469 if (tmp
== &bridge
->dev
)
3472 if (atsru
->include_all
)
3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info
*info
)
3485 struct dmar_rmrr_unit
*rmrru
;
3486 struct dmar_atsr_unit
*atsru
;
3487 struct dmar_satc_unit
*satcu
;
3488 struct acpi_dmar_atsr
*atsr
;
3489 struct acpi_dmar_reserved_memory
*rmrr
;
3490 struct acpi_dmar_satc
*satc
;
3492 if (!intel_iommu_enabled
&& system_state
>= SYSTEM_RUNNING
)
3495 list_for_each_entry(rmrru
, &dmar_rmrr_units
, list
) {
3496 rmrr
= container_of(rmrru
->hdr
,
3497 struct acpi_dmar_reserved_memory
, header
);
3498 if (info
->event
== BUS_NOTIFY_ADD_DEVICE
) {
3499 ret
= dmar_insert_dev_scope(info
, (void *)(rmrr
+ 1),
3500 ((void *)rmrr
) + rmrr
->header
.length
,
3501 rmrr
->segment
, rmrru
->devices
,
3502 rmrru
->devices_cnt
);
3505 } else if (info
->event
== BUS_NOTIFY_REMOVED_DEVICE
) {
3506 dmar_remove_dev_scope(info
, rmrr
->segment
,
3507 rmrru
->devices
, rmrru
->devices_cnt
);
3511 list_for_each_entry(atsru
, &dmar_atsr_units
, list
) {
3512 if (atsru
->include_all
)
3515 atsr
= container_of(atsru
->hdr
, struct acpi_dmar_atsr
, header
);
3516 if (info
->event
== BUS_NOTIFY_ADD_DEVICE
) {
3517 ret
= dmar_insert_dev_scope(info
, (void *)(atsr
+ 1),
3518 (void *)atsr
+ atsr
->header
.length
,
3519 atsr
->segment
, atsru
->devices
,
3520 atsru
->devices_cnt
);
3525 } else if (info
->event
== BUS_NOTIFY_REMOVED_DEVICE
) {
3526 if (dmar_remove_dev_scope(info
, atsr
->segment
,
3527 atsru
->devices
, atsru
->devices_cnt
))
3531 list_for_each_entry(satcu
, &dmar_satc_units
, list
) {
3532 satc
= container_of(satcu
->hdr
, struct acpi_dmar_satc
, header
);
3533 if (info
->event
== BUS_NOTIFY_ADD_DEVICE
) {
3534 ret
= dmar_insert_dev_scope(info
, (void *)(satc
+ 1),
3535 (void *)satc
+ satc
->header
.length
,
3536 satc
->segment
, satcu
->devices
,
3537 satcu
->devices_cnt
);
3542 } else if (info
->event
== BUS_NOTIFY_REMOVED_DEVICE
) {
3543 if (dmar_remove_dev_scope(info
, satc
->segment
,
3544 satcu
->devices
, satcu
->devices_cnt
))
3552 static int intel_iommu_memory_notifier(struct notifier_block
*nb
,
3553 unsigned long val
, void *v
)
3555 struct memory_notify
*mhp
= v
;
3556 unsigned long start_vpfn
= mm_to_dma_pfn_start(mhp
->start_pfn
);
3557 unsigned long last_vpfn
= mm_to_dma_pfn_end(mhp
->start_pfn
+
3561 case MEM_GOING_ONLINE
:
3562 if (iommu_domain_identity_map(si_domain
,
3563 start_vpfn
, last_vpfn
)) {
3564 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3565 start_vpfn
, last_vpfn
);
3571 case MEM_CANCEL_ONLINE
:
3573 struct dmar_drhd_unit
*drhd
;
3574 struct intel_iommu
*iommu
;
3575 LIST_HEAD(freelist
);
3577 domain_unmap(si_domain
, start_vpfn
, last_vpfn
, &freelist
);
3580 for_each_active_iommu(iommu
, drhd
)
3581 iommu_flush_iotlb_psi(iommu
, si_domain
,
3582 start_vpfn
, mhp
->nr_pages
,
3583 list_empty(&freelist
), 0);
3585 put_pages_list(&freelist
);
3593 static struct notifier_block intel_iommu_memory_nb
= {
3594 .notifier_call
= intel_iommu_memory_notifier
,
3598 static void intel_disable_iommus(void)
3600 struct intel_iommu
*iommu
= NULL
;
3601 struct dmar_drhd_unit
*drhd
;
3603 for_each_iommu(iommu
, drhd
)
3604 iommu_disable_translation(iommu
);
3607 void intel_iommu_shutdown(void)
3609 struct dmar_drhd_unit
*drhd
;
3610 struct intel_iommu
*iommu
= NULL
;
3612 if (no_iommu
|| dmar_disabled
)
3615 down_write(&dmar_global_lock
);
3617 /* Disable PMRs explicitly here. */
3618 for_each_iommu(iommu
, drhd
)
3619 iommu_disable_protect_mem_regions(iommu
);
3621 /* Make sure the IOMMUs are switched off */
3622 intel_disable_iommus();
3624 up_write(&dmar_global_lock
);
3627 static inline struct intel_iommu
*dev_to_intel_iommu(struct device
*dev
)
3629 struct iommu_device
*iommu_dev
= dev_to_iommu_device(dev
);
3631 return container_of(iommu_dev
, struct intel_iommu
, iommu
);
3634 static ssize_t
version_show(struct device
*dev
,
3635 struct device_attribute
*attr
, char *buf
)
3637 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3638 u32 ver
= readl(iommu
->reg
+ DMAR_VER_REG
);
3639 return sysfs_emit(buf
, "%d:%d\n",
3640 DMAR_VER_MAJOR(ver
), DMAR_VER_MINOR(ver
));
3642 static DEVICE_ATTR_RO(version
);
3644 static ssize_t
address_show(struct device
*dev
,
3645 struct device_attribute
*attr
, char *buf
)
3647 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3648 return sysfs_emit(buf
, "%llx\n", iommu
->reg_phys
);
3650 static DEVICE_ATTR_RO(address
);
3652 static ssize_t
cap_show(struct device
*dev
,
3653 struct device_attribute
*attr
, char *buf
)
3655 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3656 return sysfs_emit(buf
, "%llx\n", iommu
->cap
);
3658 static DEVICE_ATTR_RO(cap
);
3660 static ssize_t
ecap_show(struct device
*dev
,
3661 struct device_attribute
*attr
, char *buf
)
3663 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3664 return sysfs_emit(buf
, "%llx\n", iommu
->ecap
);
3666 static DEVICE_ATTR_RO(ecap
);
3668 static ssize_t
domains_supported_show(struct device
*dev
,
3669 struct device_attribute
*attr
, char *buf
)
3671 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3672 return sysfs_emit(buf
, "%ld\n", cap_ndoms(iommu
->cap
));
3674 static DEVICE_ATTR_RO(domains_supported
);
3676 static ssize_t
domains_used_show(struct device
*dev
,
3677 struct device_attribute
*attr
, char *buf
)
3679 struct intel_iommu
*iommu
= dev_to_intel_iommu(dev
);
3680 return sysfs_emit(buf
, "%d\n",
3681 bitmap_weight(iommu
->domain_ids
,
3682 cap_ndoms(iommu
->cap
)));
3684 static DEVICE_ATTR_RO(domains_used
);
3686 static struct attribute
*intel_iommu_attrs
[] = {
3687 &dev_attr_version
.attr
,
3688 &dev_attr_address
.attr
,
3690 &dev_attr_ecap
.attr
,
3691 &dev_attr_domains_supported
.attr
,
3692 &dev_attr_domains_used
.attr
,
3696 static struct attribute_group intel_iommu_group
= {
3697 .name
= "intel-iommu",
3698 .attrs
= intel_iommu_attrs
,
3701 const struct attribute_group
*intel_iommu_groups
[] = {
3706 static inline bool has_external_pci(void)
3708 struct pci_dev
*pdev
= NULL
;
3710 for_each_pci_dev(pdev
)
3711 if (pdev
->external_facing
) {
3719 static int __init
platform_optin_force_iommu(void)
3721 if (!dmar_platform_optin() || no_platform_optin
|| !has_external_pci())
3724 if (no_iommu
|| dmar_disabled
)
3725 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3728 * If Intel-IOMMU is disabled by default, we will apply identity
3729 * map for all devices except those marked as being untrusted.
3732 iommu_set_default_passthrough(false);
3740 static int __init
probe_acpi_namespace_devices(void)
3742 struct dmar_drhd_unit
*drhd
;
3743 /* To avoid a -Wunused-but-set-variable warning. */
3744 struct intel_iommu
*iommu __maybe_unused
;
3748 for_each_active_iommu(iommu
, drhd
) {
3749 for_each_active_dev_scope(drhd
->devices
,
3750 drhd
->devices_cnt
, i
, dev
) {
3751 struct acpi_device_physical_node
*pn
;
3752 struct acpi_device
*adev
;
3754 if (dev
->bus
!= &acpi_bus_type
)
3757 adev
= to_acpi_device(dev
);
3758 mutex_lock(&adev
->physical_node_lock
);
3759 list_for_each_entry(pn
,
3760 &adev
->physical_node_list
, node
) {
3761 ret
= iommu_probe_device(pn
->dev
);
3765 mutex_unlock(&adev
->physical_node_lock
);
3775 static __init
int tboot_force_iommu(void)
3777 if (!tboot_enabled())
3780 if (no_iommu
|| dmar_disabled
)
3781 pr_warn("Forcing Intel-IOMMU to enabled\n");
3789 int __init
intel_iommu_init(void)
3792 struct dmar_drhd_unit
*drhd
;
3793 struct intel_iommu
*iommu
;
3796 * Intel IOMMU is required for a TXT/tboot launch or platform
3797 * opt in, so enforce that.
3799 force_on
= (!intel_iommu_tboot_noforce
&& tboot_force_iommu()) ||
3800 platform_optin_force_iommu();
3802 down_write(&dmar_global_lock
);
3803 if (dmar_table_init()) {
3805 panic("tboot: Failed to initialize DMAR table\n");
3809 if (dmar_dev_scope_init() < 0) {
3811 panic("tboot: Failed to initialize DMAR device scope\n");
3815 up_write(&dmar_global_lock
);
3818 * The bus notifier takes the dmar_global_lock, so lockdep will
3819 * complain later when we register it under the lock.
3821 dmar_register_bus_notifier();
3823 down_write(&dmar_global_lock
);
3826 intel_iommu_debugfs_init();
3828 if (no_iommu
|| dmar_disabled
) {
3830 * We exit the function here to ensure IOMMU's remapping and
3831 * mempool aren't setup, which means that the IOMMU's PMRs
3832 * won't be disabled via the call to init_dmars(). So disable
3833 * it explicitly here. The PMRs were setup by tboot prior to
3834 * calling SENTER, but the kernel is expected to reset/tear
3837 if (intel_iommu_tboot_noforce
) {
3838 for_each_iommu(iommu
, drhd
)
3839 iommu_disable_protect_mem_regions(iommu
);
3843 * Make sure the IOMMUs are switched off, even when we
3844 * boot into a kexec kernel and the previous kernel left
3847 intel_disable_iommus();
3851 if (list_empty(&dmar_rmrr_units
))
3852 pr_info("No RMRR found\n");
3854 if (list_empty(&dmar_atsr_units
))
3855 pr_info("No ATSR found\n");
3857 if (list_empty(&dmar_satc_units
))
3858 pr_info("No SATC found\n");
3860 init_no_remapping_devices();
3865 panic("tboot: Failed to initialize DMARs\n");
3866 pr_err("Initialization failed\n");
3869 up_write(&dmar_global_lock
);
3871 init_iommu_pm_ops();
3873 down_read(&dmar_global_lock
);
3874 for_each_active_iommu(iommu
, drhd
) {
3876 * The flush queue implementation does not perform
3877 * page-selective invalidations that are required for efficient
3878 * TLB flushes in virtual environments. The benefit of batching
3879 * is likely to be much lower than the overhead of synchronizing
3880 * the virtual and physical IOMMU page-tables.
3882 if (cap_caching_mode(iommu
->cap
) &&
3883 !first_level_by_default(IOMMU_DOMAIN_DMA
)) {
3884 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3885 iommu_set_dma_strict();
3887 iommu_device_sysfs_add(&iommu
->iommu
, NULL
,
3890 iommu_device_register(&iommu
->iommu
, &intel_iommu_ops
, NULL
);
3892 iommu_pmu_register(iommu
);
3894 up_read(&dmar_global_lock
);
3896 if (si_domain
&& !hw_pass_through
)
3897 register_memory_notifier(&intel_iommu_memory_nb
);
3899 down_read(&dmar_global_lock
);
3900 if (probe_acpi_namespace_devices())
3901 pr_warn("ACPI name space devices didn't probe correctly\n");
3903 /* Finally, we enable the DMA remapping hardware. */
3904 for_each_iommu(iommu
, drhd
) {
3905 if (!drhd
->ignored
&& !translation_pre_enabled(iommu
))
3906 iommu_enable_translation(iommu
);
3908 iommu_disable_protect_mem_regions(iommu
);
3910 up_read(&dmar_global_lock
);
3912 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3914 intel_iommu_enabled
= 1;
3919 intel_iommu_free_dmars();
3920 up_write(&dmar_global_lock
);
3924 static int domain_context_clear_one_cb(struct pci_dev
*pdev
, u16 alias
, void *opaque
)
3926 struct device_domain_info
*info
= opaque
;
3928 domain_context_clear_one(info
, PCI_BUS_NUM(alias
), alias
& 0xff);
3933 * NB - intel-iommu lacks any sort of reference counting for the users of
3934 * dependent devices. If multiple endpoints have intersecting dependent
3935 * devices, unbinding the driver from any one of them will possibly leave
3936 * the others unable to operate.
3938 static void domain_context_clear(struct device_domain_info
*info
)
3940 if (!info
->iommu
|| !info
->dev
|| !dev_is_pci(info
->dev
))
3943 pci_for_each_dma_alias(to_pci_dev(info
->dev
),
3944 &domain_context_clear_one_cb
, info
);
3947 static void dmar_remove_one_dev_info(struct device
*dev
)
3949 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
3950 struct dmar_domain
*domain
= info
->domain
;
3951 struct intel_iommu
*iommu
= info
->iommu
;
3952 unsigned long flags
;
3954 if (!dev_is_real_dma_subdevice(info
->dev
)) {
3955 if (dev_is_pci(info
->dev
) && sm_supported(iommu
))
3956 intel_pasid_tear_down_entry(iommu
, info
->dev
,
3957 IOMMU_NO_PASID
, false);
3959 iommu_disable_pci_caps(info
);
3960 domain_context_clear(info
);
3963 spin_lock_irqsave(&domain
->lock
, flags
);
3964 list_del(&info
->link
);
3965 spin_unlock_irqrestore(&domain
->lock
, flags
);
3967 domain_detach_iommu(domain
, iommu
);
3968 info
->domain
= NULL
;
3972 * Clear the page table pointer in context or pasid table entries so that
3973 * all DMA requests without PASID from the device are blocked. If the page
3974 * table has been set, clean up the data structures.
3976 static void device_block_translation(struct device
*dev
)
3978 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
3979 struct intel_iommu
*iommu
= info
->iommu
;
3980 unsigned long flags
;
3982 iommu_disable_pci_caps(info
);
3983 if (!dev_is_real_dma_subdevice(dev
)) {
3984 if (sm_supported(iommu
))
3985 intel_pasid_tear_down_entry(iommu
, dev
,
3986 IOMMU_NO_PASID
, false);
3988 domain_context_clear(info
);
3994 spin_lock_irqsave(&info
->domain
->lock
, flags
);
3995 list_del(&info
->link
);
3996 spin_unlock_irqrestore(&info
->domain
->lock
, flags
);
3998 domain_detach_iommu(info
->domain
, iommu
);
3999 info
->domain
= NULL
;
4002 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
)
4006 /* calculate AGAW */
4007 domain
->gaw
= guest_width
;
4008 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
4009 domain
->agaw
= width_to_agaw(adjust_width
);
4011 domain
->iommu_coherency
= false;
4012 domain
->iommu_superpage
= 0;
4013 domain
->max_addr
= 0;
4015 /* always allocate the top pgd */
4016 domain
->pgd
= alloc_pgtable_page(domain
->nid
, GFP_ATOMIC
);
4019 domain_flush_cache(domain
, domain
->pgd
, PAGE_SIZE
);
4023 static int blocking_domain_attach_dev(struct iommu_domain
*domain
,
4026 device_block_translation(dev
);
4030 static struct iommu_domain blocking_domain
= {
4031 .ops
= &(const struct iommu_domain_ops
) {
4032 .attach_dev
= blocking_domain_attach_dev
,
4033 .free
= intel_iommu_domain_free
4037 static struct iommu_domain
*intel_iommu_domain_alloc(unsigned type
)
4039 struct dmar_domain
*dmar_domain
;
4040 struct iommu_domain
*domain
;
4043 case IOMMU_DOMAIN_BLOCKED
:
4044 return &blocking_domain
;
4045 case IOMMU_DOMAIN_DMA
:
4046 case IOMMU_DOMAIN_UNMANAGED
:
4047 dmar_domain
= alloc_domain(type
);
4049 pr_err("Can't allocate dmar_domain\n");
4052 if (md_domain_init(dmar_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
4053 pr_err("Domain initialization failed\n");
4054 domain_exit(dmar_domain
);
4058 domain
= &dmar_domain
->domain
;
4059 domain
->geometry
.aperture_start
= 0;
4060 domain
->geometry
.aperture_end
=
4061 __DOMAIN_MAX_ADDR(dmar_domain
->gaw
);
4062 domain
->geometry
.force_aperture
= true;
4065 case IOMMU_DOMAIN_IDENTITY
:
4066 return &si_domain
->domain
;
4067 case IOMMU_DOMAIN_SVA
:
4068 return intel_svm_domain_alloc();
4076 static void intel_iommu_domain_free(struct iommu_domain
*domain
)
4078 if (domain
!= &si_domain
->domain
&& domain
!= &blocking_domain
)
4079 domain_exit(to_dmar_domain(domain
));
4082 static int prepare_domain_attach_device(struct iommu_domain
*domain
,
4085 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4086 struct intel_iommu
*iommu
;
4089 iommu
= device_to_iommu(dev
, NULL
, NULL
);
4093 if (dmar_domain
->force_snooping
&& !ecap_sc_support(iommu
->ecap
))
4096 /* check if this iommu agaw is sufficient for max mapped address */
4097 addr_width
= agaw_to_width(iommu
->agaw
);
4098 if (addr_width
> cap_mgaw(iommu
->cap
))
4099 addr_width
= cap_mgaw(iommu
->cap
);
4101 if (dmar_domain
->max_addr
> (1LL << addr_width
))
4103 dmar_domain
->gaw
= addr_width
;
4106 * Knock out extra levels of page tables if necessary
4108 while (iommu
->agaw
< dmar_domain
->agaw
) {
4109 struct dma_pte
*pte
;
4111 pte
= dmar_domain
->pgd
;
4112 if (dma_pte_present(pte
)) {
4113 dmar_domain
->pgd
= phys_to_virt(dma_pte_addr(pte
));
4114 free_pgtable_page(pte
);
4116 dmar_domain
->agaw
--;
4122 static int intel_iommu_attach_device(struct iommu_domain
*domain
,
4125 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4129 device_block_translation(dev
);
4131 ret
= prepare_domain_attach_device(domain
, dev
);
4135 return dmar_domain_attach_device(to_dmar_domain(domain
), dev
);
4138 static int intel_iommu_map(struct iommu_domain
*domain
,
4139 unsigned long iova
, phys_addr_t hpa
,
4140 size_t size
, int iommu_prot
, gfp_t gfp
)
4142 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4146 if (iommu_prot
& IOMMU_READ
)
4147 prot
|= DMA_PTE_READ
;
4148 if (iommu_prot
& IOMMU_WRITE
)
4149 prot
|= DMA_PTE_WRITE
;
4150 if (dmar_domain
->set_pte_snp
)
4151 prot
|= DMA_PTE_SNP
;
4153 max_addr
= iova
+ size
;
4154 if (dmar_domain
->max_addr
< max_addr
) {
4157 /* check if minimum agaw is sufficient for mapped address */
4158 end
= __DOMAIN_MAX_ADDR(dmar_domain
->gaw
) + 1;
4159 if (end
< max_addr
) {
4160 pr_err("%s: iommu width (%d) is not "
4161 "sufficient for the mapped address (%llx)\n",
4162 __func__
, dmar_domain
->gaw
, max_addr
);
4165 dmar_domain
->max_addr
= max_addr
;
4167 /* Round up size to next multiple of PAGE_SIZE, if it and
4168 the low bits of hpa would take us onto the next page */
4169 size
= aligned_nrpages(hpa
, size
);
4170 return __domain_mapping(dmar_domain
, iova
>> VTD_PAGE_SHIFT
,
4171 hpa
>> VTD_PAGE_SHIFT
, size
, prot
, gfp
);
4174 static int intel_iommu_map_pages(struct iommu_domain
*domain
,
4175 unsigned long iova
, phys_addr_t paddr
,
4176 size_t pgsize
, size_t pgcount
,
4177 int prot
, gfp_t gfp
, size_t *mapped
)
4179 unsigned long pgshift
= __ffs(pgsize
);
4180 size_t size
= pgcount
<< pgshift
;
4183 if (pgsize
!= SZ_4K
&& pgsize
!= SZ_2M
&& pgsize
!= SZ_1G
)
4186 if (!IS_ALIGNED(iova
| paddr
, pgsize
))
4189 ret
= intel_iommu_map(domain
, iova
, paddr
, size
, prot
, gfp
);
4196 static size_t intel_iommu_unmap(struct iommu_domain
*domain
,
4197 unsigned long iova
, size_t size
,
4198 struct iommu_iotlb_gather
*gather
)
4200 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4201 unsigned long start_pfn
, last_pfn
;
4204 /* Cope with horrid API which requires us to unmap more than the
4205 size argument if it happens to be a large-page mapping. */
4206 if (unlikely(!pfn_to_dma_pte(dmar_domain
, iova
>> VTD_PAGE_SHIFT
,
4207 &level
, GFP_ATOMIC
)))
4210 if (size
< VTD_PAGE_SIZE
<< level_to_offset_bits(level
))
4211 size
= VTD_PAGE_SIZE
<< level_to_offset_bits(level
);
4213 start_pfn
= iova
>> VTD_PAGE_SHIFT
;
4214 last_pfn
= (iova
+ size
- 1) >> VTD_PAGE_SHIFT
;
4216 domain_unmap(dmar_domain
, start_pfn
, last_pfn
, &gather
->freelist
);
4218 if (dmar_domain
->max_addr
== iova
+ size
)
4219 dmar_domain
->max_addr
= iova
;
4222 * We do not use page-selective IOTLB invalidation in flush queue,
4223 * so there is no need to track page and sync iotlb.
4225 if (!iommu_iotlb_gather_queued(gather
))
4226 iommu_iotlb_gather_add_page(domain
, gather
, iova
, size
);
4231 static size_t intel_iommu_unmap_pages(struct iommu_domain
*domain
,
4233 size_t pgsize
, size_t pgcount
,
4234 struct iommu_iotlb_gather
*gather
)
4236 unsigned long pgshift
= __ffs(pgsize
);
4237 size_t size
= pgcount
<< pgshift
;
4239 return intel_iommu_unmap(domain
, iova
, size
, gather
);
4242 static void intel_iommu_tlb_sync(struct iommu_domain
*domain
,
4243 struct iommu_iotlb_gather
*gather
)
4245 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4246 unsigned long iova_pfn
= IOVA_PFN(gather
->start
);
4247 size_t size
= gather
->end
- gather
->start
;
4248 struct iommu_domain_info
*info
;
4249 unsigned long start_pfn
;
4250 unsigned long nrpages
;
4253 nrpages
= aligned_nrpages(gather
->start
, size
);
4254 start_pfn
= mm_to_dma_pfn_start(iova_pfn
);
4256 xa_for_each(&dmar_domain
->iommu_array
, i
, info
)
4257 iommu_flush_iotlb_psi(info
->iommu
, dmar_domain
,
4259 list_empty(&gather
->freelist
), 0);
4261 put_pages_list(&gather
->freelist
);
4264 static phys_addr_t
intel_iommu_iova_to_phys(struct iommu_domain
*domain
,
4267 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4268 struct dma_pte
*pte
;
4272 pte
= pfn_to_dma_pte(dmar_domain
, iova
>> VTD_PAGE_SHIFT
, &level
,
4274 if (pte
&& dma_pte_present(pte
))
4275 phys
= dma_pte_addr(pte
) +
4276 (iova
& (BIT_MASK(level_to_offset_bits(level
) +
4277 VTD_PAGE_SHIFT
) - 1));
4282 static bool domain_support_force_snooping(struct dmar_domain
*domain
)
4284 struct device_domain_info
*info
;
4285 bool support
= true;
4287 assert_spin_locked(&domain
->lock
);
4288 list_for_each_entry(info
, &domain
->devices
, link
) {
4289 if (!ecap_sc_support(info
->iommu
->ecap
)) {
4298 static void domain_set_force_snooping(struct dmar_domain
*domain
)
4300 struct device_domain_info
*info
;
4302 assert_spin_locked(&domain
->lock
);
4304 * Second level page table supports per-PTE snoop control. The
4305 * iommu_map() interface will handle this by setting SNP bit.
4307 if (!domain
->use_first_level
) {
4308 domain
->set_pte_snp
= true;
4312 list_for_each_entry(info
, &domain
->devices
, link
)
4313 intel_pasid_setup_page_snoop_control(info
->iommu
, info
->dev
,
4317 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain
*domain
)
4319 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4320 unsigned long flags
;
4322 if (dmar_domain
->force_snooping
)
4325 spin_lock_irqsave(&dmar_domain
->lock
, flags
);
4326 if (!domain_support_force_snooping(dmar_domain
)) {
4327 spin_unlock_irqrestore(&dmar_domain
->lock
, flags
);
4331 domain_set_force_snooping(dmar_domain
);
4332 dmar_domain
->force_snooping
= true;
4333 spin_unlock_irqrestore(&dmar_domain
->lock
, flags
);
4338 static bool intel_iommu_capable(struct device
*dev
, enum iommu_cap cap
)
4340 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4343 case IOMMU_CAP_CACHE_COHERENCY
:
4344 case IOMMU_CAP_DEFERRED_FLUSH
:
4346 case IOMMU_CAP_PRE_BOOT_PROTECTION
:
4347 return dmar_platform_optin();
4348 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY
:
4349 return ecap_sc_support(info
->iommu
->ecap
);
4355 static struct iommu_device
*intel_iommu_probe_device(struct device
*dev
)
4357 struct pci_dev
*pdev
= dev_is_pci(dev
) ? to_pci_dev(dev
) : NULL
;
4358 struct device_domain_info
*info
;
4359 struct intel_iommu
*iommu
;
4363 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
4364 if (!iommu
|| !iommu
->iommu
.ops
)
4365 return ERR_PTR(-ENODEV
);
4367 info
= kzalloc(sizeof(*info
), GFP_KERNEL
);
4369 return ERR_PTR(-ENOMEM
);
4371 if (dev_is_real_dma_subdevice(dev
)) {
4372 info
->bus
= pdev
->bus
->number
;
4373 info
->devfn
= pdev
->devfn
;
4374 info
->segment
= pci_domain_nr(pdev
->bus
);
4377 info
->devfn
= devfn
;
4378 info
->segment
= iommu
->segment
;
4382 info
->iommu
= iommu
;
4383 if (dev_is_pci(dev
)) {
4384 if (ecap_dev_iotlb_support(iommu
->ecap
) &&
4385 pci_ats_supported(pdev
) &&
4386 dmar_ats_supported(pdev
, iommu
)) {
4387 info
->ats_supported
= 1;
4388 info
->dtlb_extra_inval
= dev_needs_extra_dtlb_flush(pdev
);
4391 * For IOMMU that supports device IOTLB throttling
4392 * (DIT), we assign PFSID to the invalidation desc
4393 * of a VF such that IOMMU HW can gauge queue depth
4394 * at PF level. If DIT is not set, PFSID will be
4395 * treated as reserved, which should be set to 0.
4397 if (ecap_dit(iommu
->ecap
))
4398 info
->pfsid
= pci_dev_id(pci_physfn(pdev
));
4399 info
->ats_qdep
= pci_ats_queue_depth(pdev
);
4401 if (sm_supported(iommu
)) {
4402 if (pasid_supported(iommu
)) {
4403 int features
= pci_pasid_features(pdev
);
4406 info
->pasid_supported
= features
| 1;
4409 if (info
->ats_supported
&& ecap_prs(iommu
->ecap
) &&
4410 pci_pri_supported(pdev
))
4411 info
->pri_supported
= 1;
4415 dev_iommu_priv_set(dev
, info
);
4417 if (sm_supported(iommu
) && !dev_is_real_dma_subdevice(dev
)) {
4418 ret
= intel_pasid_alloc_table(dev
);
4420 dev_err(dev
, "PASID table allocation failed\n");
4421 dev_iommu_priv_set(dev
, NULL
);
4423 return ERR_PTR(ret
);
4427 return &iommu
->iommu
;
4430 static void intel_iommu_release_device(struct device
*dev
)
4432 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4434 dmar_remove_one_dev_info(dev
);
4435 intel_pasid_free_table(dev
);
4436 dev_iommu_priv_set(dev
, NULL
);
4438 set_dma_ops(dev
, NULL
);
4441 static void intel_iommu_probe_finalize(struct device
*dev
)
4443 set_dma_ops(dev
, NULL
);
4444 iommu_setup_dma_ops(dev
, 0, U64_MAX
);
4447 static void intel_iommu_get_resv_regions(struct device
*device
,
4448 struct list_head
*head
)
4450 int prot
= DMA_PTE_READ
| DMA_PTE_WRITE
;
4451 struct iommu_resv_region
*reg
;
4452 struct dmar_rmrr_unit
*rmrr
;
4453 struct device
*i_dev
;
4457 for_each_rmrr_units(rmrr
) {
4458 for_each_active_dev_scope(rmrr
->devices
, rmrr
->devices_cnt
,
4460 struct iommu_resv_region
*resv
;
4461 enum iommu_resv_type type
;
4464 if (i_dev
!= device
&&
4465 !is_downstream_to_pci_bridge(device
, i_dev
))
4468 length
= rmrr
->end_address
- rmrr
->base_address
+ 1;
4470 type
= device_rmrr_is_relaxable(device
) ?
4471 IOMMU_RESV_DIRECT_RELAXABLE
: IOMMU_RESV_DIRECT
;
4473 resv
= iommu_alloc_resv_region(rmrr
->base_address
,
4479 list_add_tail(&resv
->list
, head
);
4484 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4485 if (dev_is_pci(device
)) {
4486 struct pci_dev
*pdev
= to_pci_dev(device
);
4488 if ((pdev
->class >> 8) == PCI_CLASS_BRIDGE_ISA
) {
4489 reg
= iommu_alloc_resv_region(0, 1UL << 24, prot
,
4490 IOMMU_RESV_DIRECT_RELAXABLE
,
4493 list_add_tail(®
->list
, head
);
4496 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4498 reg
= iommu_alloc_resv_region(IOAPIC_RANGE_START
,
4499 IOAPIC_RANGE_END
- IOAPIC_RANGE_START
+ 1,
4500 0, IOMMU_RESV_MSI
, GFP_KERNEL
);
4503 list_add_tail(®
->list
, head
);
/* device_group callback: PCI devices use alias-aware grouping. */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
4513 static int intel_iommu_enable_sva(struct device
*dev
)
4515 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4516 struct intel_iommu
*iommu
;
4518 if (!info
|| dmar_disabled
)
4521 iommu
= info
->iommu
;
4525 if (!(iommu
->flags
& VTD_FLAG_SVM_CAPABLE
))
4528 if (!info
->pasid_enabled
|| !info
->ats_enabled
)
4532 * Devices having device-specific I/O fault handling should not
4533 * support PCI/PRI. The IOMMU side has no means to check the
4534 * capability of device-specific IOPF. Therefore, IOMMU can only
4535 * default that if the device driver enables SVA on a non-PRI
4536 * device, it will handle IOPF in its own way.
4538 if (!info
->pri_supported
)
4541 /* Devices supporting PRI should have it enabled. */
4542 if (!info
->pri_enabled
)
4548 static int intel_iommu_enable_iopf(struct device
*dev
)
4550 struct pci_dev
*pdev
= dev_is_pci(dev
) ? to_pci_dev(dev
) : NULL
;
4551 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4552 struct intel_iommu
*iommu
;
4555 if (!pdev
|| !info
|| !info
->ats_enabled
|| !info
->pri_supported
)
4558 if (info
->pri_enabled
)
4561 iommu
= info
->iommu
;
4565 /* PASID is required in PRG Response Message. */
4566 if (info
->pasid_enabled
&& !pci_prg_resp_pasid_required(pdev
))
4569 ret
= pci_reset_pri(pdev
);
4573 ret
= iopf_queue_add_device(iommu
->iopf_queue
, dev
);
4577 ret
= iommu_register_device_fault_handler(dev
, iommu_queue_iopf
, dev
);
4579 goto iopf_remove_device
;
4581 ret
= pci_enable_pri(pdev
, PRQ_DEPTH
);
4583 goto iopf_unregister_handler
;
4584 info
->pri_enabled
= 1;
4588 iopf_unregister_handler
:
4589 iommu_unregister_device_fault_handler(dev
);
4591 iopf_queue_remove_device(iommu
->iopf_queue
, dev
);
4596 static int intel_iommu_disable_iopf(struct device
*dev
)
4598 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4599 struct intel_iommu
*iommu
= info
->iommu
;
4601 if (!info
->pri_enabled
)
4605 * PCIe spec states that by clearing PRI enable bit, the Page
4606 * Request Interface will not issue new page requests, but has
4607 * outstanding page requests that have been transmitted or are
4608 * queued for transmission. This is supposed to be called after
4609 * the device driver has stopped DMA, all PASIDs have been
4610 * unbound and the outstanding PRQs have been drained.
4612 pci_disable_pri(to_pci_dev(dev
));
4613 info
->pri_enabled
= 0;
4616 * With PRI disabled and outstanding PRQs drained, unregistering
4617 * fault handler and removing device from iopf queue should never
4620 WARN_ON(iommu_unregister_device_fault_handler(dev
));
4621 WARN_ON(iopf_queue_remove_device(iommu
->iopf_queue
, dev
));
4627 intel_iommu_dev_enable_feat(struct device
*dev
, enum iommu_dev_features feat
)
4630 case IOMMU_DEV_FEAT_IOPF
:
4631 return intel_iommu_enable_iopf(dev
);
4633 case IOMMU_DEV_FEAT_SVA
:
4634 return intel_iommu_enable_sva(dev
);
4642 intel_iommu_dev_disable_feat(struct device
*dev
, enum iommu_dev_features feat
)
4645 case IOMMU_DEV_FEAT_IOPF
:
4646 return intel_iommu_disable_iopf(dev
);
4648 case IOMMU_DEV_FEAT_SVA
:
4656 static bool intel_iommu_is_attach_deferred(struct device
*dev
)
4658 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4660 return translation_pre_enabled(info
->iommu
) && !info
->domain
;
4664 * Check that the device does not live on an external facing PCI port that is
4665 * marked as untrusted. Such devices should not be able to apply quirks and
4666 * thus not be able to bypass the IOMMU restrictions.
4668 static bool risky_device(struct pci_dev
*pdev
)
4670 if (pdev
->untrusted
) {
4672 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4673 pdev
->vendor
, pdev
->device
);
4674 pci_info(pdev
, "Please check with your BIOS/Platform vendor about this\n");
4680 static void intel_iommu_iotlb_sync_map(struct iommu_domain
*domain
,
4681 unsigned long iova
, size_t size
)
4683 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4684 unsigned long pages
= aligned_nrpages(iova
, size
);
4685 unsigned long pfn
= iova
>> VTD_PAGE_SHIFT
;
4686 struct iommu_domain_info
*info
;
4689 xa_for_each(&dmar_domain
->iommu_array
, i
, info
)
4690 __mapping_notify_one(info
->iommu
, dmar_domain
, pfn
, pages
);
4693 static void intel_iommu_remove_dev_pasid(struct device
*dev
, ioasid_t pasid
)
4695 struct intel_iommu
*iommu
= device_to_iommu(dev
, NULL
, NULL
);
4696 struct dev_pasid_info
*curr
, *dev_pasid
= NULL
;
4697 struct dmar_domain
*dmar_domain
;
4698 struct iommu_domain
*domain
;
4699 unsigned long flags
;
4701 domain
= iommu_get_domain_for_dev_pasid(dev
, pasid
, 0);
4702 if (WARN_ON_ONCE(!domain
))
4706 * The SVA implementation needs to handle its own stuffs like the mm
4707 * notification. Before consolidating that code into iommu core, let
4708 * the intel sva code handle it.
4710 if (domain
->type
== IOMMU_DOMAIN_SVA
) {
4711 intel_svm_remove_dev_pasid(dev
, pasid
);
4715 dmar_domain
= to_dmar_domain(domain
);
4716 spin_lock_irqsave(&dmar_domain
->lock
, flags
);
4717 list_for_each_entry(curr
, &dmar_domain
->dev_pasids
, link_domain
) {
4718 if (curr
->dev
== dev
&& curr
->pasid
== pasid
) {
4719 list_del(&curr
->link_domain
);
4724 WARN_ON_ONCE(!dev_pasid
);
4725 spin_unlock_irqrestore(&dmar_domain
->lock
, flags
);
4727 domain_detach_iommu(dmar_domain
, iommu
);
4730 intel_pasid_tear_down_entry(iommu
, dev
, pasid
, false);
4731 intel_drain_pasid_prq(dev
, pasid
);
4734 static int intel_iommu_set_dev_pasid(struct iommu_domain
*domain
,
4735 struct device
*dev
, ioasid_t pasid
)
4737 struct device_domain_info
*info
= dev_iommu_priv_get(dev
);
4738 struct dmar_domain
*dmar_domain
= to_dmar_domain(domain
);
4739 struct intel_iommu
*iommu
= info
->iommu
;
4740 struct dev_pasid_info
*dev_pasid
;
4741 unsigned long flags
;
4744 if (!pasid_supported(iommu
) || dev_is_real_dma_subdevice(dev
))
4747 if (context_copied(iommu
, info
->bus
, info
->devfn
))
4750 ret
= prepare_domain_attach_device(domain
, dev
);
4754 dev_pasid
= kzalloc(sizeof(*dev_pasid
), GFP_KERNEL
);
4758 ret
= domain_attach_iommu(dmar_domain
, iommu
);
4762 if (domain_type_is_si(dmar_domain
))
4763 ret
= intel_pasid_setup_pass_through(iommu
, dmar_domain
,
4765 else if (dmar_domain
->use_first_level
)
4766 ret
= domain_setup_first_level(iommu
, dmar_domain
,
4769 ret
= intel_pasid_setup_second_level(iommu
, dmar_domain
,
4772 goto out_detach_iommu
;
4774 dev_pasid
->dev
= dev
;
4775 dev_pasid
->pasid
= pasid
;
4776 spin_lock_irqsave(&dmar_domain
->lock
, flags
);
4777 list_add(&dev_pasid
->link_domain
, &dmar_domain
->dev_pasids
);
4778 spin_unlock_irqrestore(&dmar_domain
->lock
, flags
);
4782 domain_detach_iommu(dmar_domain
, iommu
);
4788 const struct iommu_ops intel_iommu_ops
= {
4789 .capable
= intel_iommu_capable
,
4790 .domain_alloc
= intel_iommu_domain_alloc
,
4791 .probe_device
= intel_iommu_probe_device
,
4792 .probe_finalize
= intel_iommu_probe_finalize
,
4793 .release_device
= intel_iommu_release_device
,
4794 .get_resv_regions
= intel_iommu_get_resv_regions
,
4795 .device_group
= intel_iommu_device_group
,
4796 .dev_enable_feat
= intel_iommu_dev_enable_feat
,
4797 .dev_disable_feat
= intel_iommu_dev_disable_feat
,
4798 .is_attach_deferred
= intel_iommu_is_attach_deferred
,
4799 .def_domain_type
= device_def_domain_type
,
4800 .remove_dev_pasid
= intel_iommu_remove_dev_pasid
,
4801 .pgsize_bitmap
= SZ_4K
,
4802 #ifdef CONFIG_INTEL_IOMMU_SVM
4803 .page_response
= intel_svm_page_response
,
4805 .default_domain_ops
= &(const struct iommu_domain_ops
) {
4806 .attach_dev
= intel_iommu_attach_device
,
4807 .set_dev_pasid
= intel_iommu_set_dev_pasid
,
4808 .map_pages
= intel_iommu_map_pages
,
4809 .unmap_pages
= intel_iommu_unmap_pages
,
4810 .iotlb_sync_map
= intel_iommu_iotlb_sync_map
,
4811 .flush_iotlb_all
= intel_flush_iotlb_all
,
4812 .iotlb_sync
= intel_iommu_tlb_sync
,
4813 .iova_to_phys
= intel_iommu_iova_to_phys
,
4814 .free
= intel_iommu_domain_free
,
4815 .enforce_cache_coherency
= intel_iommu_enforce_cache_coherency
,
4819 static void quirk_iommu_igfx(struct pci_dev
*dev
)
4821 if (risky_device(dev
))
4824 pci_info(dev
, "Disabling IOMMU for graphics on this chipset\n");
4828 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2a40, quirk_iommu_igfx
);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e00, quirk_iommu_igfx
);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e10, quirk_iommu_igfx
);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e20, quirk_iommu_igfx
);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e30, quirk_iommu_igfx
);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e40, quirk_iommu_igfx
);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e90, quirk_iommu_igfx
);
4837 /* Broadwell igfx malfunctions with dmar */
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1606, quirk_iommu_igfx
);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x160B, quirk_iommu_igfx
);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x160E, quirk_iommu_igfx
);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1602, quirk_iommu_igfx
);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x160A, quirk_iommu_igfx
);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x160D, quirk_iommu_igfx
);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1616, quirk_iommu_igfx
);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x161B, quirk_iommu_igfx
);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x161E, quirk_iommu_igfx
);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1612, quirk_iommu_igfx
);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x161A, quirk_iommu_igfx
);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x161D, quirk_iommu_igfx
);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1626, quirk_iommu_igfx
);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x162B, quirk_iommu_igfx
);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x162E, quirk_iommu_igfx
);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1622, quirk_iommu_igfx
);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x162A, quirk_iommu_igfx
);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x162D, quirk_iommu_igfx
);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1636, quirk_iommu_igfx
);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x163B, quirk_iommu_igfx
);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x163E, quirk_iommu_igfx
);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x1632, quirk_iommu_igfx
);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x163A, quirk_iommu_igfx
);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x163D, quirk_iommu_igfx
);
4863 static void quirk_iommu_rwbf(struct pci_dev
*dev
)
4865 if (risky_device(dev
))
4869 * Mobile 4 Series Chipset neglects to set RWBF capability,
4870 * but needs it. Same seems to hold for the desktop versions.
4872 pci_info(dev
, "Forcing write-buffer flush capability\n");
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2a40, quirk_iommu_rwbf
);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e00, quirk_iommu_rwbf
);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e10, quirk_iommu_rwbf
);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e20, quirk_iommu_rwbf
);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e30, quirk_iommu_rwbf
);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e40, quirk_iommu_rwbf
);
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x2e90, quirk_iommu_rwbf
);
4885 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4886 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4887 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4888 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4889 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4890 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4891 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4892 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4894 static void quirk_calpella_no_shadow_gtt(struct pci_dev
*dev
)
4898 if (risky_device(dev
))
4901 if (pci_read_config_word(dev
, GGC
, &ggc
))
4904 if (!(ggc
& GGC_MEMORY_VT_ENABLED
)) {
4905 pci_info(dev
, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4907 } else if (dmar_map_gfx
) {
4908 /* we have to ensure the gfx device is idle before we flush */
4909 pci_info(dev
, "Disabling batched IOTLB flush on Ironlake\n");
4910 iommu_set_dma_strict();
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x0040, quirk_calpella_no_shadow_gtt
);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x0044, quirk_calpella_no_shadow_gtt
);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x0062, quirk_calpella_no_shadow_gtt
);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, 0x006a, quirk_calpella_no_shadow_gtt
);
4918 static void quirk_igfx_skip_te_disable(struct pci_dev
*dev
)
4922 if (!IS_GFX_DEVICE(dev
))
4925 ver
= (dev
->device
>> 8) & 0xff;
4926 if (ver
!= 0x45 && ver
!= 0x46 && ver
!= 0x4c &&
4927 ver
!= 0x4e && ver
!= 0x8a && ver
!= 0x98 &&
4928 ver
!= 0x9a && ver
!= 0xa7)
4931 if (risky_device(dev
))
4934 pci_info(dev
, "Skip IOMMU disabling for graphics\n");
4935 iommu_skip_te_disable
= 1;
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL
, PCI_ANY_ID
, quirk_igfx_skip_te_disable
);
4939 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4940 ISOCH DMAR unit for the Azalia sound device, but not give it any
4941 TLB entries, which causes it to deadlock. Check for that. We do
4942 this in a function called from init_dmars(), instead of in a PCI
4943 quirk, because we don't want to print the obnoxious "BIOS broken"
4944 message if VT-d is actually disabled.
4946 static void __init
check_tylersburg_isoch(void)
4948 struct pci_dev
*pdev
;
4949 uint32_t vtisochctrl
;
4951 /* If there's no Azalia in the system anyway, forget it. */
4952 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, 0x3a3e, NULL
);
4956 if (risky_device(pdev
)) {
4963 /* System Management Registers. Might be hidden, in which case
4964 we can't do the sanity check. But that's OK, because the
4965 known-broken BIOSes _don't_ actually hide it, so far. */
4966 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, 0x342e, NULL
);
4970 if (risky_device(pdev
)) {
4975 if (pci_read_config_dword(pdev
, 0x188, &vtisochctrl
)) {
4982 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4983 if (vtisochctrl
& 1)
4986 /* Drop all bits other than the number of TLB entries */
4987 vtisochctrl
&= 0x1c;
4989 /* If we have the recommended number of TLB entries (16), fine. */
4990 if (vtisochctrl
== 0x10)
4993 /* Zero TLB entries? You get to ride the short bus to school. */
4995 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4996 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4997 dmi_get_system_info(DMI_BIOS_VENDOR
),
4998 dmi_get_system_info(DMI_BIOS_VERSION
),
4999 dmi_get_system_info(DMI_PRODUCT_VERSION
));
5000 iommu_identity_mapping
|= IDENTMAP_AZALIA
;
5004 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5009 * Here we deal with a device TLB defect where device may inadvertently issue ATS
5010 * invalidation completion before posted writes initiated with translated address
5011 * that utilized translations matching the invalidation address range, violating
5012 * the invalidation completion ordering.
5013 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5014 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5015 * under the control of the trusted/privileged host device driver must use this
5017 * Device TLBs are invalidated under the following six conditions:
5018 * 1. Device driver does DMA API unmap IOVA
5019 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5020 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5021 * exit_mmap() due to crash
5022 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5023 * VM has to free pages that were unmapped
5024 * 5. Userspace driver unmaps a DMA buffer
5025 * 6. Cache invalidation in vSVA usage (upcoming)
5027 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5028 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5029 * invalidate TLB the same way as normal user unmap which will use this quirk.
5030 * The dTLB invalidation after PASID cache flush does not need this quirk.
5032 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5034 void quirk_extra_dev_tlb_flush(struct device_domain_info
*info
,
5035 unsigned long address
, unsigned long mask
,
5036 u32 pasid
, u16 qdep
)
5040 if (likely(!info
->dtlb_extra_inval
))
5043 sid
= PCI_DEVID(info
->bus
, info
->devfn
);
5044 if (pasid
== IOMMU_NO_PASID
) {
5045 qi_flush_dev_iotlb(info
->iommu
, sid
, info
->pfsid
,
5046 qdep
, address
, mask
);
5048 qi_flush_dev_iotlb_pasid(info
->iommu
, sid
, info
->pfsid
,
5049 pasid
, qdep
, address
, mask
);
5053 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
5067 int ecmd_submit_sync(struct intel_iommu
*iommu
, u8 ecmd
, u64 oa
, u64 ob
)
5069 unsigned long flags
;
5073 if (!cap_ecmds(iommu
->cap
))
5076 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
5078 res
= dmar_readq(iommu
->reg
+ DMAR_ECRSP_REG
);
5079 if (res
& DMA_ECMD_ECRSP_IP
) {
5085 * Unconditionally write the operand B, because
5086 * - There is no side effect if an ecmd doesn't require an
5087 * operand B, but we set the register to some value.
5088 * - It's not invoked in any critical path. The extra MMIO
5089 * write doesn't bring any performance concerns.
5091 dmar_writeq(iommu
->reg
+ DMAR_ECEO_REG
, ob
);
5092 dmar_writeq(iommu
->reg
+ DMAR_ECMD_REG
, ecmd
| (oa
<< DMA_ECMD_OA_SHIFT
));
5094 IOMMU_WAIT_OP(iommu
, DMAR_ECRSP_REG
, dmar_readq
,
5095 !(res
& DMA_ECMD_ECRSP_IP
), res
);
5097 if (res
& DMA_ECMD_ECRSP_IP
) {
5102 ret
= ecmd_get_status_code(res
);
5104 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);