From: Jason Gunthorpe Date: Thu, 23 Oct 2025 18:22:33 +0000 (-0300) Subject: iommupt: Add the Intel VT-d second stage page table format X-Git-Tag: v6.19-rc1~133^2^8~36 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5448c1558f60d4051c90938f2878c6fb20e2982a;p=thirdparty%2Fkernel%2Flinux.git iommupt: Add the Intel VT-d second stage page table format The VT-d second stage format is almost the same as the x86 PAE format, except the bit encodings in the PTE are different and a few new PTE features, like force coherency are present. Among all the formats it is unique in not having a designated present bit. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 50,64 , 21.21 2^21, 59,70 , 56,67 , 16.16 2^30, 54,66 , 52,63 , 17.17 256*2^12, 384,524 , 337,516 , 34.34 256*2^21, 387,632 , 336,626 , 46.46 256*2^30, 376,629 , 323,623 , 48.48 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 67,86 , 63,84 , 25.25 2^21, 64,84 , 59,80 , 26.26 2^30, 59,78 , 56,74 , 24.24 256*2^12, 216,335 , 198,317 , 37.37 256*2^21, 245,350 , 232,344 , 32.32 256*2^30, 248,345 , 226,339 , 33.33 Cc: Tina Zhang Cc: Kevin Tian Cc: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index 2016c5e5ac0fe..52ac9e661ffd2 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index 6dcb771b3c582..79f65268f3121 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1 Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_X86_64 tristate "IOMMU page table for x86 64-bit, 4/5 levels" depends on !GENERIC_ATOMIC64 # for cmpxchg64 @@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 5a3379107999f..976b49ec97dca 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 IOMMU_PT_KUNIT_TEST := diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 0000000000000..4a239bcaae2a9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 0000000000000..f551711e2a336 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 0000000000000..d9774848eb6f7 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stange 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification". + * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + if (__builtin_constant_p(bitnr)) + BUILD_BUG(); + else + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. + */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return -EINVAL; + + if (iommu_prot & IOMMU_READ) + pte |= VTDSS_FMT_R; + if (iommu_prot & IOMMU_WRITE) + pte |= VTDSS_FMT_W; + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) + pte |= VTDSS_FMT_SNP; + + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && + !(iommu_prot & IOMMU_WRITE)) { + pr_err_ratelimited( + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot vtdss_pt_iommu_set_prot + +static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, + const struct pt_iommu_vtdss_cfg *cfg) +{ + struct pt_vtdss *table = &iommu_table->vtdss_pt; + unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2; + + if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2) + return -EOPNOTSUPP; + else if (vasz_lg2 > 48) + pt_top_set_level(&table->common, 4); + else if (vasz_lg2 > 39) + pt_top_set_level(&table->common, 3); + else if (vasz_lg2 > 30) + pt_top_set_level(&table->common, 2); + else + return -EOPNOTSUPP; + return 0; +} +#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init + +static inline void +vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, + const struct pt_range *top_range, + struct pt_iommu_vtdss_hw_info *info) +{ + info->ssptptr = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); + /* + * top_level = 2 = 3 level table aw=1 + * top_level = 3 = 4 level table aw=2 + * top_level = 4 = 5 level table aw=3 + */ + info->aw = top_range->top_level - 1; +} +#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { + [0] = { .common.hw_max_vasz_lg2 = 39 }, + [1] = { .common.hw_max_vasz_lg2 = 48 }, + [2] = { .common.hw_max_vasz_lg2 = 57 }, +}; +#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 883069e329527..6a9a1acb5aad3 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -157,6 +157,24 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_vtdss { + struct pt_common common; +}; + +enum { + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, + /* + * Prevent creating read-only PTEs. Used to work around HW errata + * ERRATA_772415_SPR17. + */ + PT_FEAT_VTDSS_FORCE_WRITEABLE, +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 21132e342a791..cfe05a77f86b0 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_vtdss_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_vtdss_hw_info { + u64 ssptptr; + u8 aw; +}; + +IOMMU_FORMAT(vtdss, vtdss_pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; };