Merge branch 'linus' into x86/mm to pick up fixes and to fix conflicts
author Ingo Molnar <mingo@kernel.org>
Sat, 26 Aug 2017 07:19:13 +0000 (09:19 +0200)
committer Ingo Molnar <mingo@kernel.org>
Sat, 26 Aug 2017 07:19:13 +0000 (09:19 +0200)
Conflicts:
arch/x86/kernel/head64.c
arch/x86/mm/mmap.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
119 files changed:
Documentation/admin-guide/kernel-parameters.txt
Documentation/x86/amd-memory-encryption.txt [new file with mode: 0644]
Documentation/x86/protection-keys.txt
Documentation/x86/x86_64/5level-paging.txt [new file with mode: 0644]
arch/ia64/include/asm/acpi.h
arch/ia64/kernel/efi.c
arch/x86/Kconfig
arch/x86/boot/compressed/kaslr.c
arch/x86/boot/compressed/pagetable.c
arch/x86/include/asm/acpi.h
arch/x86/include/asm/cmdline.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/disabled-features.h
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/dmi.h
arch/x86/include/asm/e820/api.h
arch/x86/include/asm/elf.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/init.h
arch/x86/include/asm/io.h
arch/x86/include/asm/kexec.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/mem_encrypt.h [new file with mode: 0644]
arch/x86/include/asm/mmu.h
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/mpx.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/page_64.h
arch/x86/include/asm/page_types.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/realmode.h
arch/x86/include/asm/set_memory.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/vga.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/scattered.c
arch/x86/kernel/e820.c
arch/x86/kernel/espfix_64.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/kdebugfs.c
arch/x86/kernel/ksysfs.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/pci-nommu.c
arch/x86/kernel/pci-swiotlb.c
arch/x86/kernel/process.c
arch/x86/kernel/relocate_kernel_64.S
arch/x86/kernel/setup.c
arch/x86/kernel/sys_x86_64.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/lib/cmdline.c
arch/x86/mm/Makefile
arch/x86/mm/dump_pagetables.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/ident_map.c
arch/x86/mm/init.c
arch/x86/mm/ioremap.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/mem_encrypt.c [new file with mode: 0644]
arch/x86/mm/mem_encrypt_boot.S [new file with mode: 0644]
arch/x86/mm/mmap.c
arch/x86/mm/mpx.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/tlb.c
arch/x86/pci/common.c
arch/x86/platform/efi/efi.c
arch/x86/platform/efi/efi_64.c
arch/x86/realmode/init.c
arch/x86/realmode/rm/trampoline_64.S
arch/x86/xen/Kconfig
arch/x86/xen/enlighten_pv.c
arch/x86/xen/mmu_pv.c
arch/x86/xen/xen-head.S
drivers/acpi/processor_idle.c
drivers/firmware/dmi-sysfs.c
drivers/firmware/efi/efi.c
drivers/firmware/pcdp.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/drm_vm.c
drivers/gpu/drm/ttm/ttm_bo_vm.c
drivers/gpu/drm/udl/udl_fb.c
drivers/idle/intel_idle.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_proto.h
drivers/iommu/amd_iommu_types.h
drivers/sfi/sfi_core.c
drivers/video/fbdev/core/fbmem.c
include/asm-generic/early_ioremap.h
include/asm-generic/pgtable.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/dma-mapping.h
include/linux/efi.h
include/linux/io.h
include/linux/kexec.h
include/linux/mem_encrypt.h [new file with mode: 0644]
include/linux/mm_inline.h
include/linux/swiotlb.h
init/main.c
kernel/kexec_core.c
kernel/memremap.c
lib/swiotlb.c
mm/early_ioremap.c
mm/memory-failure.c

index d9c171ce4190845950e7c14e362265b4d26adc74..372cc66bba23286c485abd8aa178abe2f3119fe7 100644 (file)
                        memory contents and reserves bad memory
                        regions that are detected.
 
+       mem_encrypt=    [X86-64] AMD Secure Memory Encryption (SME) control
+                       Valid arguments: on, off
+                       Default (depends on kernel configuration option):
+                         on  (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+                         off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+                       mem_encrypt=on:         Activate SME
+                       mem_encrypt=off:        Do not activate SME
+
+                       Refer to Documentation/x86/amd-memory-encryption.txt
+                       for details on when memory encryption can be activated.
+
        mem_sleep_default=      [SUSPEND] Default system suspend mode:
                        s2idle  - Suspend-To-Idle
                        shallow - Power-On Suspend or equivalent (if supported)
        nopat           [X86] Disable PAT (page attribute table extension of
                        pagetables) support.
 
+       nopcid          [X86-64] Disable the PCID cpu feature.
+
        norandmaps      Don't use address space randomization.  Equivalent to
                        echo 0 > /proc/sys/kernel/randomize_va_space
 
diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt
new file mode 100644 (file)
index 0000000..f512ab7
--- /dev/null
@@ -0,0 +1,68 @@
+Secure Memory Encryption (SME) is a feature found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables.  A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM.  SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position).  The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note that having the encryption bit set
+in cr3 does not imply that the full hierarchy is encrypted; each page table
+entry in the hierarchy needs to have the encryption bit set to achieve that.
+So, theoretically, you could have the encryption bit set in cr3 so that the
+PGD is encrypted, but not set the encryption bit in the PGD entry for a PUD,
+which would leave the PUD pointed to by that entry unencrypted.
+
+Support for SME can be determined through the CPUID instruction. The CPUID
+function 0x8000001f reports information related to SME:
+
+       0x8000001f[eax]:
+               Bit[0] indicates support for SME
+       0x8000001f[ebx]:
+               Bits[5:0]  pagetable bit number used to activate memory
+                          encryption
+               Bits[11:6] reduction in physical address space, in bits, when
+                          memory encryption is enabled (this only affects
+                          system physical addresses, not guest physical
+                          addresses)
+
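+For illustration, a minimal user-space sketch (assuming GCC's <cpuid.h> is
+available) that queries this CPUID leaf might look like:
+
+        #include <cpuid.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                unsigned int eax, ebx, ecx, edx;
+
+                /* __get_cpuid() fails if leaf 0x8000001f is unavailable */
+                if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx) ||
+                    !(eax & 1)) {
+                        printf("SME not supported\n");
+                        return 1;
+                }
+
+                /* Bits[5:0] of EBX: position of the encryption bit */
+                printf("SME supported, C-bit is pagetable bit %u\n",
+                       ebx & 0x3f);
+                /* Bits[11:6] of EBX: physical address space reduction */
+                printf("address space reduced by %u bits\n",
+                       (ebx >> 6) & 0x3f);
+                return 0;
+        }
+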
+If support for SME is present, MSR 0xc0010010 (MSR_K8_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption:
+
+       0xc0010010:
+               Bit[23]   0 = memory encryption features are disabled
+                         1 = memory encryption features are enabled
+
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system.  If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
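+For illustration, bit 23 of MSR_K8_SYSCFG can also be read from user space
+through the /dev/cpu/*/msr interface (a minimal sketch, assuming the msr
+driver is loaded and the program runs as root):
+
+        #include <fcntl.h>
+        #include <stdint.h>
+        #include <stdio.h>
+        #include <unistd.h>
+
+        int main(void)
+        {
+                uint64_t syscfg;
+                int fd = open("/dev/cpu/0/msr", O_RDONLY);
+
+                /* the MSR address (0xc0010010) is the pread() offset */
+                if (fd < 0 ||
+                    pread(fd, &syscfg, sizeof(syscfg), 0xc0010010) != 8)
+                        return 1;
+
+                printf("memory encryption features %s\n",
+                       (syscfg & (1ULL << 23)) ? "enabled" : "disabled");
+                close(fd);
+                return 0;
+        }
+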
+The state of SME in the Linux kernel can be described as follows:
+       - Supported:
+         The CPU supports SME (determined through CPUID instruction).
+
+       - Enabled:
+         Supported and bit 23 of MSR_K8_SYSCFG is set.
+
+       - Active:
+         Supported, Enabled and the Linux kernel is actively applying
+         the encryption bit to page table entries (the SME mask in the
+         kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support.  If the BIOS
+merely enables SME (sets bit 23 of MSR_K8_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line.  However, if the BIOS
+does not enable SME, then Linux will not be able to activate memory
+encryption, even if it is configured to do so by default or mem_encrypt=on is
+specified on the kernel command line.
index b643045408218669de1af81b9a9a661c035ffc70..fa46dcb347bc1d2ac60901c4621bd3bad81de601 100644 (file)
@@ -34,7 +34,7 @@ with a key.  In this example WRPKRU is wrapped by a C function
 called pkey_set().
 
        int real_prot = PROT_READ|PROT_WRITE;
-       pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+       pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
        ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
        ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
        ... application runs here
@@ -42,9 +42,9 @@ called pkey_set().
 Now, if the application needs to update the data at 'ptr', it can
 gain access, do the update, then remove its write access:
 
-       pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+       pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
        *ptr = foo; // assign something
-       pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+       pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
 
 Now when it frees the memory, it will also free the pkey since it
 is no longer in use:
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
new file mode 100644 (file)
index 0000000..087251a
--- /dev/null
@@ -0,0 +1,64 @@
+== Overview ==
+
+The original x86-64 architecture was limited by 4-level paging to 256 TiB of
+virtual address space and 64 TiB of physical address space. We are already
+bumping into this limit: some vendors offer servers with 64 TiB of memory
+today.
+
+To overcome the limitation, upcoming hardware will introduce support for
+5-level paging. It is a straightforward extension of the current page table
+structure, adding one more layer of translation.
+
+It bumps the limits to 128 PiB of virtual address space and 4 PiB of
+physical address space. This "ought to be enough for anybody" ©.
+
+QEMU 2.9 and later support 5-level paging.
+
+Virtual memory layout for 5-level paging is described in
+Documentation/x86/x86_64/mm.txt
+
+== Enabling 5-level paging ==
+
+CONFIG_X86_5LEVEL=y enables the feature.
+
+So far, a kernel compiled with the option enabled will only be able to boot
+on machines that support the feature -- look for the 'la57' flag in
+/proc/cpuinfo.
+
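+For illustration, the same capability can be checked programmatically: CPUID
+leaf 7, sub-leaf 0 reports LA57 in ECX bit 16 (a minimal sketch, assuming
+GCC's <cpuid.h>):
+
+        #include <cpuid.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                unsigned int eax, ebx, ecx, edx;
+
+                /* leaf 7, sub-leaf 0: structured extended feature flags */
+                if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
+                    (ecx & (1U << 16)))
+                        printf("la57 supported\n");
+                else
+                        printf("la57 not supported\n");
+                return 0;
+        }
+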
+The plan is to implement boot-time switching between 4- and 5-level paging
+in the future.
+
+== User-space and large virtual address space ==
+
+On x86, 5-level paging enables 56-bit userspace virtual address space.
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use the higher bits of pointers to encode their
+own information. With 5-level paging this collides with valid pointers and
+leads to crashes.
+
+To mitigate this, we are not going to allocate virtual address space above
+47 bits by default.
+
+But userspace can ask for an allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47 bits.
+
+If the hint address is set above 47 bits but MAP_FIXED is not specified, we
+try to look for an unmapped area at the specified address. If that area is
+already occupied, we look for an unmapped area in the *full* address space,
+rather than within the 47-bit window.
+
+A high hint address would only affect the allocation in question, but not
+any future mmap()s.
+
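+For illustration, a minimal sketch of requesting a mapping above the 47-bit
+boundary might look like:
+
+        #include <stdio.h>
+        #include <sys/mman.h>
+
+        int main(void)
+        {
+                /* hint above 47 bits; the kernel may then place the
+                 * mapping anywhere in the full address space */
+                void *hint = (void *)(1UL << 48);
+                void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+                if (p != MAP_FAILED)
+                        printf("mapped at %p\n", p);
+                return 0;
+        }
+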
+Specifying a high hint address on an older kernel or on a machine without
+5-level paging support is safe. The hint will be ignored and the kernel will
+fall back to allocating from the 47-bit address space.
+
+This approach makes it easy for an application's memory allocator to take
+advantage of the large address space without manually tracking the allocated
+virtual address space.
+
+One important case we need to handle here is the interaction with MPX.
+MPX (without the MAWA extension) cannot handle addresses above 47 bits, so we
+need to make sure that MPX cannot be enabled if we already have a VMA above
+the boundary, and forbid creating such VMAs once MPX is enabled.
+
index a3d0211970e95e5152edbaeeecd8ab3a42041ac6..c86a947f5368633b86d1a8e5dc7ddd1803982c64 100644 (file)
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
        buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
-#define acpi_unlazy_tlb(x)
-
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)  \
index 121295637d0df831fbbf919983ee9a0cd1a7d366..81416000c5e07c18f16032bb0b9c9f77abc8b883 100644 (file)
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
        return 0;
 }
 
-u32
+int
 efi_mem_type (unsigned long phys_addr)
 {
        efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
 
        if (md)
                return md->type;
-       return 0;
+       return -EINVAL;
 }
 
 u64
index 323cb065be5eda120b44dac79618a13301ece231..e4844e934728c58f054eefaf1375e6b018dc982a 100644 (file)
@@ -327,6 +327,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
        int
+       default 5 if X86_5LEVEL
        default 4 if X86_64
        default 3 if X86_PAE
        default 2
@@ -1399,6 +1400,24 @@ config X86_PAE
          has the cost of more pagetable lookup overhead, and also
          consumes more pagetable space per process.
 
+config X86_5LEVEL
+       bool "Enable 5-level page tables support"
+       depends on X86_64
+       ---help---
+         5-level paging enables access to a larger address space:
+         up to 128 PiB of virtual address space and 4 PiB of
+         physical address space.
+
+         It will be supported by future Intel CPUs.
+
+         Note: a kernel with this option enabled can only be booted
+         on machines that support the feature.
+
+         See Documentation/x86/x86_64/5level-paging.txt for more
+         information.
+
+         Say N if unsure.
+
 config ARCH_PHYS_ADDR_T_64BIT
        def_bool y
        depends on X86_64 || X86_PAE
@@ -1416,6 +1435,35 @@ config X86_DIRECT_GBPAGES
          supports them), so don't confuse the user by printing
          that we have them enabled.
 
+config ARCH_HAS_MEM_ENCRYPT
+       def_bool y
+
+config AMD_MEM_ENCRYPT
+       bool "AMD Secure Memory Encryption (SME) support"
+       depends on X86_64 && CPU_SUP_AMD
+       ---help---
+         Say yes to enable support for the encryption of system memory.
+         This requires an AMD processor that supports Secure Memory
+         Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+       bool "Activate AMD Secure Memory Encryption (SME) by default"
+       default y
+       depends on AMD_MEM_ENCRYPT
+       ---help---
+         Say yes to have system memory encrypted by default if running on
+         an AMD processor that supports Secure Memory Encryption (SME).
+
+         If set to Y, then the encryption of system memory can be
+         deactivated with the mem_encrypt=off command line option.
+
+         If set to N, then the encryption of system memory can be
+         activated with the mem_encrypt=on command line option.
+
+config ARCH_USE_MEMREMAP_PROT
+       def_bool y
+       depends on AMD_MEM_ENCRYPT
+
 # Common NUMA Features
 config NUMA
        bool "Numa Memory Allocation and Scheduler Support"
index 91f27ab970ef74347c1915e7ff6ac5f8a8803b8d..99c7194f7ea626379ac649faa7da83fdb95776bf 100644 (file)
@@ -479,35 +479,31 @@ static unsigned long slots_fetch_random(void)
        return 0;
 }
 
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_mem_region(struct mem_vector *entry,
                               unsigned long minimum,
                               unsigned long image_size)
 {
        struct mem_vector region, overlap;
        struct slot_area slot_area;
        unsigned long start_orig, end;
-       struct boot_e820_entry cur_entry;
-
-       /* Skip non-RAM entries. */
-       if (entry->type != E820_TYPE_RAM)
-               return;
+       struct mem_vector cur_entry;
 
        /* On 32-bit, ignore entries entirely above our maximum. */
-       if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+       if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
                return;
 
        /* Ignore entries entirely below our minimum. */
-       if (entry->addr + entry->size < minimum)
+       if (entry->start + entry->size < minimum)
                return;
 
        /* Ignore entries above memory limit */
-       end = min(entry->size + entry->addr, mem_limit);
-       if (entry->addr >= end)
+       end = min(entry->size + entry->start, mem_limit);
+       if (entry->start >= end)
                return;
-       cur_entry.addr = entry->addr;
-       cur_entry.size = end - entry->addr;
+       cur_entry.start = entry->start;
+       cur_entry.size = end - entry->start;
 
-       region.start = cur_entry.addr;
+       region.start = cur_entry.start;
        region.size = cur_entry.size;
 
        /* Give up if slot area array is full. */
@@ -521,8 +517,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
                /* Potentially raise address to meet alignment needs. */
                region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
-               /* Did we raise the address above this e820 region? */
-               if (region.start > cur_entry.addr + cur_entry.size)
+               /* Did we raise the address above the passed in memory entry? */
+               if (region.start > cur_entry.start + cur_entry.size)
                        return;
 
                /* Reduce size by any delta from the original address. */
@@ -562,12 +558,32 @@ static void process_e820_entry(struct boot_e820_entry *entry,
        }
 }
 
-static unsigned long find_random_phys_addr(unsigned long minimum,
-                                          unsigned long image_size)
+static void process_e820_entries(unsigned long minimum,
+                                unsigned long image_size)
 {
        int i;
-       unsigned long addr;
+       struct mem_vector region;
+       struct boot_e820_entry *entry;
+
+       /* Verify potential e820 positions, appending to slots list. */
+       for (i = 0; i < boot_params->e820_entries; i++) {
+               entry = &boot_params->e820_table[i];
+               /* Skip non-RAM entries. */
+               if (entry->type != E820_TYPE_RAM)
+                       continue;
+               region.start = entry->addr;
+               region.size = entry->size;
+               process_mem_region(&region, minimum, image_size);
+               if (slot_area_index == MAX_SLOT_AREA) {
+                       debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+                       break;
+               }
+       }
+}
 
+static unsigned long find_random_phys_addr(unsigned long minimum,
+                                          unsigned long image_size)
+{
        /* Check if we had too many memmaps. */
        if (memmap_too_large) {
                debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
@@ -577,16 +593,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
        /* Make sure minimum is aligned. */
        minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
 
-       /* Verify potential e820 positions, appending to slots list. */
-       for (i = 0; i < boot_params->e820_entries; i++) {
-               process_e820_entry(&boot_params->e820_table[i], minimum,
-                                  image_size);
-               if (slot_area_index == MAX_SLOT_AREA) {
-                       debug_putstr("Aborted e820 scan (slot_areas full)!\n");
-                       break;
-               }
-       }
-
+       process_e820_entries(minimum, image_size);
        return slots_fetch_random();
 }
 
index 28029be47fbb839f248826b517a9e295f4389395..f1aa43854bed423e7bfccaa84ff66ea91996b731 100644 (file)
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
index 2efc768e43627a48118d28729c380c0fbcaa7681..72d867f6b518e4db5a79a10c924f858a3edb0af8 100644 (file)
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
-#define acpi_unlazy_tlb(x)     leave_mm(x)
-
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
@@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
         * you call efi_mem_attributes() during boot and at runtime,
         * you could theoretically see different attributes.
         *
-        * Since we are yet to see any x86 platforms that require
-        * anything other than PAGE_KERNEL (some arm64 platforms
-        * require the equivalent of PAGE_KERNEL_NOCACHE), return that
-        * until we know differently.
+        * We are yet to see any x86 platforms that require anything
+        * other than PAGE_KERNEL (some ARM64 platforms require the
+        * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+        * is active, the ACPI information will not be encrypted,
+        * so return PAGE_KERNEL_NOENC until we know differently.
         */
-        return PAGE_KERNEL;
+       return PAGE_KERNEL_NOENC;
 }
 #endif
 
index e01f7f7ccb0c5711db4f1fe130938cee36169965..84ae170bc3d0cd73cfa7d08fa6f730bd3106e4ea 100644 (file)
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+                       char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
index 5a28e8e55e36fd2164c0cda175288a7fcc534c4b..66ac08607471c4a6fcdfb919304f2fd3405458ef 100644 (file)
 
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME                ( 7*32+10) /* AMD Secure Memory Encryption */
 
 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
index 5dff775af7cd6456f7177d9ce5888ae78dc6bc10..c10c9128f54e6b7296014a74e7a253a1eedaacd9 100644 (file)
 # define DISABLE_K6_MTRR       (1<<(X86_FEATURE_K6_MTRR & 31))
 # define DISABLE_CYRIX_ARR     (1<<(X86_FEATURE_CYRIX_ARR & 31))
 # define DISABLE_CENTAUR_MCR   (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID          0
 #else
 # define DISABLE_VME           0
 # define DISABLE_K6_MTRR       0
 # define DISABLE_CYRIX_ARR     0
 # define DISABLE_CENTAUR_MCR   0
+# define DISABLE_PCID          (1<<(X86_FEATURE_PCID & 31))
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
 #define DISABLED_MASK1 0
 #define DISABLED_MASK2 0
 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4 0
+#define DISABLED_MASK4 (DISABLE_PCID)
 #define DISABLED_MASK5 0
 #define DISABLED_MASK6 0
 #define DISABLED_MASK7 0
index 398c79889f5c43d384b72238ce025f140debeb0b..1387dafdba2d2c24061adb014354b82174f620d4 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <linux/dma-contiguous.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-       return paddr;
+       return __sme_set(paddr);
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-       return daddr;
+       return __sme_clr(daddr);
 }
 #endif /* CONFIG_X86_DMA_REMAP */
 
index 3c69fed215c56c3203e59d97a6c7e11381c97cf8..a8e15b04565b842def6a1bd1fa4b3b03db756c3c 100644 (file)
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap                early_ioremap
-#define dmi_early_unmap                early_iounmap
-#define dmi_remap              ioremap_cache
-#define dmi_unmap              iounmap
+#define dmi_early_remap                early_memremap
+#define dmi_early_unmap                early_memunmap
+#define dmi_remap(_x, _l)      memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x)          memunmap(_x)
 
 #endif /* _ASM_X86_DMI_H */
index a504adc661a4954ca0d1c5ed04149fece0e042a3..cd266d830e4960fd023aa0411c991a2701db09f9 100644 (file)
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
 extern void e820__reallocate_tables(void);
 extern void e820__register_nosave_regions(unsigned long limit_pfn);
 
+extern int  e820__get_entry_type(u64 start, u64 end);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained inside
  * the ISA region.
index 9aeb91935ce02387d8dae5e2f51bf1750f420a43..a3de31ffb72254199d769afd9266997f563c67d5 100644 (file)
@@ -304,8 +304,8 @@ static inline int mmap_is_ia32(void)
                test_thread_flag(TIF_ADDR32));
 }
 
-extern unsigned long tasksize_32bit(void);
-extern unsigned long tasksize_64bit(void);
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 
 #ifdef CONFIG_X86_32
index b65155cc3760a72b49b680c3f70923ddedf684d2..dcd9fb55e67991821d46602754a392c6f2ed0e06 100644 (file)
@@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx,
 }
 #endif
 
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+                                     unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+                                     unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size);
+
 #include <asm-generic/fixmap.h>
 
 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
index 474eb8c66feeb2c98de2f5d6fe1db84de752c806..05c4aa00cc862e3b1dad1b344b0eddb9d6f44db4 100644 (file)
@@ -7,6 +7,7 @@ struct x86_mapping_info {
        unsigned long page_flag;         /* page flag for PMD or PUD entry */
        unsigned long offset;            /* ident mapping offset */
        bool direct_gbpages;             /* PUD level 1GB page support */
+       unsigned long kernpg_flag;       /* kernel pagetable flag override */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
index 48febf07e828099d0580be40ad82a8baee6ade18..4bc6f459a8b6dd9861f078c794d6c26534db5148 100644 (file)
@@ -381,4 +381,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif
 
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+                                       unsigned long size,
+                                       unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+                                     unsigned long size);
+
 #endif /* _ASM_X86_IO_H */
index 70ef205489f00e53ff568180c4dcbf6fb9e6ded1..942c1f444da88ddeb182e57f582a068c15cb2717 100644 (file)
@@ -147,7 +147,8 @@ unsigned long
 relocate_kernel(unsigned long indirection_page,
                unsigned long page_list,
                unsigned long start_address,
-               unsigned int preserve_context);
+               unsigned int preserve_context,
+               unsigned int sme_active);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
@@ -207,6 +208,14 @@ struct kexec_entry64_regs {
        uint64_t r15;
        uint64_t rip;
 };
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+                                      gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
 #endif
 
 typedef void crash_vmclear_fn(void);
index 87ac4fba6d8e12f07e8a9f191bdb028a1c3e6234..7cbaab523f22dcd91812dc269021d1e884ec0df2 100644 (file)
@@ -1078,7 +1078,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask);
+               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644 (file)
index 0000000..8e618fc
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern unsigned long sme_me_mask;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+                        unsigned long decrypted_kernel_vaddr,
+                        unsigned long kernel_len,
+                        unsigned long encryption_wa,
+                        unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+                             unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+                             unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void __init sme_encrypt_kernel(void);
+void __init sme_enable(struct boot_params *bp);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+
+#else  /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask    0UL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+                                           unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+                                           unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void __init sme_encrypt_kernel(void) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register.  Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoids special case handling of PGD allocations.
+ */
+#define __sme_pa(x)            (__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x)    (__pa_nodebug(x) | sme_me_mask)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __X86_MEM_ENCRYPT_H__ */
index 79b647a7ebd0079b96472e52634c898301d3a63e..bb8c597c2248a9c8341d04813b04e1d42e9c7019 100644 (file)
@@ -3,12 +3,28 @@
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/atomic.h>
 
 /*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
  */
 typedef struct {
+       /*
+        * ctx_id uniquely identifies this mm_struct.  A ctx_id will never
+        * be reused, and zero is not a valid ctx_id.
+        */
+       u64 ctx_id;
+
+       /*
+        * Any code that needs to do any sort of TLB flushing for this
+        * mm will first make its changes to the page tables, then
+        * increment tlb_gen, then flush.  This lets the low-level
+        * flushing code keep track of what needs flushing.
+        *
+        * This is not used on Xen PV.
+        */
+       atomic64_t tlb_gen;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
        struct ldt_struct *ldt;
 #endif
@@ -37,6 +53,11 @@ typedef struct {
 #endif
 } mm_context_t;
 
+#define INIT_MM_CONTEXT(mm)                                            \
+       .context = {                                                    \
+               .ctx_id = 1,                                            \
+       }
+
 void leave_mm(int cpu);
 
 #endif /* _ASM_X86_MMU_H */
index 265c907d7d4c9b8c69e24792a20c5a3dfb6c95ee..d25d9f4abb15a1e06e83e6e4ebac1ef9f2c70c04 100644 (file)
@@ -12,6 +12,9 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
 #ifndef CONFIG_PARAVIRT
 static inline void paravirt_activate_mm(struct mm_struct *prev,
                                        struct mm_struct *next)
@@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+       int cpu = smp_processor_id();
+
+       if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+               cpumask_clear_cpu(cpu, mm_cpumask(mm));
 }
 
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+       atomic64_set(&mm->context.tlb_gen, 0);
+
        #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and always allocated */
@@ -292,6 +300,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
        unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+       if (static_cpu_has(X86_FEATURE_PCID))
+               cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
        /* For now, be very restrictive about when this can be called. */
        VM_WARN_ON(in_nmi() || preemptible());
 
index a0d662be4c5b8545a7b0b8c5c846187ced8d0020..7d7404756bb4a734bcb04c5f57931daff8af73b3 100644 (file)
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 }
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+               unsigned long flags);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
 {
 }
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+               unsigned long len, unsigned long flags)
+{
+       return addr;
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
index 5573c75f8e4ced276c8585b71f0df9b786ea9e90..17f5c12e1afd0c6ddb3fa2e6fc94b1fec52f7d5c 100644 (file)
 #define MSR_K8_TOP_MEM1                        0xc001001a
 #define MSR_K8_TOP_MEM2                        0xc001001d
 #define MSR_K8_SYSCFG                  0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT  23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT      BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
 #define MSR_K8_INT_PENDING_MSG         0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK                0x18000000
index b4a0d43248cf3d6f2c7ef042200c31a25f29f122..b50df06ad251f143e105843ab84ac1a14f40abdf 100644 (file)
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+#ifdef CONFIG_X86_MCE
+#define arch_unmap_kpfn arch_unmap_kpfn
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
index 7bd0099384cac4ed0fa89a4e25c42e4a63ae5f9c..b98ed9d1463098936bcebbae46b59dc00495508c 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
+#include <linux/mem_encrypt.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT             12
@@ -15,7 +16,7 @@
 #define PUD_PAGE_SIZE          (_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK          (~(PUD_PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK                ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#define __PHYSICAL_MASK                ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
 #define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
index 77037b6f1caa22f622f50d67ea6cebd00f76f685..bbeae4a2bd01a3209e6d68f8f2af918eeb17dff2 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_PGTABLE_H
 #define _ASM_X86_PGTABLE_H
 
+#include <linux/mem_encrypt.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))     \
         : (prot))
 
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
+
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -38,6 +48,8 @@ extern struct list_head pgd_list;
 
 extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
+extern pmdval_t early_pmd_flags;
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
        return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+       return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
 static inline int p4d_large(p4d_t p4d)
 {
        /* No 512 GiB pages yet */
@@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pmd_page(pmd)          \
-       pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
+#define pmd_page(pmd)  pfn_to_page(pmd_pfn(pmd))
 
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pud_page(pud)          \
-       pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
+#define pud_page(pud)  pfn_to_page(pud_pfn(pud))
 
 /* Find an entry in the second-level page table.. */
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define p4d_page(p4d)          \
-       pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+#define p4d_page(p4d)  pfn_to_page(p4d_pfn(p4d))
 
 /* Find an entry in the third-level page table.. */
 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pgd_page(pgd)          pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd)  pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
index bf9638e1ee4215d4101d2836d5c3963d59e6dbab..399261ce904ca1df269e5194b8e523d73b8a3f69 100644 (file)
@@ -2,6 +2,8 @@
 #define _ASM_X86_PGTABLE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
 #include <asm/page_types.h>
 
 #define FIRST_USER_ADDRESS     0UL
 
 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
-                        _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
-                        _PAGE_DIRTY)
+#define _PAGE_TABLE_NOENC      (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+                                _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC    (_PAGE_PRESENT | _PAGE_RW |             \
+                                _PAGE_ACCESSED | _PAGE_DIRTY)
 
 /*
  * Set of bits not changed in pte_modify.  The pte's
@@ -159,6 +161,7 @@ enum page_cache_mode {
 
 #define _PAGE_CACHE_MASK       (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
 #define _PAGE_NOCACHE          (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP         (cachemode2protval(_PAGE_CACHE_MODE_WP))
 
 #define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
 #define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -187,22 +190,42 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_VVAR             (__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP               (__PAGE_KERNEL | _PAGE_CACHE_WP)
 
 #define __PAGE_KERNEL_IO               (__PAGE_KERNEL)
 #define __PAGE_KERNEL_IO_NOCACHE       (__PAGE_KERNEL_NOCACHE)
 
-#define PAGE_KERNEL                    __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO                 __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC               __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX                 __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE            __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE              __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC         __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL           __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VVAR               __pgprot(__PAGE_KERNEL_VVAR)
+#ifndef __ASSEMBLY__
+
+#define _PAGE_ENC      (_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
+                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
+                        _PAGE_DIRTY | _PAGE_ENC)
+
+#define __PAGE_KERNEL_ENC      (__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP   (__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC    (__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
+
+#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC      __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX         __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VSYSCALL   __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR       __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO         __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
 
-#define PAGE_KERNEL_IO                 __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE         __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#endif /* __ASSEMBLY__ */
 
 /*         xwr */
 #define __P000 PAGE_NONE
@@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
 #else
 #include <asm-generic/pgtable-nop4d.h>
 
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+       return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
 static inline p4dval_t native_p4d_val(p4d_t p4d)
 {
        return native_pgd_val(p4d.pgd);
index 79aa2f98398d4eaabf45a1493baaa072f11782ec..dc723b64acf0675689c1feb187bfd957480c5d94 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_PROCESSOR_FLAGS_H
 
 #include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_VM86
 #define X86_VM_MASK    X86_EFLAGS_VM
  * CR3_ADDR_MASK is the mask used by read_cr3_pa().
  */
 #ifdef CONFIG_X86_64
-/* Mask off the address space ID bits. */
-#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
-#define CR3_PCID_MASK 0xFFFull
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK  __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK  0xFFFull
+#define CR3_NOFLUSH    BIT_ULL(63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
  * a tiny bit of code size by setting all the bits.
  */
-#define CR3_ADDR_MASK 0xFFFFFFFFull
-#define CR3_PCID_MASK 0ull
+#define CR3_ADDR_MASK  0xFFFFFFFFull
+#define CR3_PCID_MASK  0ull
+#define CR3_NOFLUSH    0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
index 028245e1c42b23d1498643427ebb73be25ded661..c61bab07a84e05a0a21afc9e166db6e37fa15442 100644 (file)
@@ -29,6 +29,7 @@ struct vm86;
 #include <linux/math64.h>
 #include <linux/err.h>
 #include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
 
 /*
  * We handle most unaligned accesses in hardware.  On the other hand
@@ -239,9 +240,14 @@ static inline unsigned long read_cr3_pa(void)
        return __read_cr3() & CR3_ADDR_MASK;
 }
 
+static inline unsigned long native_read_cr3_pa(void)
+{
+       return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
-       write_cr3(__pa(pgdir));
+       write_cr3(__sme_pa(pgdir));
 }
 
 #ifdef CONFIG_X86_32
@@ -802,7 +808,9 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define IA32_PAGE_OFFSET       PAGE_OFFSET
 #define TASK_SIZE              PAGE_OFFSET
+#define TASK_SIZE_LOW          TASK_SIZE
 #define TASK_SIZE_MAX          TASK_SIZE
+#define DEFAULT_MAP_WINDOW     TASK_SIZE
 #define STACK_TOP              TASK_SIZE
 #define STACK_TOP_MAX          STACK_TOP
 
@@ -842,7 +850,9 @@ static inline void spin_lock_prefetch(const void *x)
  * particular problem by preventing anything from being mapped
  * at the maximum canonical address.
  */
-#define TASK_SIZE_MAX  ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX  ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW     ((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
@@ -850,12 +860,14 @@ static inline void spin_lock_prefetch(const void *x)
 #define IA32_PAGE_OFFSET       ((current->personality & ADDR_LIMIT_3GB) ? \
                                        0xc0000000 : 0xFFFFe000)
 
+#define TASK_SIZE_LOW          (test_thread_flag(TIF_ADDR32) ? \
+                                       IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
 #define TASK_SIZE              (test_thread_flag(TIF_ADDR32) ? \
                                        IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 #define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
                                        IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
-#define STACK_TOP              TASK_SIZE
+#define STACK_TOP              TASK_SIZE_LOW
 #define STACK_TOP_MAX          TASK_SIZE_MAX
 
 #define INIT_THREAD  {                                         \
@@ -876,7 +888,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
  * space during mmap's.
  */
 #define __TASK_UNMAPPED_BASE(task_size)        (PAGE_ALIGN(task_size / 3))
-#define TASK_UNMAPPED_BASE             __TASK_UNMAPPED_BASE(TASK_SIZE)
+#define TASK_UNMAPPED_BASE             __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
 
 #define KSTK_EIP(task)         (task_pt_regs(task)->ip)
 
index 230e1903acf07faa831c8d2496a4457aaae9b5ea..90d91520c13ab09b566ca0a599a6170353153e47 100644 (file)
@@ -1,6 +1,15 @@
 #ifndef _ARCH_X86_REALMODE_H
 #define _ARCH_X86_REALMODE_H
 
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT                0
+#define TH_FLAGS_SME_ACTIVE            BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/io.h>
 
@@ -38,6 +47,7 @@ struct trampoline_header {
        u64 start;
        u64 efer;
        u32 cr4;
+       u32 flags;
 #endif
 };
 
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
 void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ARCH_X86_REALMODE_H */
index eaec6c364e42d07f55930a80ef29b9d5c248165d..cd71273ec49d91aacffe3eff8883617ef94d8627 100644 (file)
@@ -11,6 +11,7 @@
  * Executability : eXecutable, NoteXecutable
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
+ * Encryption    : Encrypted, Decrypted
  *
  * Within a category, the attributes are mutually exclusive.
  *
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
 int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
index 50ea3482e1d1d0babfecf7864a6e9307ba6651fe..d23e61dc0640e451d8d1f997fad65af11b30dbfa 100644 (file)
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
        __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
 }
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+       u64 new_tlb_gen;
+
+       /*
+        * Bump the generation count.  This also serves as a full barrier
+        * that synchronizes with switch_mm(): callers are required to order
+        * their read of mm_cpumask after their writes to the paging
+        * structures.
+        */
+       smp_mb__before_atomic();
+       new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+       smp_mb__after_atomic();
+
+       return new_tlb_gen;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+struct tlb_context {
+       u64 ctx_id;
+       u64 tlb_gen;
+};
+
 struct tlb_state {
        /*
         * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -73,13 +101,35 @@ struct tlb_state {
         * mode even if we've already switched back to swapper_pg_dir.
         */
        struct mm_struct *loaded_mm;
-       int state;
+       u16 loaded_mm_asid;
+       u16 next_asid;
 
        /*
         * Access to this CR4 shadow and to H/W CR4 is protected by
         * disabling interrupts when modifying either one.
         */
        unsigned long cr4;
+
+       /*
+        * This is a list of all contexts that might exist in the TLB.
+        * There is one per ASID that we use, and the ASID (what the
+        * CPU calls PCID) is the index into ctxts.
+        *
+        * For each context, ctx_id indicates which mm the TLB's user
+        * entries came from.  As an invariant, the TLB will never
+        * contain entries that were out-of-date as of the time when that
+        * mm reached the tlb_gen in the list.
+        *
+        * To be clear, this means that it's legal for the TLB code to
+        * flush the TLB without updating tlb_gen.  This can happen
+        * (for now, at least) due to paravirt remote flushes.
+        *
+        * NB: context 0 is a bit special, since it's also used by
+        * various bits of init code.  This is fine -- code that
+        * isn't aware of PCID will end up harmlessly flushing
+        * context 0.
+        */
+       struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void)
                __flush_tlb_global();
        else
                __flush_tlb();
+
+       /*
+        * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+        * we'd end up flushing kernel translations for the current ASID but
+        * we might fail to flush kernel translations for other cached ASIDs.
+        *
+        * To avoid this issue, we force PCID off if PGE is off.
+        */
 }
 
 static inline void __flush_tlb_one(unsigned long addr)
@@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-       struct mm_struct *mm;
-       unsigned long start;
-       unsigned long end;
+       /*
+        * We support several kinds of flushes.
+        *
+        * - Fully flush a single mm.  .mm will be set, .end will be
+        *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+        *   which the IPI sender is trying to catch us up.
+        *
+        * - Partially flush a single mm.  .mm will be set, .start and
+        *   .end will indicate the range, and .new_tlb_gen will be set
+        *   such that the changes between generation .new_tlb_gen-1 and
+        *   .new_tlb_gen are entirely contained in the indicated range.
+        *
+        * - Fully flush all mms whose tlb_gens have been updated.  .mm
+        *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+        *   will be zero.
+        */
+       struct mm_struct        *mm;
+       unsigned long           start;
+       unsigned long           end;
+       u64                     new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
@@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info);
 
-#define TLBSTATE_OK    1
-#define TLBSTATE_LAZY  2
-
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
                                        struct mm_struct *mm)
 {
+       inc_mm_tlb_gen(mm);
        cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
index c4b9dc2f67c5f6f7a095dd7d4a3a6bb7fd7b3975..9f42beefc67a3d5d5242e054425c713ca394553e 100644 (file)
@@ -7,12 +7,24 @@
 #ifndef _ASM_X86_VGA_H
 #define _ASM_X86_VGA_H
 
+#include <asm/set_memory.h>
+
 /*
  *     On the PC, we can just recalculate addresses and then
  *     access the videoram directly without any black magic.
+ *     To support memory encryption however, we need to access
+ *     the videoram as decrypted memory.
  */
 
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s)                                      \
+({                                                             \
+       unsigned long start = (unsigned long)phys_to_virt(x);   \
+                                                               \
+       if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))                 \
+               set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
+                                                               \
+       start;                                                  \
+})
 
 #define vga_readb(x) (*(x))
 #define vga_writeb(x, y) (*(y) = (x))
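
For illustration only (not part of the patch; the window address and size are the conventional legacy VGA values and the example_ name is made up), a caller of the reworked macro would see something like:

        static void example_poke_legacy_vga(void)
        {
                /* 0xa0000 / 64K is the usual legacy VGA window */
                unsigned long vram = VGA_MAP_MEM(0xa0000, 0x10000);

                /*
                 * With CONFIG_AMD_MEM_ENCRYPT and SME active, the window is
                 * now mapped decrypted, so the byte written below is what the
                 * display hardware actually sees rather than its ciphertext.
                 */
                vga_writeb(0x41, (u8 *)vram);
        }
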
index 7491e73d92530bf868a9b91b2c71c892bcbdb94a..97bb2caf342879ba365582d3bd7b1caab5f77dfe 100644 (file)
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
 #define        ACPI_INVALID_GSI                INT_MIN
 
 /*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
  * with sanity checks for phys == 0 and size == 0.
  */
 char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
        if (!phys || !size)
                return NULL;
 
-       return early_ioremap(phys, size);
+       return early_memremap(phys, size);
 }
 
 void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
        if (!map || !size)
                return;
 
-       early_iounmap(map, size);
+       early_memunmap(map, size);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
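
The switch in acpi/boot.c from early_ioremap() to early_memremap() matters under SME: the ACPI tables live in ordinary RAM, so they need the regular (encryption-aware) memory attributes rather than a device mapping. A rough caller-side sketch, purely illustrative (the example_ name is made up; real users go through the ACPI core):

        static void __init example_peek_acpi_table(unsigned long phys)
        {
                struct acpi_table_header *hdr;

                hdr = (struct acpi_table_header *)__acpi_map_table(phys, sizeof(*hdr));
                if (!hdr)
                        return;

                pr_info("ACPI table %.4s, length %u\n", hdr->signature, hdr->length);

                __acpi_unmap_table((char *)hdr, sizeof(*hdr));
        }
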
index 3b9e220621f83c8a5161e8b57b297233370f72ea..110ca5d2bb872a7f15cffe4349ffab74b03a4c86 100644 (file)
@@ -548,8 +548,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
+       u32 dummy;
+
        early_init_amd_mc(c);
 
+       rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
        /*
         * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
         * with P/T states and does not stop in deep C-states
@@ -612,6 +616,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
         */
        if (cpu_has_amd_erratum(c, amd_erratum_400))
                set_cpu_bug(c, X86_BUG_AMD_E400);
+
+       /*
+        * BIOS support is required for SME. If BIOS has enabled SME then
+        * adjust x86_phys_bits by the SME physical address space reduction
+        * value. If BIOS has not enabled SME then don't advertise the
+        * feature (set in scattered.c). Also, since the SME support requires
+        * long mode, don't advertise the feature under CONFIG_X86_32.
+        */
+       if (cpu_has(c, X86_FEATURE_SME)) {
+               u64 msr;
+
+               /* Check if SME is enabled */
+               rdmsrl(MSR_K8_SYSCFG, msr);
+               if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+                       c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+                       if (IS_ENABLED(CONFIG_X86_32))
+                               clear_cpu_cap(c, X86_FEATURE_SME);
+               } else {
+                       clear_cpu_cap(c, X86_FEATURE_SME);
+               }
+       }
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -730,8 +755,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd(struct cpuinfo_x86 *c)
 {
-       u32 dummy;
-
        early_init_amd(c);
 
        /*
@@ -793,8 +816,6 @@ static void init_amd(struct cpuinfo_x86 *c)
        if (c->x86 > 0x11)
                set_cpu_cap(c, X86_FEATURE_ARAT);
 
-       rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
        /* 3DNow or LM implies PREFETCHW */
        if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
                if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
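
The new early_init_amd() block above keys off CPUID leaf 0x8000001f together with the SYSCFG MSR: EBX[5:0] of that leaf gives the position of the encryption (C) bit, commonly 47, and EBX[11:6] the number of physical address bits lost to it. Condensed to a sketch (illustrative only; the example_ name is made up and the full logic is the hunk above plus sme_enable() in the new mem_encrypt code):

        static void example_sme_phys_bits(struct cpuinfo_x86 *c)
        {
                u32 ebx = cpuid_ebx(0x8000001f);
                u64 syscfg;

                rdmsrl(MSR_K8_SYSCFG, syscfg);
                if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT))
                        return;                         /* BIOS left SME disabled */

                /* ebx & 0x3f       : bit position of the C-bit              */
                /* (ebx >> 6) & 0x3f: physical address space reduction, bits */
                c->x86_phys_bits -= (ebx >> 6) & 0x3f;
        }
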
index 0af86d9242da0f6882f1f5252dfa659038c627ac..db684880d74ae47fbff37888ff31dd13b9a4653b 100644 (file)
 
 void __init check_bugs(void)
 {
+#ifdef CONFIG_X86_32
+       /*
+        * Regardless of whether PCID is enumerated, the SDM says
+        * that it can't be enabled in 32-bit mode.
+        */
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
        identify_boot_cpu();
 
        if (!IS_ENABLED(CONFIG_SMP)) {
index c8b39870f33e8d5579eb1b30ecd24ace23de9d86..b95cd94ca97bc191121e87bd5c0471d0ad8de494 100644 (file)
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+       /* require an exact match without trailing characters */
+       if (strlen(s))
+               return 0;
+
+       /* do not emit a message if the feature is not present */
+       if (!boot_cpu_has(X86_FEATURE_PCID))
+               return 1;
+
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
+       pr_info("nopcid: PCID feature disabled\n");
+       return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
 static int __init x86_noinvpcid_setup(char *s)
 {
        /* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
        }
 }
 
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+       if (cpu_has(c, X86_FEATURE_PCID)) {
+               if (cpu_has(c, X86_FEATURE_PGE)) {
+                       cr4_set_bits(X86_CR4_PCIDE);
+               } else {
+                       /*
+                        * flush_tlb_all(), as currently implemented, won't
+                        * work if PCID is on but PGE is not.  Since that
+                        * combination doesn't exist on real hardware, there's
+                        * no reason to try to fully support it, but it's
+                        * polite to avoid corrupting data if we're on
+                        * an improperly configured VM.
+                        */
+                       clear_cpu_cap(c, X86_FEATURE_PCID);
+               }
+       }
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
        setup_smep(c);
        setup_smap(c);
 
+       /* Set up PCID */
+       setup_pcid(c);
+
        /*
         * The vendor-specific functions might have changed features.
         * Now we do "generic changes."
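
Taken together, the two hunks above mean CR4.PCIDE is set only when the CPU enumerates both PCID and PGE and the user has not booted with nopcid (64-bit only). A small sketch of how later code can test the outcome (illustrative; the example_ name is made up):

        static bool example_pcid_enabled(void)
        {
                /* True only if PCID is enumerated, PGE exists and "nopcid"
                 * was not passed on the kernel command line. */
                return boot_cpu_has(X86_FEATURE_PCID) &&
                       (__read_cr4() & X86_CR4_PCIDE);
        }
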
index 6dde0497efc7514b9981bd2d33014a6686c65677..3b413065c61308104794db2efe6ab939b39411a1 100644 (file)
@@ -51,6 +51,7 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/reboot.h>
+#include <asm/set_memory.h>
 
 #include "mce-internal.h"
 
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
        return ret;
 }
 
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+       unsigned long decoy_addr;
+
+       /*
+        * Unmap this page from the kernel 1:1 mappings to make sure
+        * we don't log more errors because of speculative access to
+        * the page.
+        * We would like to just call:
+        *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+        * but doing that would radically increase the odds of a
+        * speculative access to the poison page because we'd have
+        * the virtual address of the kernel 1:1 mapping sitting
+        * around in registers.
+        * Instead we get tricky.  We create a non-canonical address
+        * that looks just like the one we want, but has bit 63 flipped.
+        * This relies on set_memory_np() not checking whether we passed
+        * a legal address.
+        */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+       if (set_memory_np(decoy_addr, 1))
+               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
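
The decoy address built above keeps the canonical 1:1 virtual address of the poisoned page out of registers while still letting set_memory_np() find the right PTE, since the page-table walk only consumes the low address bits. A worked example with an illustrative pfn and the default (non-KASLR) 4-level PAGE_OFFSET:

        /*
         * Illustration only (assumes PAGE_OFFSET = 0xffff880000000000):
         *
         *   pfn                  = 0x12345
         *   1:1 mapping address  = PAGE_OFFSET + (pfn << PAGE_SHIFT)
         *                        = 0xffff880012345000
         *   decoy_addr           = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63))
         *                        = 0x7fff880012345000
         *
         * Bit 63 is clear while bits 62..47 are set, so the decoy is
         * non-canonical and useless to a speculative load, yet set_memory_np()
         * still resolves it to the same PTE as the real 1:1 address.
         */
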
index 23c23508c0125e50caf69a6959aaf000151cf572..05459ad3db46e2139b7d97514899d398c321c541 100644 (file)
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
        { X86_FEATURE_HW_PSTATE,        CPUID_EDX,  7, 0x80000007, 0 },
        { X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
        { X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+       { X86_FEATURE_SME,              CPUID_EAX,  0, 0x8000001f, 0 },
        { 0, 0, 0, 0, 0 }
 };
 
index 532da61d605ccc2271067fdf67bac83c454616aa..71c11ad5643e80059d4f262002fc9620044b594b 100644 (file)
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
  * Note: this function only works correctly once the E820 table is sorted and
  * not-overlapping (at least for the range specified), which is the case normally.
  */
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+                                            enum e820_type type)
 {
        int i;
 
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
                 * coverage of the desired range exists:
                 */
                if (start >= end)
-                       return 1;
+                       return entry;
        }
-       return 0;
+
+       return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+       return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+       struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+       return entry ? entry->type : -EINVAL;
 }
 
 /*
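
e820__get_entry_type() reuses the coverage walk with a type of 0 (no type filtering) and reports the covering entry's type, or -EINVAL when the range is not fully covered. A one-line usage sketch (illustrative; the example_ name is made up, and the new SME/setup_data code is the intended consumer):

        static bool example_range_is_ram(u64 paddr, u64 size)
        {
                /* True when all of [paddr, paddr + size) is ordinary RAM */
                return e820__get_entry_type(paddr, paddr + size) == E820_TYPE_RAM;
        }
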
index 6b91e2eb8d3f8a5b8ad1a57c7a2a47d0a3b4440d..9c4e7ba6870c142921cfbbd07b8bbf45285e5c07 100644 (file)
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
 
        pte_p = pte_offset_kernel(&pmd, addr);
        stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
-       pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+       pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
        for (n = 0; n < ESPFIX_PTE_CLONES; n++)
                set_pte(&pte_p[n*PTE_STRIDE], pte);
 
index 9ba79543d9ee9f1ee19bce38955467afa04b125f..6a193b93fd952d59b4bca8a2071859edf8bcfcb6 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/start_kernel.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -33,7 +34,6 @@
 /*
  * Manage page tables very early on.
  */
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
        return ptr - (void *)_text + (void *)physaddr;
 }
 
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+                                 struct boot_params *bp)
 {
        unsigned long load_delta, *p;
+       unsigned long pgtable_flags;
        pgdval_t *pgd;
        p4dval_t *p4d;
        pudval_t *pud;
@@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
        if (load_delta & ~PMD_PAGE_MASK)
                for (;;);
 
+       /* Activate Secure Memory Encryption (SME) if supported and enabled */
+       sme_enable(bp);
+
+       /* Include the SME encryption mask in the fixup value */
+       load_delta += sme_get_me_mask();
+
        /* Fixup the physical addresses in the page table */
 
        pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr)
         * creates a bunch of nonsense entries but that is fine --
         * it avoids problems around wraparound.
         */
+
        next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
        pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
        pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
 
+       pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-               pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
-               pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+               pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+               pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
 
                i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
-               p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-               p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+               p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+               p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
        } else {
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-               pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-               pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+               pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+               pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
        }
 
        i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
-       pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
-       pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+       pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+       pud[i + 1] = (pudval_t)pmd + pgtable_flags;
 
        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+       pmd_entry += sme_get_me_mask();
        pmd_entry +=  physaddr;
 
        for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
                        pmd[i] += load_delta;
        }
 
-       /* Fixup phys_base */
+       /*
+        * Fixup phys_base - remove the memory encryption mask to obtain
+        * the true physical address.
+        */
        p = fixup_pointer(&phys_base, physaddr);
-       *p += load_delta;
+       *p += load_delta - sme_get_me_mask();
+
+       /* Encrypt the kernel (if SME is active) */
+       sme_encrypt_kernel();
+
+       /*
+        * Return the SME encryption mask (if SME is active) to be used as a
+        * modifier for the initial pgdir entry programmed into CR3.
+        */
+       return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+       /*
+        * Return the SME encryption mask (if SME is active) to be used as a
+        * modifier for the initial pgdir entry programmed into CR3.
+        */
+       return sme_get_me_mask();
 }
 
 /* Wipe all early page tables except for the kernel symbol map */
@@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void)
 {
        memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
        next_early_pgt = 0;
-       write_cr3(__pa_nodebug(early_top_pgt));
+       write_cr3(__sme_pa_nodebug(early_top_pgt));
 }
 
 /* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
 {
        unsigned long physaddr = address - __PAGE_OFFSET;
        pgdval_t pgd, *pgd_p;
        p4dval_t p4d, *p4d_p;
        pudval_t pud, *pud_p;
-       pmdval_t pmd, *pmd_p;
+       pmdval_t *pmd_p;
 
        /* Invalid address or early pgt is done ?  */
        if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -216,12 +249,21 @@ again:
                memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
        }
-       pmd = (physaddr & PMD_MASK) + early_pmd_flags;
        pmd_p[pmd_index(address)] = pmd;
 
        return 0;
 }
 
+int __init early_make_pgtable(unsigned long address)
+{
+       unsigned long physaddr = address - __PAGE_OFFSET;
+       pmdval_t pmd;
+
+       pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+       return __early_make_pgtable(address, pmd);
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
        char * command_line;
        unsigned long cmd_line_ptr;
 
+       /*
+        * If SME is active, this will create decrypted mappings of the
+        * boot data in advance of the copy operations.
+        */
+       sme_map_bootdata(real_mode_data);
+
        memcpy(&boot_params, real_mode_data, sizeof boot_params);
        sanitize_boot_params(&boot_params);
        cmd_line_ptr = get_cmd_line_ptr();
@@ -251,6 +299,14 @@ static void __init copy_bootdata(char *real_mode_data)
                command_line = __va(cmd_line_ptr);
                memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
        }
+
+       /*
+        * The old boot data is no longer needed and won't be reserved,
+        * freeing up that memory for use by the system. If SME is active,
+        * we need to remove the mappings that were created so that the
+        * memory doesn't remain mapped as decrypted.
+        */
+       sme_unmap_bootdata(real_mode_data);
 }
 
 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
@@ -280,6 +336,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
        clear_page(init_top_pgt);
 
+       /*
+        * SME support may update early_pmd_flags to include the memory
+        * encryption mask, so it needs to be called before anything
+        * that may generate a page fault.
+        */
+       sme_early_init();
+
        kasan_early_init();
 
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
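
__startup_64() now mixes sme_get_me_mask() into every early page-table entry (via pgtable_flags), into load_delta, and into its return value so that head_64.S can add the same mask into the initial CR3 value. A worked example with illustrative numbers (a C-bit at position 47 is typical but not guaranteed):

        /*
         * Illustration only, assuming the C-bit is physical bit 47 and a page
         * table page sits at physical 0x1000000:
         *
         *   sme_get_me_mask()             = 1ULL << 47     = 0x0000800000000000
         *   _KERNPG_TABLE_NOENC           (P|RW|A|D)       = 0x0000000000000063
         *   pgtable_flags = NOENC + mask                   = 0x0000800000000063
         *   pgd/p4d/pud entry = 0x1000000 + pgtable_flags  = 0x0000800001000063
         *
         * With SME inactive the mask is 0 and these collapse to the old
         * unencrypted values, so one code path serves both cases.
         */
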
index 6225550883dfe1e98bcf35ac5ed870e334b34399..513cbb012eccc51f18ac16f9dac74a33abaa4c16 100644 (file)
@@ -73,12 +73,19 @@ startup_64:
        /* Sanitize CPU configuration */
        call verify_cpu
 
+       /*
+        * Perform pagetable fixups. Additionally, if SME is active, encrypt
+        * the kernel and retrieve the modifier (SME encryption mask if SME
+        * is active) to be added to the initial pgdir entry that will be
+        * programmed into CR3.
+        */
        leaq    _text(%rip), %rdi
        pushq   %rsi
        call    __startup_64
        popq    %rsi
 
-       movq    $(early_top_pgt - __START_KERNEL_map), %rax
+       /* Form the CR3 value being sure to include the CR3 modifier */
+       addq    $(early_top_pgt - __START_KERNEL_map), %rax
        jmp 1f
 ENTRY(secondary_startup_64)
        /*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
        /* Sanitize CPU configuration */
        call verify_cpu
 
-       movq    $(init_top_pgt - __START_KERNEL_map), %rax
+       /*
+        * Retrieve the modifier (SME encryption mask if SME is active) to be
+        * added to the initial pgdir entry that will be programmed into CR3.
+        */
+       pushq   %rsi
+       call    __startup_secondary_64
+       popq    %rsi
+
+       /* Form the CR3 value being sure to include the CR3 modifier */
+       addq    $(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
        /* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
 NEXT_PAGE(early_top_pgt)
        .fill   511,8,0
 #ifdef CONFIG_X86_5LEVEL
-       .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
        .fill   512,8,0
 #else
 NEXT_PAGE(init_top_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level3_ident_pgt)
-       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .fill   511, 8, 0
 NEXT_PAGE(level2_ident_pgt)
        /* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
 #ifdef CONFIG_X86_5LEVEL
 NEXT_PAGE(level4_kernel_pgt)
        .fill   511,8,0
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
        /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level2_kernel_pgt)
        /*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
 
 NEXT_PAGE(level2_fixmap_pgt)
        .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
        .fill   5,8,0
 
index 38b64587b31be5611a763df6dafe8434db2a66b5..fd6f8fbbe6f2a05d061c3e0196c313293ae8ad08 100644 (file)
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
        struct setup_data_node *node = file->private_data;
        unsigned long remain;
        loff_t pos = *ppos;
-       struct page *pg;
        void *p;
        u64 pa;
 
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
                count = node->len - pos;
 
        pa = node->paddr + sizeof(struct setup_data) + pos;
-       pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
-       if (PageHighMem(pg)) {
-               p = ioremap_cache(pa, count);
-               if (!p)
-                       return -ENXIO;
-       } else
-               p = __va(pa);
+       p = memremap(pa, count, MEMREMAP_WB);
+       if (!p)
+               return -ENOMEM;
 
        remain = copy_to_user(user_buf, p, count);
 
-       if (PageHighMem(pg))
-               iounmap(p);
+       memunmap(p);
 
        if (remain)
                return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
        struct setup_data *data;
        int error;
        struct dentry *d;
-       struct page *pg;
        u64 pa_data;
        int no = 0;
 
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                        goto err_dir;
                }
 
-               pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
-               if (PageHighMem(pg)) {
-                       data = ioremap_cache(pa_data, sizeof(*data));
-                       if (!data) {
-                               kfree(node);
-                               error = -ENXIO;
-                               goto err_dir;
-                       }
-               } else
-                       data = __va(pa_data);
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+               if (!data) {
+                       kfree(node);
+                       error = -ENOMEM;
+                       goto err_dir;
+               }
 
                node->paddr = pa_data;
                node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                error = create_setup_data_node(d, no, node);
                pa_data = data->next;
 
-               if (PageHighMem(pg))
-                       iounmap(data);
+               memunmap(data);
                if (error)
                        goto err_dir;
                no++;
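
Replacing the PageHighMem()/ioremap_cache()/__va() special-casing with memremap(..., MEMREMAP_WB) leaves one path that also does the right thing when the boot data is SME-encrypted. Reduced to a sketch (illustrative; the example_ name is made up):

        static int example_copy_setup_data(u64 paddr, void *dst, size_t len)
        {
                void *p = memremap(paddr, len, MEMREMAP_WB);

                if (!p)
                        return -ENOMEM;

                memcpy(dst, p, len);
                memunmap(p);

                return 0;
        }
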
index 06e1ff5562c0b4a5a28f3c052ed74b545c7ccca0..4b0592ca9e47b332d0ce67f8bcf5f555653587b2 100644 (file)
@@ -16,8 +16,8 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 
-#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
                        *paddr = pa_data;
                        return 0;
                }
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
 
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
                i++;
        }
        return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
        u64 pa_data = boot_params.hdr.setup_data;
 
        while (pa_data) {
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
                if (nr == i) {
                        *size = data->len;
-                       iounmap(data);
+                       memunmap(data);
                        return 0;
                }
 
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
                i++;
        }
        return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
        ret = get_setup_data_paddr(nr, &paddr);
        if (ret)
                return ret;
-       data = ioremap_cache(paddr, sizeof(*data));
+       data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
        if (!data)
                return -ENOMEM;
 
        ret = sprintf(buf, "0x%x\n", data->type);
-       iounmap(data);
+       memunmap(data);
        return ret;
 }
 
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
        ret = get_setup_data_paddr(nr, &paddr);
        if (ret)
                return ret;
-       data = ioremap_cache(paddr, sizeof(*data));
+       data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
        if (!data)
                return -ENOMEM;
 
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
                goto out;
 
        ret = count;
-       p = ioremap_cache(paddr + sizeof(*data), data->len);
+       p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
        if (!p) {
                ret = -ENOMEM;
                goto out;
        }
        memcpy(buf, p + off, count);
-       iounmap(p);
+       memunmap(p);
 out:
-       iounmap(data);
+       memunmap(data);
        return ret;
 }
 
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
        *nr = 0;
        while (pa_data) {
                *nr += 1;
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data) {
                        ret = -ENOMEM;
                        goto out;
                }
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
        }
 
 out:
index cb0a30473c2310b76695c73ec6fad3cd1e7b051f..1f790cf9d38fe0e10e46eaf9b5bef945d25a9370 100644 (file)
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);
-       set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+       set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
        return 0;
 err:
        free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
                .alloc_pgt_page = alloc_pgt_page,
                .context        = image,
                .page_flag      = __PAGE_KERNEL_LARGE_EXEC,
+               .kernpg_flag    = _KERNPG_TABLE_NOENC,
        };
        unsigned long mstart, mend;
        pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
-                                      image->preserve_context);
+                                      image->preserve_context,
+                                      sme_active());
 
 #ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
 {
        kexec_mark_crashkres(false);
 }
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+       /*
+        * If SME is active we need to be sure that kexec pages are
+        * not encrypted because when we boot to the new kernel the
+        * pages won't be accessed encrypted (initially).
+        */
+       return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+       /*
+        * If SME is active we need to reset the pages back to being
+        * an encrypted mapping before freeing them.
+        */
+       set_memory_encrypted((unsigned long)vaddr, pages);
+}
index 0d904d759ff1d42c97e57a7ef5e126bd961b8b03..5cbb3177ed17270b993a4c1a686282ce80608ed1 100644 (file)
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
        }
 }
 
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
 
 static unsigned long __init get_mpc_size(unsigned long physptr)
 {
        struct mpc_table *mpc;
        unsigned long size;
 
-       mpc = early_ioremap(physptr, PAGE_SIZE);
+       mpc = early_memremap(physptr, PAGE_SIZE);
        size = mpc->length;
-       early_iounmap(mpc, PAGE_SIZE);
+       early_memunmap(mpc, PAGE_SIZE);
        apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
 
        return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
        unsigned long size;
 
        size = get_mpc_size(mpf->physptr);
-       mpc = early_ioremap(mpf->physptr, size);
+       mpc = early_memremap(mpf->physptr, size);
+
        /*
         * Read the physical hardware table.  Anything here will
         * override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #endif
                pr_err("BIOS bug, MP table errors detected!...\n");
                pr_cont("... disabling SMP support. (tell your hw vendor)\n");
-               early_iounmap(mpc, size);
+               early_memunmap(mpc, size);
                return -1;
        }
-       early_iounmap(mpc, size);
+       early_memunmap(mpc, size);
 
        if (early)
                return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
  */
 void __init default_get_smp_config(unsigned int early)
 {
-       struct mpf_intel *mpf = mpf_found;
+       struct mpf_intel *mpf;
 
        if (!smp_found_config)
                return;
 
-       if (!mpf)
+       if (!mpf_base)
                return;
 
        if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
        if (acpi_lapic && acpi_ioapic)
                return;
 
+       mpf = early_memremap(mpf_base, sizeof(*mpf));
+       if (!mpf) {
+               pr_err("MPTABLE: error mapping MP table\n");
+               return;
+       }
+
        pr_info("Intel MultiProcessor Specification v1.%d\n",
                mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
        /*
         * Now see if we need to read further.
         */
-       if (mpf->feature1 != 0) {
+       if (mpf->feature1) {
                if (early) {
                        /*
                         * local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
                construct_default_ISA_mptable(mpf->feature1);
 
        } else if (mpf->physptr) {
-               if (check_physptr(mpf, early))
+               if (check_physptr(mpf, early)) {
+                       early_memunmap(mpf, sizeof(*mpf));
                        return;
+               }
        } else
                BUG();
 
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
        /*
         * Only use the first configuration found.
         */
+
+       early_memunmap(mpf, sizeof(*mpf));
 }
 
 static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
 
 static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
-       unsigned int *bp = phys_to_virt(base);
+       unsigned int *bp;
        struct mpf_intel *mpf;
-       unsigned long mem;
+       int ret = 0;
 
        apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
                    base, base + length - 1);
        BUILD_BUG_ON(sizeof(*mpf) != 16);
 
        while (length > 0) {
+               bp = early_memremap(base, length);
                mpf = (struct mpf_intel *)bp;
                if ((*bp == SMP_MAGIC_IDENT) &&
                    (mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #ifdef CONFIG_X86_LOCAL_APIC
                        smp_found_config = 1;
 #endif
-                       mpf_found = mpf;
+                       mpf_base = base;
 
-                       pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-                               (unsigned long long) virt_to_phys(mpf),
-                               (unsigned long long) virt_to_phys(mpf) +
-                               sizeof(*mpf) - 1, mpf);
+                       pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+                               base, base + sizeof(*mpf) - 1, mpf);
 
-                       mem = virt_to_phys(mpf);
-                       memblock_reserve(mem, sizeof(*mpf));
+                       memblock_reserve(base, sizeof(*mpf));
                        if (mpf->physptr)
                                smp_reserve_memory(mpf);
 
-                       return 1;
+                       ret = 1;
                }
-               bp += 4;
+               early_memunmap(bp, length);
+
+               if (ret)
+                       break;
+
+               base += 16;
                length -= 16;
        }
-       return 0;
+       return ret;
 }
 
 void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
        char oem[10];
        struct mpf_intel *mpf;
        struct mpc_table *mpc, *mpc_new;
+       unsigned long size;
 
        if (!enable_update_mptable)
                return 0;
 
-       mpf = mpf_found;
-       if (!mpf)
+       if (!mpf_base)
                return 0;
 
+       mpf = early_memremap(mpf_base, sizeof(*mpf));
+       if (!mpf) {
+               pr_err("MPTABLE: mpf early_memremap() failed\n");
+               return 0;
+       }
+
        /*
         * Now see if we need to go further.
         */
-       if (mpf->feature1 != 0)
-               return 0;
+       if (mpf->feature1)
+               goto do_unmap_mpf;
 
        if (!mpf->physptr)
-               return 0;
+               goto do_unmap_mpf;
 
-       mpc = phys_to_virt(mpf->physptr);
+       size = get_mpc_size(mpf->physptr);
+       mpc = early_memremap(mpf->physptr, size);
+       if (!mpc) {
+               pr_err("MPTABLE: mpc early_memremap() failed\n");
+               goto do_unmap_mpf;
+       }
 
        if (!smp_check_mpc(mpc, oem, str))
-               return 0;
+               goto do_unmap_mpc;
 
-       pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+       pr_info("mpf: %llx\n", (u64)mpf_base);
        pr_info("physptr: %x\n", mpf->physptr);
 
        if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
                new = mpf_checksum((unsigned char *)mpc, mpc->length);
                if (old == new) {
                        pr_info("mpc is readonly, please try alloc_mptable instead\n");
-                       return 0;
+                       goto do_unmap_mpc;
                }
                pr_info("use in-position replacing\n");
        } else {
+               mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+               if (!mpc_new) {
+                       pr_err("MPTABLE: new mpc early_memremap() failed\n");
+                       goto do_unmap_mpc;
+               }
                mpf->physptr = mpc_new_phys;
-               mpc_new = phys_to_virt(mpc_new_phys);
                memcpy(mpc_new, mpc, mpc->length);
+               early_memunmap(mpc, size);
                mpc = mpc_new;
+               size = mpc_new_length;
                /* check if we can modify that */
                if (mpc_new_phys - mpf->physptr) {
                        struct mpf_intel *mpf_new;
                        /* steal 16 bytes from [0, 1k) */
+                       mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+                       if (!mpf_new) {
+                               pr_err("MPTABLE: new mpf early_memremap() failed\n");
+                               goto do_unmap_mpc;
+                       }
                        pr_info("mpf new: %x\n", 0x400 - 16);
-                       mpf_new = phys_to_virt(0x400 - 16);
                        memcpy(mpf_new, mpf, 16);
+                       early_memunmap(mpf, sizeof(*mpf));
                        mpf = mpf_new;
                        mpf->physptr = mpc_new_phys;
                }
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
         */
        replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
 
+do_unmap_mpc:
+       early_memunmap(mpc, size);
+
+do_unmap_mpf:
+       early_memunmap(mpf, sizeof(*mpf));
+
        return 0;
 }
 
index 5e16d3f2959468f6d05aaa1be5eef2f8b945abb1..0accc2404b9214d1b318577668f103bc41bc8d35 100644 (file)
@@ -93,9 +93,12 @@ again:
        if (gfpflags_allow_blocking(flag)) {
                page = dma_alloc_from_contiguous(dev, count, get_order(size),
                                                 flag);
-               if (page && page_to_phys(page) + size > dma_mask) {
-                       dma_release_from_contiguous(dev, page, count);
-                       page = NULL;
+               if (page) {
+                       addr = phys_to_dma(dev, page_to_phys(page));
+                       if (addr + size > dma_mask) {
+                               dma_release_from_contiguous(dev, page, count);
+                               page = NULL;
+                       }
                }
        }
        /* fallback */
@@ -104,7 +107,7 @@ again:
        if (!page)
                return NULL;
 
-       addr = page_to_phys(page);
+       addr = phys_to_dma(dev, page_to_phys(page));
        if (addr + size > dma_mask) {
                __free_pages(page, get_order(size));
 
index a6d404087fe3285f65de353613153abde4a698a0..4fc3cb60ea11a546b08ae8c2f3fdbb9007dee670 100644 (file)
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
                                 enum dma_data_direction dir,
                                 unsigned long attrs)
 {
-       dma_addr_t bus = page_to_phys(page) + offset;
+       dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
        WARN_ON(size == 0);
        if (!check_addr("map_single", dev, bus, size))
                return NOMMU_MAPPING_ERROR;
index 1e23577e17cf10f87d584e4cdc38f4691db57862..677077510e308ebfabb56d8072a2afed439f23c8 100644 (file)
@@ -6,12 +6,14 @@
 #include <linux/swiotlb.h>
 #include <linux/bootmem.h>
 #include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 #include <asm/xen/swiotlb-xen.h>
 #include <asm/iommu_table.h>
+
 int swiotlb __read_mostly;
 
 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
                  pci_swiotlb_late_init);
 
 /*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
  */
 int __init pci_swiotlb_detect_4gb(void)
 {
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
        if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
                swiotlb = 1;
 #endif
+
+       /*
+        * If SME is active then swiotlb will be set to 1 so that bounce
+        * buffers are allocated and used for devices that do not support
+        * the addressing range required for the encryption mask.
+        */
+       if (sme_active())
+               swiotlb = 1;
+
        return swiotlb;
 }
 IOMMU_INIT(pci_swiotlb_detect_4gb,
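
Forcing swiotlb on when SME is active guarantees a decrypted bounce pool exists, so drivers keep using the ordinary streaming DMA API unchanged and the bounce happens underneath them for devices that cannot address the encryption bit. Sketch of the unchanged driver side (illustrative; the example_ name is made up):

        static int example_send_buffer(struct device *dev, void *buf, size_t len)
        {
                dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

                if (dma_mapping_error(dev, handle))
                        return -ENOMEM;

                /* ... hand 'handle' to the device and wait for completion ... */

                dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);

                return 0;
        }
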
index 3ca198080ea9294486ae9a1121e7815dfba7cb19..bd6b85fac66696da70e316656ad6f0d51291f8aa 100644 (file)
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
        return ret;
 }
 #endif
+
 void stop_this_cpu(void *dummy)
 {
        local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
-       for (;;)
-               halt();
+       for (;;) {
+               /*
+                * Use wbinvd followed by hlt to stop the processor. This
+                * provides support for kexec on a processor that supports
+                * SME. With kexec, going from SME inactive to SME active
+                * requires clearing cache entries so that addresses without
+                * the encryption bit set don't corrupt the same physical
+                * address that has the encryption bit set when caches are
+                * flushed. To achieve this a wbinvd is performed followed by
+                * a hlt. Even if the processor is not in the kexec/SME
+                * scenario this only adds a wbinvd to a halting processor.
+                */
+               asm volatile("wbinvd; hlt" : : : "memory");
+       }
 }
 
 /*
index 98111b38ebfd6eb9949242c5aae7b18bbbdb4489..307d3bac5f04ece485ac1fe42226ee111c0c6e85 100644 (file)
@@ -47,6 +47,7 @@ relocate_kernel:
         * %rsi page_list
         * %rdx start address
         * %rcx preserve_context
+        * %r8  sme_active
         */
 
        /* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
        pushq $0
        popfq
 
+       /* Save SME active flag */
+       movq    %r8, %r12
+
        /*
         * get physical address of control page now
         * this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
        /* Flush the TLB (needed?) */
        movq    %r9, %cr3
 
+       /*
+        * If SME is active, there could be old encrypted cache line
+        * entries that will conflict with the now unencrypted memory
+        * used by kexec. Flush the caches before copying the kernel.
+        */
+       testq   %r12, %r12
+       jz 1f
+       wbinvd
+1:
+
        movq    %rcx, %r11
        call    swap_pages
 
index 3486d04988000b05344a590ce9ab8c86e96d0ec2..0bfe0c1628f638a0ae7b1ab69e9414485fbae147 100644 (file)
@@ -69,6 +69,7 @@
 #include <linux/crash_dump.h>
 #include <linux/tboot.h>
 #include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
 
 #include <linux/usb/xhci-dbgp.h>
 #include <video/edid.h>
@@ -374,6 +375,14 @@ static void __init reserve_initrd(void)
            !ramdisk_image || !ramdisk_size)
                return;         /* No initrd provided by bootloader */
 
+       /*
+        * If SME is active, this memory will be marked encrypted by the
+        * kernel when it is accessed (including relocation). However, the
+        * ramdisk image was loaded decrypted by the bootloader, so make
+        * sure that it is encrypted before accessing it.
+        */
+       sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
        initrd_start = 0;
 
        mapped_size = memblock_mem_size(max_pfn_mapped);
index 213ddf3e937d800577514a3677f742813298555e..73e4d28112f8a14a1d741ab664b3f76ea0bf6479 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
+#include <asm/mpx.h>
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
        return error;
 }
 
-static void find_start_end(unsigned long flags, unsigned long *begin,
-                          unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+               unsigned long *begin, unsigned long *end)
 {
        if (!in_compat_syscall() && (flags & MAP_32BIT)) {
                /* This is usually used needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
        }
 
        *begin  = get_mmap_base(1);
-       *end    = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+       if (in_compat_syscall())
+               *end = task_size_32bit();
+       else
+               *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
 }
 
 unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
        struct vm_unmapped_area_info info;
        unsigned long begin, end;
 
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        if (flags & MAP_FIXED)
                return addr;
 
-       find_start_end(flags, &begin, &end);
+       find_start_end(addr, flags, &begin, &end);
 
        if (len > end)
                return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        unsigned long addr = addr0;
        struct vm_unmapped_area_info info;
 
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = get_mmap_base(0);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        *
+        * !in_compat_syscall() check to avoid high addresses for x32.
+        */
+       if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+               info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
        info.align_mask = 0;
        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
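
The hint check above is the user-visible contract for 5-level paging: a process stays inside the 47-bit DEFAULT_MAP_WINDOW unless it passes an mmap() hint above it, in which case the search may use the full address space. A user-space sketch of opting in (the hint value is illustrative):

        #include <stdio.h>
        #include <sys/mman.h>

        int main(void)
        {
                void *hint = (void *)(1UL << 47);       /* above DEFAULT_MAP_WINDOW */
                void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED)
                        return 1;

                /* On a 5-level-paging kernel this may now sit above 47 bits */
                printf("mapped at %p\n", p);
                munmap(p, 4096);

                return 0;
        }
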
index 9b1dd114956a8bcb9e724bd4df792460f68a0943..ccb70b8d16ccd24545f0d71c92c3c28f5f54992e 100644 (file)
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
 
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
 #define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 #define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
                                            * PT32_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
-                       | shadow_x_mask | shadow_nx_mask)
+                       | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
 
 /*
  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
  */
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask)
+               u64 acc_track_mask, u64 me_mask)
 {
        BUG_ON(!dirty_mask != !accessed_mask);
        BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
        shadow_x_mask = x_mask;
        shadow_present_mask = p_mask;
        shadow_acc_track_mask = acc_track_mask;
+       shadow_me_mask = me_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
        BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
        spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask;
+              shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (sp_ad_disabled(sp))
                spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                pte_access &= ~ACC_WRITE_MASK;
 
        spte |= (u64)pfn << PAGE_SHIFT;
+       spte |= shadow_me_mask;
 
        if (pte_access & ACC_WRITE_MASK) {
 
index d7d248a000dd6772681f3f5541e344f9677a2d1d..3cc725590ab9fd848c6f228e7c152d12e24a1569 100644 (file)
@@ -48,7 +48,7 @@
 
 static inline u64 rsvd_bits(int s, int e)
 {
-       return ((1ULL << (e - s + 1)) - 1) << s;
+       return __sme_clr(((1ULL << (e - s + 1)) - 1) << s);
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
index 56ba05312759d3ed4568e546492d9d8bfad05b71..099ff08b4aff9c7ca4a1ea0a3a14a02c3891f506 100644 (file)
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
-       phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
-       phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
-       phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+       phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+       phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+       phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
 
        vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
        vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
                set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       control->iopm_base_pa = iopm_base;
-       control->msrpm_base_pa = __pa(svm->msrpm);
+       control->iopm_base_pa = __sme_set(iopm_base);
+       control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
        control->int_ctl = V_INTR_MASKING_MASK;
 
        init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        new_entry = READ_ONCE(*entry);
-       new_entry = (page_to_phys(svm->avic_backing_page) &
-                    AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-                    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+       new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+                             AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+                             AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
        WRITE_ONCE(*entry, new_entry);
 
        svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
        svm->vmcb = page_address(page);
        clear_page(svm->vmcb);
-       svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+       svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
        svm->asid_generation = 0;
        init_vmcb(svm);
 
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
        __free_page(virt_to_page(svm->nested.hsave));
        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -2335,7 +2335,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
        u64 pdpte;
        int ret;
 
-       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
                                       offset_in_page(cr3) + index * 8, 8);
        if (ret)
                return 0;
@@ -2347,7 +2347,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.nested_cr3 = root;
+       svm->vmcb->control.nested_cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_NPT);
        svm_flush_tlb(vcpu);
 }
@@ -2878,7 +2878,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
                svm->nested.msrpm[p] = svm->msrpm[p] | value;
        }
 
-       svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+       svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
 
        return true;
 }
@@ -4511,7 +4511,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
        pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
                 irq.vector);
        *svm = to_svm(vcpu);
-       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
        vcpu_info->vector = irq.vector;
 
        return 0;
@@ -4562,7 +4562,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                        struct amd_iommu_pi_data pi;
 
                        /* Try to enable guest_mode in IRTE */
-                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+                                           AVIC_HPA_MASK);
                        pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
                                                     svm->vcpu.vcpu_id);
                        pi.is_guest_mode = true;
@@ -5011,7 +5012,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->save.cr3 = root;
+       svm->vmcb->save.cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_CR);
        svm_flush_tlb(vcpu);
 }
@@ -5020,7 +5021,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.nested_cr3 = root;
+       svm->vmcb->control.nested_cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_NPT);
 
        /* Also sync guest cr3 here in case we live migrate */
index 9b21b12230354e334900e6536b7612285f75b7e3..416d5ed320b605ef361b96db01c53ba6d09bff7a 100644 (file)
@@ -6563,7 +6563,7 @@ void vmx_enable_tdp(void)
                enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
                0ull, VMX_EPT_EXECUTABLE_MASK,
                cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK);
+               VMX_EPT_RWX_MASK, 0ull);
 
        ept_set_mmio_spte_mask();
        kvm_enable_tdp();
index d734aa8c5b4f7290e365badd00ea962fd0af9acd..eda4bdbd7e5e1cb722de3c2c00fd6ed2323aa8a0 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
 
@@ -6116,7 +6117,7 @@ int kvm_arch_init(void *opaque)
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0);
+                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
index 5cc78bf572325fb1c5b5d6f854bfe878c55dfeb3..3261abb21ef4f5e0d9c9239cc86651ee96b0a74a 100644 (file)
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
        return 0;       /* Buffer overrun */
 }
 
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of whether it was
+ * truncated to fit in the buffer), or -1 if the option was not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+                     const char *option, char *buffer, int bufsize)
+{
+       char c;
+       int pos = 0, len = -1;
+       const char *opptr = NULL;
+       char *bufptr = buffer;
+       enum {
+               st_wordstart = 0,       /* Start of word/after whitespace */
+               st_wordcmp,     /* Comparing this word */
+               st_wordskip,    /* Miscompare, skip */
+               st_bufcpy,      /* Copying this to buffer */
+       } state = st_wordstart;
+
+       if (!cmdline)
+               return -1;      /* No command line */
+
+       /*
+        * This 'pos' check ensures we do not overrun
+        * a non-NULL-terminated 'cmdline'
+        */
+       while (pos++ < max_cmdline_size) {
+               c = *(char *)cmdline++;
+               if (!c)
+                       break;
+
+               switch (state) {
+               case st_wordstart:
+                       if (myisspace(c))
+                               break;
+
+                       state = st_wordcmp;
+                       opptr = option;
+                       /* fall through */
+
+               case st_wordcmp:
+                       if ((c == '=') && !*opptr) {
+                               /*
+                                * We matched all the way to the end of the
+                                * option we were looking for, prepare to
+                                * copy the argument.
+                                */
+                               len = 0;
+                               bufptr = buffer;
+                               state = st_bufcpy;
+                               break;
+                       } else if (c == *opptr++) {
+                               /*
+                                * We are currently matching, so continue
+                                * to the next character on the cmdline.
+                                */
+                               break;
+                       }
+                       state = st_wordskip;
+                       /* fall through */
+
+               case st_wordskip:
+                       if (myisspace(c))
+                               state = st_wordstart;
+                       break;
+
+               case st_bufcpy:
+                       if (myisspace(c)) {
+                               state = st_wordstart;
+                       } else {
+                               /*
+                                * Increment len, but don't overrun the
+                                * supplied buffer and leave room for the
+                                * NULL terminator.
+                                */
+                               if (++len < bufsize)
+                                       *bufptr++ = c;
+                       }
+                       break;
+               }
+       }
+
+       if (bufsize)
+               *bufptr = '\0';
+
+       return len;
+}
+
 int cmdline_find_option_bool(const char *cmdline, const char *option)
 {
        return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
 }
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+                       int bufsize)
+{
+       return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+                                    buffer, bufsize);
+}
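A minimal usage sketch for the new helper (the real caller is sme_enable() in arch/x86/mm/mem_encrypt.c below; the buffer size here is only illustrative):

	char buffer[16];
	int len;

	/* Look for mem_encrypt=<value> on the kernel command line */
	len = cmdline_find_option(boot_command_line, "mem_encrypt",
				  buffer, sizeof(buffer));
	if (len > 0 && !strncmp(buffer, "on", sizeof(buffer)))
		/* the option was present and set to "on" */;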
index 0fbdcb64f9f836c0556ca0abcca4c673aea143f0..72bf8c01c6e3a58254cc915aded88eea8146f41a 100644 (file)
@@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX)   += mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt_boot.o
index 0470826d2bdca2b04bbcba902ae42f8aed728cb5..5e3ac6fe6c9e32ed1906f4f9bf736310a7193c7d 100644 (file)
  */
 
 #include <linux/debugfs.h>
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 
-#include <asm/kasan.h>
 #include <asm/pgtable.h>
 
 /*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 {
        pgprotval_t pr = pgprot_val(prot);
        static const char * const level_name[] =
-               { "cr3", "pgd", "pud", "pmd", "pte" };
+               { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
        if (!pgprot_val(prot)) {
                /* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
                        pt_dump_cont_printf(m, dmsg, "    ");
 
                /* Bit 7 has a different meaning on level 3 vs 4 */
-               if (level <= 3 && pr & _PAGE_PSE)
+               if (level <= 4 && pr & _PAGE_PSE)
                        pt_dump_cont_printf(m, dmsg, "PSE ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
-               if ((level == 4 && pr & _PAGE_PAT) ||
-                   ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+               if ((level == 5 && pr & _PAGE_PAT) ||
+                   ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
                        pt_dump_cont_printf(m, dmsg, "PAT ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
  */
 static unsigned long normalize_addr(unsigned long u)
 {
-#ifdef CONFIG_X86_64
-       return (signed long)(u << 16) >> 16;
-#else
-       return u;
-#endif
+       int shift;
+       if (!IS_ENABLED(CONFIG_X86_64))
+               return u;
+
+       shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+       return (signed long)(u << shift) >> shift;
 }
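As a worked example of the sign extension above: with 4-level paging __VIRTUAL_MASK_SHIFT is 47, so shift is 16 and an index that decodes to 0x0000800000000000 is extended to the canonical address 0xffff800000000000; with 5-level paging (__VIRTUAL_MASK_SHIFT == 56) the shift becomes 7 and the same expression covers the wider address space.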
 
 /*
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
        for (i = 0; i < PTRS_PER_PTE; i++) {
                prot = pte_flags(*start);
                st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
-               note_page(m, st, __pgprot(prot), 4);
+               note_page(m, st, __pgprot(prot), 5);
                start++;
        }
 }
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for the KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page, we can call note_page()
+ * right away without walking through the lower-level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+                               void *pt)
+{
+       if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+           __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+           __pa(pt) == __pa(kasan_zero_pud)) {
+               pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+               note_page(m, st, __pgprot(prot), 5);
+               return true;
+       }
+       return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+                               void *pt)
+{
+       return false;
+}
+#endif
 
 #if PTRS_PER_PMD > 1
 
 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
        int i;
-       pmd_t *start;
+       pmd_t *start, *pmd_start;
        pgprotval_t prot;
 
-       start = (pmd_t *)pud_page_vaddr(addr);
+       pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
        for (i = 0; i < PTRS_PER_PMD; i++) {
                st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
                if (!pmd_none(*start)) {
                        if (pmd_large(*start) || !pmd_present(*start)) {
                                prot = pmd_flags(*start);
-                               note_page(m, st, __pgprot(prot), 3);
-                       } else {
+                               note_page(m, st, __pgprot(prot), 4);
+                       } else if (!kasan_page_table(m, st, pmd_start)) {
                                walk_pte_level(m, st, *start,
                                               P + i * PMD_LEVEL_MULT);
                        }
                } else
-                       note_page(m, st, __pgprot(0), 3);
+                       note_page(m, st, __pgprot(0), 4);
                start++;
        }
 }
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 
 #if PTRS_PER_PUD > 1
 
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
-       return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
        int i;
-       pud_t *start;
+       pud_t *start, *pud_start;
        pgprotval_t prot;
        pud_t *prev_pud = NULL;
 
-       start = (pud_t *)p4d_page_vaddr(addr);
+       pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
        for (i = 0; i < PTRS_PER_PUD; i++) {
                st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
-               if (!pud_none(*start) &&
-                   !pud_already_checked(prev_pud, start, st->check_wx)) {
+               if (!pud_none(*start)) {
                        if (pud_large(*start) || !pud_present(*start)) {
                                prot = pud_flags(*start);
-                               note_page(m, st, __pgprot(prot), 2);
-                       } else {
+                               note_page(m, st, __pgprot(prot), 3);
+                       } else if (!kasan_page_table(m, st, pud_start)) {
                                walk_pmd_level(m, st, *start,
                                               P + i * PUD_LEVEL_MULT);
                        }
                } else
-                       note_page(m, st, __pgprot(0), 2);
+                       note_page(m, st, __pgprot(0), 3);
 
                prev_pud = start;
                start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
 {
        int i;
-       p4d_t *start;
+       p4d_t *start, *p4d_start;
        pgprotval_t prot;
 
-       start = (p4d_t *)pgd_page_vaddr(addr);
+       p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
 
        for (i = 0; i < PTRS_PER_P4D; i++) {
                st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
                        if (p4d_large(*start) || !p4d_present(*start)) {
                                prot = p4d_flags(*start);
                                note_page(m, st, __pgprot(prot), 2);
-                       } else {
+                       } else if (!kasan_page_table(m, st, p4d_start)) {
                                walk_pud_level(m, st, *start,
                                               P + i * P4D_LEVEL_MULT);
                        }
index 2824607df1081fe38a96d5e8af70e7c295e43ed5..6d06cf33e3de54ab6daa49910458c2cbb79c848b 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/elf.h>
+#include <asm/mpx.h>
 
 #if 0  /* This is just for testing */
 struct page *
@@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
        info.flags = 0;
        info.length = len;
        info.low_limit = get_mmap_base(1);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        */
        info.high_limit = in_compat_syscall() ?
-               tasksize_32bit() : tasksize_64bit();
+               task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-               unsigned long addr0, unsigned long len,
+               unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
        struct vm_unmapped_area_info info;
-       unsigned long addr;
 
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = get_mmap_base(0);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        */
+       if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+               info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);
@@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
-               info.high_limit = TASK_SIZE;
+               info.high_limit = TASK_SIZE_LOW;
                addr = vm_unmapped_area(&info);
        }
 
@@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
        if (len & ~huge_page_mask(h))
                return -EINVAL;
+
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        if (len > TASK_SIZE)
                return -ENOMEM;
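For context, passing a hint address above DEFAULT_MAP_WINDOW is how a process opts into the full 56-bit address space on 5-level paging kernels. A hypothetical user-space caller might look like this (illustrative only):

	#include <sys/mman.h>

	static void *map_above_47bit(void)
	{
		void *hint = (void *)(1UL << 47);	/* above DEFAULT_MAP_WINDOW */

		return mmap(hint, 2UL << 20, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	}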
 
index adab1595f4bd89ba0729db70dd21d619d93e829d..31cea988fa36c5571687d1e18e2d5c914271b0f8 100644 (file)
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
                if (!pmd)
                        return -ENOMEM;
                ident_pmd_init(info, pmd, addr, next);
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+               set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
        }
 
        return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
                if (!pud)
                        return -ENOMEM;
                ident_pud_init(info, pud, addr, next);
-               set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+               set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
        }
 
        return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
        unsigned long next;
        int result;
 
+       /* Set the default pagetable flags if not supplied */
+       if (!info->kernpg_flag)
+               info->kernpg_flag = _KERNPG_TABLE;
+
        for (; addr < end; addr = next) {
                pgd_t *pgd = pgd_page + pgd_index(addr);
                p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
                if (result)
                        return result;
                if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-                       set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+                       set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
                } else {
                        /*
                         * With p4d folded, pgd is equal to p4d.
                         * The pgd entry has to point to the pud page table in this case.
                         */
                        pud_t *pud = pud_offset(p4d, 0);
-                       set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+                       set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
                }
        }
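With the new kernpg_flag field a caller can build identity mappings whose table entries omit the encryption bit. A sketch of such a caller, assuming the _KERNPG_TABLE_NOENC definition added elsewhere in this series and a caller-provided page allocator:

	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,		/* caller-provided allocator */
		.context	= ctx,				/* cookie passed to the allocator */
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,		/* table entries without _PAGE_ENC */
	};

	ret = kernel_ident_mapping_init(&info, pgd, start, end);

Callers that leave kernpg_flag at zero keep getting _KERNPG_TABLE, so existing users are unaffected.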
 
index bf3f1065d6addb88b898ba3a86089cccff6ed15e..7777ccc0e9f979dc76cc9d520885eea02114223e 100644 (file)
@@ -815,7 +815,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
        .loaded_mm = &init_mm,
-       .state = 0,
+       .next_asid = 1,
        .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
index 4c1b5fd0c7ad5512a231f556de46df459b97a058..34f0e1847dd64bc82a10679b9896c3d8886aa330 100644 (file)
@@ -13,6 +13,8 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -21,6 +23,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
+#include <asm/setup.h>
 
 #include "physaddr.h"
 
@@ -105,12 +108,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
                return NULL;
        }
 
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (is_ISA_range(phys_addr, last_addr))
-               return (__force void __iomem *)phys_to_virt(phys_addr);
-
        /*
         * Don't allow anybody to remap normal RAM that we're using..
         */
@@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr)
                return;
 
        /*
-        * __ioremap special-cases the PCI/ISA range by not instantiating a
-        * vm_area and by simply returning an address into the kernel mapping
-        * of ISA space.   So handle that here.
+        * The PCI/ISA range special-casing was removed from __ioremap()
+        * so this check, in theory, can be removed. However, there are
+        * cases where iounmap() is called for addresses not obtained via
+        * ioremap() (vga16fb for example). Add a warning so that these
+        * cases can be caught and fixed.
         */
        if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
-           (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+           (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+               WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
                return;
+       }
 
        addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
        unsigned long offset = phys & ~PAGE_MASK;
        void *vaddr;
 
-       /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
-       if (page_is_ram(start >> PAGE_SHIFT))
-               return __va(phys);
+       /* memremap() maps if RAM, otherwise falls back to ioremap() */
+       vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
 
-       vaddr = ioremap_cache(start, PAGE_SIZE);
-       /* Only add the offset on success and return NULL if the ioremap() failed: */
+       /* Only add the offset on success and return NULL if memremap() failed */
        if (vaddr)
                vaddr += offset;
 
@@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 {
-       if (page_is_ram(phys >> PAGE_SHIFT))
-               return;
+       memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted.  If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+                                         unsigned long size)
+{
+       int is_pmem;
+
+       /*
+        * Check if the address is part of a persistent memory region.
+        * This check covers areas added by E820, EFI and ACPI.
+        */
+       is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+                                   IORES_DESC_PERSISTENT_MEMORY);
+       if (is_pmem != REGION_DISJOINT)
+               return true;
+
+       /*
+        * Check if the non-volatile attribute is set for an EFI
+        * reserved area.
+        */
+       if (efi_enabled(EFI_BOOT)) {
+               switch (efi_mem_type(phys_addr)) {
+               case EFI_RESERVED_TYPE:
+                       if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+                               return true;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* Check if the address is outside kernel usable area */
+       switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+       case E820_TYPE_RESERVED:
+       case E820_TYPE_ACPI:
+       case E820_TYPE_NVS:
+       case E820_TYPE_UNUSABLE:
+       case E820_TYPE_PRAM:
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+                                unsigned long size)
+{
+       u64 paddr;
+
+       /* Check if the address is part of EFI boot/runtime data */
+       if (!efi_enabled(EFI_BOOT))
+               return false;
+
+       paddr = boot_params.efi_info.efi_memmap_hi;
+       paddr <<= 32;
+       paddr |= boot_params.efi_info.efi_memmap;
+       if (phys_addr == paddr)
+               return true;
+
+       paddr = boot_params.efi_info.efi_systab_hi;
+       paddr <<= 32;
+       paddr |= boot_params.efi_info.efi_systab;
+       if (phys_addr == paddr)
+               return true;
+
+       if (efi_is_table_address(phys_addr))
+               return true;
+
+       switch (efi_mem_type(phys_addr)) {
+       case EFI_BOOT_SERVICES_DATA:
+       case EFI_RUNTIME_SERVICES_DATA:
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+                                  unsigned long size)
+{
+       struct setup_data *data;
+       u64 paddr, paddr_next;
+
+       paddr = boot_params.hdr.setup_data;
+       while (paddr) {
+               unsigned int len;
+
+               if (phys_addr == paddr)
+                       return true;
+
+               data = memremap(paddr, sizeof(*data),
+                               MEMREMAP_WB | MEMREMAP_DEC);
+
+               paddr_next = data->next;
+               len = data->len;
+
+               memunmap(data);
+
+               if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+                       return true;
+
+               paddr = paddr_next;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+                                               unsigned long size)
+{
+       struct setup_data *data;
+       u64 paddr, paddr_next;
+
+       paddr = boot_params.hdr.setup_data;
+       while (paddr) {
+               unsigned int len;
+
+               if (phys_addr == paddr)
+                       return true;
+
+               data = early_memremap_decrypted(paddr, sizeof(*data));
+
+               paddr_next = data->next;
+               len = data->len;
+
+               early_memunmap(data, sizeof(*data));
+
+               if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+                       return true;
+
+               paddr = paddr_next;
+       }
+
+       return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+                                unsigned long flags)
+{
+       if (!sme_active())
+               return true;
+
+       if (flags & MEMREMAP_ENC)
+               return true;
+
+       if (flags & MEMREMAP_DEC)
+               return false;
+
+       if (memremap_is_setup_data(phys_addr, size) ||
+           memremap_is_efi_data(phys_addr, size) ||
+           memremap_should_map_decrypted(phys_addr, size))
+               return false;
+
+       return true;
+}
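Callers that know better than these heuristics can force the mapping type with the MEMREMAP_ENC/MEMREMAP_DEC flags, e.g. (sketch):

	/* Force a decrypted mapping regardless of the checks above */
	void *vaddr = memremap(phys_addr, size, MEMREMAP_WB | MEMREMAP_DEC);

	if (vaddr) {
		/* ... access the unencrypted data ... */
		memunmap(vaddr);
	}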
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+                                            unsigned long size,
+                                            pgprot_t prot)
+{
+       if (!sme_active())
+               return prot;
+
+       if (early_memremap_is_setup_data(phys_addr, size) ||
+           memremap_is_efi_data(phys_addr, size) ||
+           memremap_should_map_decrypted(phys_addr, size))
+               prot = pgprot_decrypted(prot);
+       else
+               prot = pgprot_encrypted(prot);
+
+       return prot;
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+       return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+                                     unsigned long size)
+{
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protection - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size)
+{
+       /* Be sure the write-protect PAT entry is set for write-protect */
+       if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+               return NULL;
+
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+                                     unsigned long size)
+{
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and with write-protection - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size)
+{
+       /* Be sure the write-protect PAT entry is set for write-protect */
+       if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+               return NULL;
 
-       iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
 }
+#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
index 02c9d75534091a0cf06b78716a990c41847cb6e4..bc84b73684b7e134a910b193c0103942e2cd0a19 100644 (file)
@@ -11,8 +11,8 @@
 #include <asm/e820/types.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/pgtable.h>
 
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static int __init map_range(struct range *range)
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
 void __init kasan_early_init(void)
 {
        int i;
-       pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+       pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
        pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
        pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
        p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
         */
        memset(kasan_zero_page, 0, PAGE_SIZE);
        for (i = 0; i < PTRS_PER_PTE; i++) {
-               pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+               pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
                set_pte(&kasan_zero_pte[i], pte);
        }
        /* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644 (file)
index 0000000..0fbd092
--- /dev/null
@@ -0,0 +1,593 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[]  __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+unsigned long sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL_GPL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+                                      unsigned long size, bool enc)
+{
+       void *src, *dst;
+       size_t len;
+
+       if (!sme_me_mask)
+               return;
+
+       local_flush_tlb();
+       wbinvd();
+
+       /*
+        * There are a limited number of early mapping slots, so map (at most)
+        * one page at a time.
+        */
+       while (size) {
+               len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+               /*
+                * Create mappings for the current and desired format of
+                * the memory. Use a write-protected mapping for the source.
+                */
+               src = enc ? early_memremap_decrypted_wp(paddr, len) :
+                           early_memremap_encrypted_wp(paddr, len);
+
+               dst = enc ? early_memremap_encrypted(paddr, len) :
+                           early_memremap_decrypted(paddr, len);
+
+               /*
+                * If a mapping can't be obtained to perform the operation,
+                * then eventual access of that area in the desired mode
+                * will cause a crash.
+                */
+               BUG_ON(!src || !dst);
+
+               /*
+                * Use a temporary buffer, of cache-line multiple size, to
+                * avoid data corruption as documented in the APM.
+                */
+               memcpy(sme_early_buffer, src, len);
+               memcpy(dst, sme_early_buffer, len);
+
+               early_memunmap(dst, len);
+               early_memunmap(src, len);
+
+               paddr += len;
+               size -= len;
+       }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+       __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+       __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+                                            bool map)
+{
+       unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+       pmdval_t pmd_flags, pmd;
+
+       /* Use early_pmd_flags but remove the encryption mask */
+       pmd_flags = __sme_clr(early_pmd_flags);
+
+       do {
+               pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+               __early_make_pgtable((unsigned long)vaddr, pmd);
+
+               vaddr += PMD_SIZE;
+               paddr += PMD_SIZE;
+               size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+       } while (size);
+
+       __native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+       struct boot_params *boot_data;
+       unsigned long cmdline_paddr;
+
+       if (!sme_active())
+               return;
+
+       /* Get the command line address before unmapping the real_mode_data */
+       boot_data = (struct boot_params *)real_mode_data;
+       cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+       __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+       if (!cmdline_paddr)
+               return;
+
+       __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+       struct boot_params *boot_data;
+       unsigned long cmdline_paddr;
+
+       if (!sme_active())
+               return;
+
+       __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+       /* Get the command line address after mapping the real_mode_data */
+       boot_data = (struct boot_params *)real_mode_data;
+       cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+       if (!cmdline_paddr)
+               return;
+
+       __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+       unsigned int i;
+
+       if (!sme_me_mask)
+               return;
+
+       early_pmd_flags = __sme_set(early_pmd_flags);
+
+       __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+       /* Update the protection map with memory encryption mask */
+       for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+               protection_map[i] = pgprot_encrypted(protection_map[i]);
+}
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+       if (!sme_me_mask)
+               return;
+
+       /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+       swiotlb_update_mem_attributes();
+
+       pr_info("AMD Secure Memory Encryption (SME) active\n");
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+       WARN(PAGE_ALIGN(size) != size,
+            "size is not page-aligned (%#lx)\n", size);
+
+       /* Make the SWIOTLB buffer area decrypted */
+       set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+                                unsigned long end)
+{
+       unsigned long pgd_start, pgd_end, pgd_size;
+       pgd_t *pgd_p;
+
+       pgd_start = start & PGDIR_MASK;
+       pgd_end = end & PGDIR_MASK;
+
+       pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+       pgd_size *= sizeof(pgd_t);
+
+       pgd_p = pgd_base + pgd_index(start);
+
+       memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS      _KERNPG_TABLE_NOENC
+#define P4D_FLAGS      _KERNPG_TABLE_NOENC
+#define PUD_FLAGS      _KERNPG_TABLE_NOENC
+#define PMD_FLAGS      (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+                                    unsigned long vaddr, pmdval_t pmd_val)
+{
+       pgd_t *pgd_p;
+       p4d_t *p4d_p;
+       pud_t *pud_p;
+       pmd_t *pmd_p;
+
+       pgd_p = pgd_base + pgd_index(vaddr);
+       if (native_pgd_val(*pgd_p)) {
+               if (IS_ENABLED(CONFIG_X86_5LEVEL))
+                       p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+               else
+                       pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+       } else {
+               pgd_t pgd;
+
+               if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+                       p4d_p = pgtable_area;
+                       memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+                       pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+                       pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+               } else {
+                       pud_p = pgtable_area;
+                       memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+                       pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+                       pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+               }
+               native_set_pgd(pgd_p, pgd);
+       }
+
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_p += p4d_index(vaddr);
+               if (native_p4d_val(*p4d_p)) {
+                       pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+               } else {
+                       p4d_t p4d;
+
+                       pud_p = pgtable_area;
+                       memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+                       pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+                       p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+                       native_set_p4d(p4d_p, p4d);
+               }
+       }
+
+       pud_p += pud_index(vaddr);
+       if (native_pud_val(*pud_p)) {
+               if (native_pud_val(*pud_p) & _PAGE_PSE)
+                       goto out;
+
+               pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+       } else {
+               pud_t pud;
+
+               pmd_p = pgtable_area;
+               memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+               pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+               pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+               native_set_pud(pud_p, pud);
+       }
+
+       pmd_p += pmd_index(vaddr);
+       if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+               native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+       return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+       unsigned long p4d_size, pud_size, pmd_size;
+       unsigned long total;
+
+       /*
+        * Perform a relatively simplistic calculation of the pagetable
+        * entries that are needed. The mappings will be covered by 2MB
+        * PMD entries, so we can conservatively calculate the required
+        * number of P4D, PUD and PMD structures needed to perform the
+        * mappings. Incrementing the count for each covers the case where
+        * the addresses cross entries.
+        */
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+               p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+               pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       } else {
+               p4d_size = 0;
+               pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       }
+       pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+       pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+       total = p4d_size + pud_size + pmd_size;
+
+       /*
+        * Now calculate the added pagetable structures needed to populate
+        * the new pagetables.
+        */
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+               p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+               pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       } else {
+               p4d_size = 0;
+               pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       }
+       pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+       pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+       total += p4d_size + pud_size + pmd_size;
+
+       return total;
+}
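As a rough worked example with 4-level paging and a mapping range below 1GB: the first pass yields two PUD pages and two PMD pages (16KB), the second pass adds one more of each (8KB), so sme_pgtable_calc() returns about 24KB, which sme_encrypt_kernel() then doubles to cover both the encrypted and the decrypted mapping.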
+
+void __init sme_encrypt_kernel(void)
+{
+       unsigned long workarea_start, workarea_end, workarea_len;
+       unsigned long execute_start, execute_end, execute_len;
+       unsigned long kernel_start, kernel_end, kernel_len;
+       unsigned long pgtable_area_len;
+       unsigned long paddr, pmd_flags;
+       unsigned long decrypted_base;
+       void *pgtable_area;
+       pgd_t *pgd;
+
+       if (!sme_active())
+               return;
+
+       /*
+        * Prepare for encrypting the kernel by building new pagetables with
+        * the necessary attributes needed to encrypt the kernel in place.
+        *
+        *   One range of virtual addresses will map the memory occupied
+        *   by the kernel as encrypted.
+        *
+        *   Another range of virtual addresses will map the memory occupied
+        *   by the kernel as decrypted and write-protected.
+        *
+        *     The use of the write-protect attribute will prevent any of the
+        *     memory from being cached.
+        */
+
+       /* Physical addresses give us the identity-mapped virtual addresses */
+       kernel_start = __pa_symbol(_text);
+       kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+       kernel_len = kernel_end - kernel_start;
+
+       /* Set the encryption workarea to be immediately after the kernel */
+       workarea_start = kernel_end;
+
+       /*
+        * Calculate the number of workarea bytes needed:
+        *   executable encryption area size:
+        *     stack page (PAGE_SIZE)
+        *     encryption routine page (PAGE_SIZE)
+        *     intermediate copy buffer (PMD_PAGE_SIZE)
+        *   pagetable structures for the encryption of the kernel
+        *   pagetable structures for workarea (in case not currently mapped)
+        */
+       execute_start = workarea_start;
+       execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+       execute_len = execute_end - execute_start;
+
+       /*
+        * One PGD for both encrypted and decrypted mappings and a set of
+        * PUDs and PMDs for each of the encrypted and decrypted mappings.
+        */
+       pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+       pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+       /* PUDs and PMDs needed in the current pagetables for the workarea */
+       pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+       /*
+        * The total workarea includes the executable encryption area and
+        * the pagetable area.
+        */
+       workarea_len = execute_len + pgtable_area_len;
+       workarea_end = workarea_start + workarea_len;
+
+       /*
+        * Set the address to the start of where newly created pagetable
+        * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+        * structures are created when the workarea is added to the current
+        * pagetables and when the new encrypted and decrypted kernel
+        * mappings are populated.
+        */
+       pgtable_area = (void *)execute_end;
+
+       /*
+        * Make sure the current pagetable structure has entries for
+        * addressing the workarea.
+        */
+       pgd = (pgd_t *)native_read_cr3_pa();
+       paddr = workarea_start;
+       while (paddr < workarea_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + PMD_FLAGS);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Flush the TLB - no globals so cr3 is enough */
+       native_write_cr3(__native_read_cr3());
+
+       /*
+        * A new pagetable structure is being built to allow for the kernel
+        * to be encrypted. It starts with an empty PGD that will then be
+        * populated with new PUDs and PMDs as the encrypted and decrypted
+        * kernel mappings are created.
+        */
+       pgd = pgtable_area;
+       memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+       pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+       /* Add encrypted kernel (identity) mappings */
+       pmd_flags = PMD_FLAGS | _PAGE_ENC;
+       paddr = kernel_start;
+       while (paddr < kernel_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + pmd_flags);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /*
+        * A different PGD index/entry must be used to get different
+        * pagetable entries for the decrypted mapping. Choose the next
+        * PGD index and convert it to a virtual address to be used as
+        * the base of the mapping.
+        */
+       decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+       decrypted_base <<= PGDIR_SHIFT;
+
+       /* Add decrypted, write-protected kernel (non-identity) mappings */
+       pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+       paddr = kernel_start;
+       while (paddr < kernel_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr + decrypted_base,
+                                               paddr + pmd_flags);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Add decrypted workarea mappings to both kernel mappings */
+       paddr = workarea_start;
+       while (paddr < workarea_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + PMD_FLAGS);
+
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr + decrypted_base,
+                                               paddr + PMD_FLAGS);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Perform the encryption */
+       sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+                           kernel_len, workarea_start, (unsigned long)pgd);
+
+       /*
+        * At this point we are running encrypted.  Remove the mappings for
+        * the decrypted areas - all that is needed for this is to remove
+        * the PGD entry/entries.
+        */
+       sme_clear_pgd(pgd, kernel_start + decrypted_base,
+                     kernel_end + decrypted_base);
+
+       sme_clear_pgd(pgd, workarea_start + decrypted_base,
+                     workarea_end + decrypted_base);
+
+       /* Flush the TLB - no globals so cr3 is enough */
+       native_write_cr3(__native_read_cr3());
+}
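The physical layout that sme_encrypt_kernel() sets up therefore looks roughly like this (not to scale):

	kernel_start          kernel_end == workarea_start              execute_end        workarea_end
	|------ kernel image ------|- stack | routine | 2MB copy buffer -|- new pagetables -|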
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+       const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+       unsigned int eax, ebx, ecx, edx;
+       bool active_by_default;
+       unsigned long me_mask;
+       char buffer[16];
+       u64 msr;
+
+       /* Check for the SME support leaf */
+       eax = 0x80000000;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+       if (eax < 0x8000001f)
+               return;
+
+       /*
+        * Check for the SME feature:
+        *   CPUID Fn8000_001F[EAX] - Bit 0
+        *     Secure Memory Encryption support
+        *   CPUID Fn8000_001F[EBX] - Bits 5:0
+        *     Pagetable bit position used to indicate encryption
+        */
+       eax = 0x8000001f;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+       if (!(eax & 1))
+               return;
+
+       me_mask = 1UL << (ebx & 0x3f);
+
+       /* Check if SME is enabled */
+       msr = __rdmsr(MSR_K8_SYSCFG);
+       if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+               return;
+
+       /*
+        * Fixups have not been applied to phys_base yet and we're running
+        * identity mapped, so we must obtain the address of the SME command
+        * line argument data using rip-relative addressing.
+        */
+       asm ("lea sme_cmdline_arg(%%rip), %0"
+            : "=r" (cmdline_arg)
+            : "p" (sme_cmdline_arg));
+       asm ("lea sme_cmdline_on(%%rip), %0"
+            : "=r" (cmdline_on)
+            : "p" (sme_cmdline_on));
+       asm ("lea sme_cmdline_off(%%rip), %0"
+            : "=r" (cmdline_off)
+            : "p" (sme_cmdline_off));
+
+       if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+               active_by_default = true;
+       else
+               active_by_default = false;
+
+       cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+                                    ((u64)bp->ext_cmd_line_ptr << 32));
+
+       cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+       if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+               sme_me_mask = me_mask;
+       else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+               sme_me_mask = 0;
+       else
+               sme_me_mask = active_by_default ? me_mask : 0;
+}
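The net effect is that SME can be controlled from the boot loader's kernel command line, for example:

	mem_encrypt=on		# activate SME even if it is not active by default
	mem_encrypt=off		# keep SME off even with CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y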
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644 (file)
index 0000000..b327e04
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/frame.h>
+
+       .text
+       .code64
+ENTRY(sme_encrypt_execute)
+
+       /*
+        * Entry parameters:
+        *   RDI - virtual address for the encrypted kernel mapping
+        *   RSI - virtual address for the decrypted kernel mapping
+        *   RDX - length of kernel
+        *   RCX - virtual address of the encryption workarea, including:
+        *     - stack page (PAGE_SIZE)
+        *     - encryption routine page (PAGE_SIZE)
+        *     - intermediate copy buffer (PMD_PAGE_SIZE)
+        *    R8 - physical address of the pagetables to use for encryption
+        */
+
+       FRAME_BEGIN                     /* RBP now has original stack pointer */
+
+       /* Set up a one page stack in the non-encrypted memory area */
+       movq    %rcx, %rax              /* Workarea stack page */
+       leaq    PAGE_SIZE(%rax), %rsp   /* Set new stack pointer */
+       addq    $PAGE_SIZE, %rax        /* Workarea encryption routine */
+
+       push    %r12
+       movq    %rdi, %r10              /* Encrypted kernel */
+       movq    %rsi, %r11              /* Decrypted kernel */
+       movq    %rdx, %r12              /* Kernel length */
+
+       /* Copy encryption routine into the workarea */
+       movq    %rax, %rdi                              /* Workarea encryption routine */
+       leaq    __enc_copy(%rip), %rsi                  /* Encryption routine */
+       movq    $(.L__enc_copy_end - __enc_copy), %rcx  /* Encryption routine length */
+       rep     movsb
+
+       /* Setup registers for call */
+       movq    %r10, %rdi              /* Encrypted kernel */
+       movq    %r11, %rsi              /* Decrypted kernel */
+       movq    %r8, %rdx               /* Pagetables used for encryption */
+       movq    %r12, %rcx              /* Kernel length */
+       movq    %rax, %r8               /* Workarea encryption routine */
+       addq    $PAGE_SIZE, %r8         /* Workarea intermediate copy buffer */
+
+       call    *%rax                   /* Call the encryption routine */
+
+       pop     %r12
+
+       movq    %rbp, %rsp              /* Restore original stack pointer */
+       FRAME_END
+
+       ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt the kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+       /* Enable the new page tables */
+       mov     %rdx, %cr3
+
+       /* Flush any global TLBs */
+       mov     %cr4, %rdx
+       andq    $~X86_CR4_PGE, %rdx
+       mov     %rdx, %cr4
+       orq     $X86_CR4_PGE, %rdx
+       mov     %rdx, %cr4
+
+       /* Set the PAT register PA5 entry to write-protect */
+       push    %rcx
+       movl    $MSR_IA32_CR_PAT, %ecx
+       rdmsr
+       push    %rdx                    /* Save original PAT value */
+       andl    $0xffff00ff, %edx       /* Clear PA5 */
+       orl     $0x00000500, %edx       /* Set PA5 to WP */
+       wrmsr
+       pop     %rdx                    /* RDX contains original PAT value */
+       pop     %rcx
+
+       movq    %rcx, %r9               /* Save kernel length */
+       movq    %rdi, %r10              /* Save encrypted kernel address */
+       movq    %rsi, %r11              /* Save decrypted kernel address */
+
+       wbinvd                          /* Invalidate any cache entries */
+
+       /* Copy/encrypt 2MB at a time */
+1:
+       movq    %r11, %rsi              /* Source - decrypted kernel */
+       movq    %r8, %rdi               /* Dest   - intermediate copy buffer */
+       movq    $PMD_PAGE_SIZE, %rcx    /* 2MB length */
+       rep     movsb
+
+       movq    %r8, %rsi               /* Source - intermediate copy buffer */
+       movq    %r10, %rdi              /* Dest   - encrypted kernel */
+       movq    $PMD_PAGE_SIZE, %rcx    /* 2MB length */
+       rep     movsb
+
+       addq    $PMD_PAGE_SIZE, %r11
+       addq    $PMD_PAGE_SIZE, %r10
+       subq    $PMD_PAGE_SIZE, %r9     /* Kernel length decrement */
+       jnz     1b                      /* Kernel length not zero? */
+
+       /* Restore PAT register */
+       push    %rdx                    /* Save original PAT value */
+       movl    $MSR_IA32_CR_PAT, %ecx
+       rdmsr
+       pop     %rdx                    /* Restore original PAT value */
+       wrmsr
+
+       ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
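
The comment block in __enc_copy above compresses a lot of mechanism into a few sentences. Purely as a reading aid, the copy loop can be written as the C sketch below; the function name and the enc/dec/buf/len parameters are illustrative (they mirror the R10/R11/R8/R9 register assignments) and the sketch assumes, as the assembly does, that the length is a multiple of PMD_PAGE_SIZE.

#include <linux/string.h>
#include <asm/page_types.h>

/*
 * The same physical range is mapped twice: 'dec' without the C-bit and
 * 'enc' with it.  Bouncing each 2MB chunk through a scratch buffer in
 * the workarea reads the data through the decrypted mapping and writes
 * it back through the encrypted mapping, encrypting it "in place".
 */
static void __enc_copy_sketch(void *enc, void *dec, void *buf,
			      unsigned long len)
{
	while (len) {
		memcpy(buf, dec, PMD_PAGE_SIZE);  /* read via decrypted mapping  */
		memcpy(enc, buf, PMD_PAGE_SIZE);  /* write via encrypted mapping */
		enc += PMD_PAGE_SIZE;
		dec += PMD_PAGE_SIZE;
		len -= PMD_PAGE_SIZE;
	}
}
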
index a88cfbfbd0781a4d20c398d7ca1b30f50f05be43..a9967982684649155cfcdc921d5247c8fbfe70d6 100644 (file)
@@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = {
        .flags = -1,
 };
 
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
 {
        return IA32_PAGE_OFFSET;
 }
 
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
 {
-       return TASK_SIZE_MAX;
+       return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
 }
 
 static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
        unsigned long max = 0;
        if (current->flags & PF_RANDOMIZE) {
-               max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+               max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
                max <<= PAGE_SHIFT;
        }
 
@@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
        arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
-                       arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+                       arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /*
@@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
         * mmap_base, the compat syscall uses mmap_compat_base.
         */
        arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
-                       arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+                       arch_rnd(mmap32_rnd_bits), task_size_32bit());
 #endif
 }
 
index 1c34b767c84ca650386f1b550aa5e3f13d24d5ce..9ceaa955d2bacc317582b510753a91f64eca401b 100644 (file)
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
         */
        bd_base = mpx_get_bounds_dir();
        down_write(&mm->mmap_sem);
+
+       /* MPX doesn't support addresses above 47 bits yet. */
+       if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+               pr_warn_once("%s (%d): MPX cannot handle addresses "
+                               "above 47-bits. Disabling.",
+                               current->comm, current->pid);
+               ret = -ENXIO;
+               goto out;
+       }
        mm->context.bd_addr = bd_base;
        if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
                ret = -ENXIO;
-
+out:
        up_write(&mm->mmap_sem);
        return ret;
 }
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
        if (ret)
                force_sig(SIGSEGV, current);
 }
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+               unsigned long flags)
+{
+       if (!kernel_managing_mpx_tables(current->mm))
+               return addr;
+       if (addr + len <= DEFAULT_MAP_WINDOW)
+               return addr;
+       if (flags & MAP_FIXED)
+               return -ENOMEM;
+
+       /*
+        * Requested len is larger than the whole area we're allowed to map in.
+        * Resetting hinting address wouldn't do much good -- fail early.
+        */
+       if (len > DEFAULT_MAP_WINDOW)
+               return -ENOMEM;
+
+       /* Look for unmap area within DEFAULT_MAP_WINDOW */
+       return 0;
+}
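
mpx_unmapped_area_check() has three distinct outcomes: pass the hint address through, fail with -ENOMEM, or return 0 to force the search below DEFAULT_MAP_WINDOW. Below is a hedged sketch of how a get_unmapped_area path might consume those outcomes; the caller, its name, and the omitted VMA search are illustrative and not taken from this diff.

#include <linux/err.h>
#include <asm/mpx.h>

static unsigned long get_area_sketch(unsigned long addr, unsigned long len,
				     unsigned long flags)
{
	/* Let MPX veto or clear hint addresses above the 47-bit window. */
	addr = mpx_unmapped_area_check(addr, len, flags);
	if (IS_ERR_VALUE(addr))
		return addr;	/* e.g. -ENOMEM for MAP_FIXED above 47 bits */

	/*
	 * addr is now either the original (acceptable) hint or 0, meaning
	 * "search anywhere below DEFAULT_MAP_WINDOW"; the normal unmapped
	 * area search would run here.
	 */
	return addr;
}
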
index 757b0bcdf712dfb1e73527c603ab5d5b05f5bcae..dfb7d657cf4322b0dedcd0bb63c1058bd090b4ea 100644 (file)
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
                                        __pgprot(0), 1, 0, NULL);
 }
 
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+       struct cpa_data cpa;
+       unsigned long start;
+       int ret;
+
+       /* Nothing to do if the SME is not active */
+       if (!sme_active())
+               return 0;
+
+       /* Should not be working on unaligned addresses */
+       if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+               addr &= PAGE_MASK;
+
+       start = addr;
+
+       memset(&cpa, 0, sizeof(cpa));
+       cpa.vaddr = &addr;
+       cpa.numpages = numpages;
+       cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+       cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+       cpa.pgd = init_mm.pgd;
+
+       /* Must avoid aliasing mappings in the highmem code */
+       kmap_flush_unused();
+       vm_unmap_aliases();
+
+       /*
+        * Before changing the encryption attribute, we need to flush caches.
+        */
+       if (static_cpu_has(X86_FEATURE_CLFLUSH))
+               cpa_flush_range(start, numpages, 1);
+       else
+               cpa_flush_all(1);
+
+       ret = __change_page_attr_set_clr(&cpa, 1);
+
+       /*
+        * After changing the encryption attribute, we need to flush TLBs
+        * again in case any speculative TLB caching occurred (but no need
+        * to flush caches again).  We could just use cpa_flush_all(), but
+        * in case TLB flushing gets optimized in the cpa_flush_range()
+        * path, use the same logic as above.
+        */
+       if (static_cpu_has(X86_FEATURE_CLFLUSH))
+               cpa_flush_range(start, numpages, 0);
+       else
+               cpa_flush_all(0);
+
+       return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+       return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+       return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
 int set_pages_uc(struct page *page, int numpages)
 {
        unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
        if (!(page_flags & _PAGE_RW))
                cpa.mask_clr = __pgprot(_PAGE_RW);
 
+       if (!(page_flags & _PAGE_ENC))
+               cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
        cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
 
        retval = __change_page_attr_set_clr(&cpa, 0);
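
set_memory_encrypted()/set_memory_decrypted() operate on page-aligned kernel mappings and are no-ops when SME is inactive. The sketch below is a hedged usage example, not taken from this patch: the function name and the buffer handling around the two calls are illustrative, while __get_free_pages(), free_pages() and the set_memory_*() interfaces are the real ones.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

/*
 * Strip the C-bit from a kernel buffer so a consumer that cannot use
 * encrypted memory (firmware, a device, another processor before SME
 * is enabled on it) sees consistent data, then restore it afterwards.
 */
static int share_in_the_clear_sketch(unsigned int order)
{
	unsigned long vaddr = __get_free_pages(GFP_KERNEL, order);
	int npages = 1 << order;
	int ret;

	if (!vaddr)
		return -ENOMEM;

	ret = set_memory_decrypted(vaddr, npages);
	if (ret)
		goto out;

	/* ... hand the decrypted buffer to the consumer here ... */

	ret = set_memory_encrypted(vaddr, npages);
out:
	free_pages(vaddr, order);
	return ret;
}
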
index 45979502f64b13e98b31dc862f4ac86652b44592..fe7d57a8fb6003a15e609bfd7481060005e429e4 100644 (file)
@@ -293,7 +293,7 @@ void init_cache_modes(void)
  * pat_init - Initialize PAT MSR and PAT table
  *
  * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
  *
  * This function must be called on all CPUs using the specific sequence of
  * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
                 *      011    3    UC : _PAGE_CACHE_MODE_UC
                 *      100    4    WB : Reserved
-                *      101    5    WC : Reserved
+                *      101    5    WP : _PAGE_CACHE_MODE_WP
                 *      110    6    UC-: Reserved
                 *      111    7    WT : _PAGE_CACHE_MODE_WT
                 *
@@ -360,7 +360,7 @@ void pat_init(void)
                 * corresponding types in the presence of PAT errata.
                 */
                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-                     PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+                     PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
        }
 
        if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
 {
+       if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+               vma_prot = pgprot_decrypted(vma_prot);
+
        return vma_prot;
 }
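
The new PA5 = WP slot is the entry the SME boot code relies on when it reprograms the PAT by hand. As a worked example: PA5 occupies MSR bits 47:40, i.e. bits 15:8 of the high dword that RDMSR returns in EDX, and the encoding 0x05 selects write-protect, which is exactly the mask/OR pair used in __enc_copy earlier in this diff. The macro and helper names below are illustrative only.

#include <linux/types.h>

#define PAT_ENTRY_WP		0x05	/* PAT encoding for write-protect      */
#define PAT_PA5_EDX_SHIFT	8	/* PA5 = MSR bits 47:40 = EDX bits 15:8 */

/* Return the high PAT dword with PA5 forced to WP, as __enc_copy does. */
static inline u32 pat_high_pa5_wp(u32 edx)
{
	return (edx & 0xffff00ff) | (PAT_ENTRY_WP << PAT_PA5_EDX_SHIFT);
}
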
 
index 014d07a800535f0320108effe4c9b087c6eb1e88..ce104b962a1704f9950b600c1fc19b464e03359e 100644 (file)
  *     Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+                           u16 *new_asid, bool *need_flush)
+{
+       u16 asid;
+
+       if (!static_cpu_has(X86_FEATURE_PCID)) {
+               *new_asid = 0;
+               *need_flush = true;
+               return;
+       }
+
+       for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+               if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+                   next->context.ctx_id)
+                       continue;
+
+               *new_asid = asid;
+               *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+                              next_tlb_gen);
+               return;
+       }
+
+       /*
+        * We don't currently own an ASID slot on this CPU.
+        * Allocate a slot.
+        */
+       *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+       if (*new_asid >= TLB_NR_DYN_ASIDS) {
+               *new_asid = 0;
+               this_cpu_write(cpu_tlbstate.next_asid, 1);
+       }
+       *need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,12 +79,11 @@ void leave_mm(int cpu)
        if (loaded_mm == &init_mm)
                return;
 
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-               BUG();
+       /* Warn if we're not lazy. */
+       WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
 
        switch_mm(NULL, &init_mm, NULL);
 }
-EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
@@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
 {
-       unsigned cpu = smp_processor_id();
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+       u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       unsigned cpu = smp_processor_id();
+       u64 next_tlb_gen;
 
        /*
-        * NB: The scheduler will call us with prev == next when
-        * switching from lazy TLB mode to normal mode if active_mm
-        * isn't changing.  When this happens, there is no guarantee
-        * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+        * NB: The scheduler will call us with prev == next when switching
+        * from lazy TLB mode to normal mode if active_mm isn't changing.
+        * When this happens, we don't assume that CR3 (and hence
+        * cpu_tlbstate.loaded_mm) matches next.
         *
         * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
         */
 
-       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+       /* We don't want flush_tlb_func_* to run concurrently with us. */
+       if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+               WARN_ON_ONCE(!irqs_disabled());
+
+       /*
+        * Verify that CR3 is what we think it is.  This will catch
+        * hypothetical buggy code that directly switches to swapper_pg_dir
+        * without going through leave_mm() / switch_mm_irqs_off() or that
+        * does something like write_cr3(read_cr3_pa()).
+        */
+       VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
        if (real_prev == next) {
-               /*
-                * There's nothing to do: we always keep the per-mm control
-                * regs in sync with cpu_tlbstate.loaded_mm.  Just
-                * sanity-check mm_cpumask.
-                */
-               if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
-                       cpumask_set_cpu(cpu, mm_cpumask(next));
-               return;
-       }
+               VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+                         next->context.ctx_id);
+
+               if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+                       /*
+                        * There's nothing to do: we weren't lazy, and we
+                        * aren't changing our mm.  We don't need to flush
+                        * anything, nor do we need to update CR3, CR4, or
+                        * LDTR.
+                        */
+                       return;
+               }
+
+               /* Resume remote flushes and then read tlb_gen. */
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+               if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+                   next_tlb_gen) {
+                       /*
+                        * Ideally, we'd have a flush_tlb() variant that
+                        * takes the known CR3 value as input.  This would
+                        * be faster on Xen PV and on hypothetical CPUs
+                        * on which INVPCID is fast.
+                        */
+                       this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
+                                      next_tlb_gen);
+                       write_cr3(__sme_pa(next->pgd) | prev_asid);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+                                       TLB_FLUSH_ALL);
+               }
 
-       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /*
-                * If our current stack is in vmalloc space and isn't
-                * mapped in the new pgd, we'll double-fault.  Forcibly
-                * map it.
+                * We just exited lazy mode, which means that CR4 and/or LDTR
+                * may be stale.  (Changes to the required CR4 and LDTR states
+                * are not reflected in tlb_gen.)
                 */
-               unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-               pgd_t *pgd = next->pgd + stack_pgd_index;
-
-               if (unlikely(pgd_none(*pgd)))
-                       set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-       }
+       } else {
+               u16 new_asid;
+               bool need_flush;
+
+               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                       /*
+                        * If our current stack is in vmalloc space and isn't
+                        * mapped in the new pgd, we'll double-fault.  Forcibly
+                        * map it.
+                        */
+                       unsigned int index = pgd_index(current_stack_pointer());
+                       pgd_t *pgd = next->pgd + index;
+
+                       if (unlikely(pgd_none(*pgd)))
+                               set_pgd(pgd, init_mm.pgd[index]);
+               }
 
-       this_cpu_write(cpu_tlbstate.loaded_mm, next);
+               /* Stop remote flushes for the previous mm */
+               if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+                       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
-       WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-       cpumask_set_cpu(cpu, mm_cpumask(next));
+               VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 
-       /*
-        * Re-load page tables.
-        *
-        * This logic has an ordering constraint:
-        *
-        *  CPU 0: Write to a PTE for 'next'
-        *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-        *  CPU 1: set bit 1 in next's mm_cpumask
-        *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-        *
-        * We need to prevent an outcome in which CPU 1 observes
-        * the new PTE value and CPU 0 observes bit 1 clear in
-        * mm_cpumask.  (If that occurs, then the IPI will never
-        * be sent, and CPU 0's TLB will contain a stale entry.)
-        *
-        * The bad outcome can occur if either CPU's load is
-        * reordered before that CPU's store, so both CPUs must
-        * execute full barriers to prevent this from happening.
-        *
-        * Thus, switch_mm needs a full barrier between the
-        * store to mm_cpumask and any operation that could load
-        * from next->pgd.  TLB fills are special and can happen
-        * due to instruction fetches or for no reason at all,
-        * and neither LOCK nor MFENCE orders them.
-        * Fortunately, load_cr3() is serializing and gives the
-        * ordering guarantee we need.
-        */
-       load_cr3(next->pgd);
-
-       /*
-        * This gets called via leave_mm() in the idle path where RCU
-        * functions differently.  Tracing normally uses RCU, so we have to
-        * call the tracepoint specially here.
-        */
-       trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+               /*
+                * Start remote flushes and then read tlb_gen.
+                */
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+               choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+               if (need_flush) {
+                       this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+                       this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+                       write_cr3(__sme_pa(next->pgd) | new_asid);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+                                       TLB_FLUSH_ALL);
+               } else {
+                       /* The new ASID is already up to date. */
+                       write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+               }
 
-       /* Stop flush ipis for the previous mm */
-       WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-                    real_prev != &init_mm);
-       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+               this_cpu_write(cpu_tlbstate.loaded_mm, next);
+               this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+       }
 
-       /* Load per-mm CR4 and LDTR state */
        load_mm_cr4(next);
        switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen.  We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
                                  bool local, enum tlb_flush_reason reason)
 {
+       /*
+        * We have three different tlb_gen values in here.  They are:
+        *
+        * - mm_tlb_gen:     the latest generation.
+        * - local_tlb_gen:  the generation that this CPU has already caught
+        *                   up to.
+        * - f->new_tlb_gen: the generation that the requester of the flush
+        *                   wants us to catch up to.
+        */
+       struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+       u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+       u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());
 
-       if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
-               leave_mm(smp_processor_id());
+       VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+                  loaded_mm->context.ctx_id);
+
+       if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+               /*
+                * We're in lazy mode -- don't flush.  We can get here on
+                * remote flushes due to races and on local flushes if a
+                * kernel thread coincidentally flushes the mm it's lazily
+                * still using.
+                */
                return;
        }
 
-       if (f->end == TLB_FLUSH_ALL) {
-               local_flush_tlb();
-               if (local)
-                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-               trace_tlb_flush(reason, TLB_FLUSH_ALL);
-       } else {
+       if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+               /*
+                * There's nothing to do: we're already up to date.  This can
+                * happen if two concurrent flushes happen -- the first flush to
+                * be handled can catch us all the way up, leaving no work for
+                * the second flush.
+                */
+               trace_tlb_flush(reason, 0);
+               return;
+       }
+
+       WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+       WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+       /*
+        * If we get to this point, we know that our TLB is out of date.
+        * This does not strictly imply that we need to flush (it's
+        * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+        * going to need to flush in the very near future, so we might
+        * as well get it over with.
+        *
+        * The only question is whether to do a full or partial flush.
+        *
+        * We do a partial flush if requested and two extra conditions
+        * are met:
+        *
+        * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
+        *    we've always done all needed flushes to catch up to
+        *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
+        *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+        *    us up to date for tlb_gen 3 is the partial flush we're
+        *    processing.
+        *
+        *    As an example of why this check is needed, suppose that there
+        *    are two concurrent flushes.  The first is a full flush that
+        *    changes context.tlb_gen from 1 to 2.  The second is a partial
+        *    flush that changes context.tlb_gen from 2 to 3.  If they get
+        *    processed on this CPU in reverse order, we'll see
+        *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+        *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+        *    3, we'd break the invariant: we'd update local_tlb_gen above
+        *    1 without the full flush that's needed for tlb_gen 2.
+        *
+        * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
+        *    Partial TLB flushes are not all that much cheaper than full TLB
+        *    flushes, so it seems unlikely that it would be a performance win
+        *    to do a partial flush if that won't bring our TLB fully up to
+        *    date.  By doing a full flush instead, we can increase
+        *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+        *    avoid another flush in the very near future.
+        */
+       if (f->end != TLB_FLUSH_ALL &&
+           f->new_tlb_gen == local_tlb_gen + 1 &&
+           f->new_tlb_gen == mm_tlb_gen) {
+               /* Partial flush */
                unsigned long addr;
                unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
                addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_single(addr);
@@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
                if (local)
                        count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
                trace_tlb_flush(reason, nr_pages);
+       } else {
+               /* Full flush. */
+               local_flush_tlb();
+               if (local)
+                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+               trace_tlb_flush(reason, TLB_FLUSH_ALL);
        }
+
+       /* Both paths above update our state to mm_tlb_gen. */
+       this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                                (info->end - info->start) >> PAGE_SHIFT);
 
        if (is_uv_system()) {
+               /*
+                * This whole special case is confused.  UV has a "Broadcast
+                * Assist Unit", which seems to be a fancy way to send IPIs.
+                * Back when x86 used an explicit TLB flush IPI, UV was
+                * optimized to use its own mechanism.  These days, x86 uses
+                * smp_call_function_many(), but UV still uses a manual IPI,
+                * and that IPI's action is out of date -- it does a manual
+                * flush instead of calling flush_tlb_func_remote().  This
+                * means that the percpu tlb_gen variables won't be updated
+                * and we'll do pointless flushes on future context switches.
+                *
+                * Rather than hooking native_flush_tlb_others() here, I think
+                * that UV should be updated so that smp_call_function_many(),
+                * etc, are optimal on UV.
+                */
                unsigned int cpu;
 
                cpu = smp_processor_id();
@@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
        cpu = get_cpu();
 
-       /* Synchronize with switch_mm. */
-       smp_mb();
+       /* This is also a barrier that synchronizes with switch_mm(). */
+       info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), &info);
+
        put_cpu();
 }
 
@@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info)
 {
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-               leave_mm(smp_processor_id());
 }
 
 void flush_tlb_all(void)
@@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
        if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&batch->cpumask, &info);
+
        cpumask_clear(&batch->cpumask);
 
        put_cpu();
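
The long comment in flush_tlb_func_common() boils down to a three-way comparison of generation counters. Purely as a reading aid, the partial-flush condition can be restated as the helper below; the function does not exist in the patch, but the fields it reads (f->end, f->new_tlb_gen) and the two generation values are the ones used above.

#include <linux/types.h>
#include <asm/tlbflush.h>

/* Reading aid only: mirrors the condition tested in flush_tlb_func_common(). */
static bool want_partial_flush(const struct flush_tlb_info *f,
			       u64 local_tlb_gen, u64 mm_tlb_gen)
{
	return f->end != TLB_FLUSH_ALL &&		/* a ranged request...       */
	       f->new_tlb_gen == local_tlb_gen + 1 &&	/* ...that is the next step  */
	       f->new_tlb_gen == mm_tlb_gen;		/* ...and the latest one     */
}
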
index dbe2132b0ed4cabf19119a5fe0ffa4d89480745d..7a5350d08cef711a14c29cf1f8fcedb70ecc7465 100644 (file)
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
 
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
-               data = ioremap(pa_data, sizeof(*rom));
+               data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
 
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
                        }
                }
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
        }
        set_dma_domain_ops(dev);
        set_dev_domain_options(dev);
index f084d8718ac4990fca3c4cbf004d91551b3a8e76..6217b23e85f6ce3824583f6b49fe6c0f46c0ccf9 100644 (file)
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
 /*
  * Convenience functions to obtain memory types and attributes
  */
-u32 efi_mem_type(unsigned long phys_addr)
+int efi_mem_type(unsigned long phys_addr)
 {
        efi_memory_desc_t *md;
 
        if (!efi_enabled(EFI_MEMMAP))
-               return 0;
+               return -ENOTSUPP;
 
        for_each_efi_memory_desc(md) {
                if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
                                  (md->num_pages << EFI_PAGE_SHIFT))))
                        return md->type;
        }
-       return 0;
+       return -EINVAL;
 }
 
 static int __init arch_parse_efi_cmdline(char *str)
index 9bf72f5bfedb6c86c452642ca8635c0b370e64e5..12e83888e5b96714a4ad7bd8b9cc84844dd91ded 100644 (file)
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-       unsigned long pfn, text;
+       unsigned long pfn, text, pf;
        struct page *page;
        unsigned npages;
        pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
        if (efi_enabled(EFI_OLD_MEMMAP))
                return 0;
 
-       efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+       /*
+        * Since the PGD is encrypted, set the encryption mask so that when
+        * this value is loaded into cr3 the PGD will be decrypted during
+        * the pagetable walk.
+        */
+       efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
        pgd = efi_pgd;
 
        /*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
         * phys_efi_set_virtual_address_map().
         */
        pfn = pa_memmap >> PAGE_SHIFT;
-       if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
+       pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+       if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
                pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
                return 1;
        }
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
        text = __pa(_text);
        pfn = text >> PAGE_SHIFT;
 
-       if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
+       pf = _PAGE_RW | _PAGE_ENC;
+       if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
                pr_err("Failed to map kernel text 1:1\n");
                return 1;
        }
index cd4be19c36dc611f482ad5291d27365db47a92c5..1f71980fc5e0fa9983c1687aa4ff81f02eb0b989 100644 (file)
@@ -1,6 +1,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/set_memory.h>
 #include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
 
        base = (unsigned char *)real_mode_header;
 
+       /*
+        * If SME is active, the trampoline area will need to be in
+        * decrypted memory in order to bring up other processors
+        * successfully.
+        */
+       set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
        memcpy(base, real_mode_blob, size);
 
        phys_base = __pa(base);
@@ -100,6 +108,10 @@ static void __init setup_real_mode(void)
        trampoline_cr4_features = &trampoline_header->cr4;
        *trampoline_cr4_features = mmu_cr4_features;
 
+       trampoline_header->flags = 0;
+       if (sme_active())
+               trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
        trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
        trampoline_pgd[0] = trampoline_pgd_entry.pgd;
        trampoline_pgd[511] = init_top_pgt[511].pgd;
index dac7b20d2f9de40f0244f623e8560c304394c178..614fd7064d0a21366c3f27721e610038fda1e9b0 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include <asm/realmode.h>
 #include "realmode.h"
 
        .text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
        movl    %edx, %fs
        movl    %edx, %gs
 
+       /*
+        * Check for memory encryption support. This is a safety net in
+        * case BIOS hasn't done the necessary step of setting the bit in
+        * the MSR for this AP. If SME is active and we've gotten this far
+        * then it is safe for us to set the MSR bit and continue. If we
+        * don't we'll eventually crash trying to execute encrypted
+        * instructions.
+        */
+       bt      $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+       jnc     .Ldone
+       movl    $MSR_K8_SYSCFG, %ecx
+       rdmsr
+       bts     $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+       jc      .Ldone
+
+       /*
+        * Memory encryption is enabled but the SME enable bit for this
+        * CPU has not been set.  It is safe to set it, so do so.
+        */
+       wrmsr
+.Ldone:
+
        movl    pa_tr_cr4, %eax
        movl    %eax, %cr4              # Enable PAE mode
 
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
        tr_start:               .space  8
        GLOBAL(tr_efer)         .space  8
        GLOBAL(tr_cr4)          .space  4
+       GLOBAL(tr_flags)        .space  4
 END(trampoline_header)
 
 #include "trampoline_common.S"
index 027987638e9800cc4cf651a352e6718c9ac26eb4..1ecd419811a2b30a88d1fccc140829292e184035 100644 (file)
@@ -17,6 +17,9 @@ config XEN_PV
        bool "Xen PV guest support"
        default y
        depends on XEN
+       # XEN_PV is not ready to work with 5-level paging.
+       # Changes to hypervisor are also required.
+       depends on !X86_5LEVEL
        select XEN_HAVE_PVMMU
        select XEN_HAVE_VPMU
        help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
 config XEN_PVH
        bool "Support for running as a PVH guest"
        depends on XEN && XEN_PVHVM && ACPI
+       # Pre-built page tables are not ready to handle 5-level paging.
+       depends on !X86_5LEVEL
        def_bool n
index 811e4ddb3f37484180c0099e92b4eb351515d5bc..df1921751aa5cc03f9b0b54f1fa559dfc2d64b28 100644 (file)
@@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void)
        setup_clear_cpu_cap(X86_FEATURE_MTRR);
        setup_clear_cpu_cap(X86_FEATURE_ACC);
        setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+       setup_clear_cpu_cap(X86_FEATURE_SME);
+
+       /*
+        * Xen PV would need some work to support PCID: CR3 handling as well
+        * as xen_flush_tlb_others() would need updating.
+        */
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
 
        if (!xen_initial_domain())
                setup_clear_cpu_cap(X86_FEATURE_ACPI);
index cab28cf2cffbb78d09325c46413f65381ec6c5e3..e437714750f8fccbaa424cfc03a1e46a5053a010 100644 (file)
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
        /* Get the "official" set of cpus referring to our pagetable. */
        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
                for_each_online_cpu(cpu) {
-                       if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
-                           && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+                       if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
                                continue;
                        smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
                }
                return;
        }
-       cpumask_copy(mask, mm_cpumask(mm));
 
        /*
         * It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
         * look at its actual current cr3 value, and force it to flush
         * if needed.
         */
+       cpumask_clear(mask);
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpumask_set_cpu(cpu, mask);
index 72a8e6adebe6c09da07303bafac194236f2df7c6..a7525e95d53fe9d35ce31349d6fb45dc112f9418 100644 (file)
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
 #else
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
        /* Map the p2m table to a 512GB-aligned user address. */
-       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
+       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad (PUD_SIZE * PTRS_PER_PUD))
 #endif
 #ifdef CONFIG_XEN_PV
        ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
index 5c8aa9cf62d70de12b240e62887aaf59311e4a99..fe3d2a40f3111bea5440f00ee25ba92df0ef3e64 100644 (file)
@@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 static void acpi_idle_enter_bm(struct acpi_processor *pr,
                               struct acpi_processor_cx *cx, bool timer_bc)
 {
-       acpi_unlazy_tlb(smp_processor_id());
-
        /*
         * Must be done before busmaster disable as we might need to
         * access HPET !
index ef76e5eecf0b090d7488f0b73b9cb4d6248da09f..d5de6ee8466d51532d041ae5ff9b07dfd2d03ca5 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <asm/dmi.h>
 
 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
                              the top entry type is only 8 bits */
@@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
        u8 __iomem *mapped;
        ssize_t wrote = 0;
 
-       mapped = ioremap(sel->access_method_address, sel->area_length);
+       mapped = dmi_remap(sel->access_method_address, sel->area_length);
        if (!mapped)
                return -EIO;
 
@@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
                wrote++;
        }
 
-       iounmap(mapped);
+       dmi_unmap(mapped);
        return wrote;
 }
 
index 045d6d311bde2defc5ebabaf229ff7f71a94f60f..69d4d130e055c28c50155934628a40808e248c00 100644 (file)
@@ -55,6 +55,25 @@ struct efi __read_mostly efi = {
 };
 EXPORT_SYMBOL(efi);
 
+static unsigned long *efi_tables[] = {
+       &efi.mps,
+       &efi.acpi,
+       &efi.acpi20,
+       &efi.smbios,
+       &efi.smbios3,
+       &efi.sal_systab,
+       &efi.boot_info,
+       &efi.hcdp,
+       &efi.uga,
+       &efi.uv_systab,
+       &efi.fw_vendor,
+       &efi.runtime,
+       &efi.config_table,
+       &efi.esrt,
+       &efi.properties_table,
+       &efi.mem_attr_table,
+};
+
 static bool disable_runtime;
 static int __init setup_noefi(char *arg)
 {
@@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status)
        return err;
 }
 
+bool efi_is_table_address(unsigned long phys_addr)
+{
+       unsigned int i;
+
+       if (phys_addr == EFI_INVALID_TABLE_ADDR)
+               return false;
+
+       for (i = 0; i < ARRAY_SIZE(efi_tables); i++)
+               if (*(efi_tables[i]) == phys_addr)
+                       return true;
+
+       return false;
+}
+
 #ifdef CONFIG_KEXEC
 static int update_efi_random_seed(struct notifier_block *nb,
                                  unsigned long code, void *unused)
index 75273a2516039116da1d706d5f44565cadc2b41a..e83d6aec0c1376bc263352f9feec7ab40cb31c78 100644 (file)
@@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline)
        if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
                return -ENODEV;
 
-       pcdp = early_ioremap(efi.hcdp, 4096);
+       pcdp = early_memremap(efi.hcdp, 4096);
        printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp);
 
        if (strstr(cmdline, "console=hcdp")) {
@@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline)
        }
 
 out:
-       early_iounmap(pcdp, 4096);
+       early_memunmap(pcdp, 4096);
        return rc;
 }
index 8dc11064253d9e5ed58f8c817a471b36c25c5951..7a61a07ac4de97643199b09cbdaf185e80b9edaf 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/pagemap.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 #include <drm/drmP.h>
 #include <drm/drm_vma_manager.h>
 #include <drm/drm_gem.h>
@@ -928,6 +929,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size,
        vma->vm_ops = dev->driver->gem_vm_ops;
        vma->vm_private_data = obj;
        vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
        /* Take a ref for this mapping of the object, so that the fault
         * handler can dereference the mmap offset's pointer to the object.
index 1170b3209a1269aff7c1cfa3692b0fe73b413bb8..ed4bcbfd60864ca46064620cd1c9b83e0b1c8c25 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/efi.h>
 #include <linux/slab.h>
 #endif
+#include <linux/mem_encrypt.h>
 #include <asm/pgtable.h>
 #include "drm_internal.h"
 #include "drm_legacy.h"
@@ -58,6 +59,9 @@ static pgprot_t drm_io_prot(struct drm_local_map *map,
 {
        pgprot_t tmp = vm_get_page_prot(vma->vm_flags);
 
+       /* We don't want graphics memory to be mapped encrypted */
+       tmp = pgprot_decrypted(tmp);
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
        if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
                tmp = pgprot_noncached(tmp);
index b442d12f2f7d64819faff9eace7bdc51fa199f89..84fb009d4eb045eeafb1e731faf418107613c69e 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
 
 #define TTM_BO_VM_NUM_PREFAULT 16
 
@@ -230,9 +231,11 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
         * first page.
         */
        for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
-               if (bo->mem.bus.is_iomem)
+               if (bo->mem.bus.is_iomem) {
+                       /* Iomem should not be marked encrypted */
+                       cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
                        pfn = bdev->driver->io_mem_pfn(bo, page_offset);
-               else {
+               } else {
                        page = ttm->pages[page_offset];
                        if (unlikely(!page && i == 0)) {
                                retval = VM_FAULT_OOM;
index 4a650036256444ed84d1d2ef4e349236d5eb5f26..92e1690e28de90faf99d3fb0d2eec7308aa9d693 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/fb.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 
 #include <drm/drmP.h>
 #include <drm/drm_crtc.h>
@@ -169,6 +170,9 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        pr_notice("mmap() framebuffer addr:%lu size:%lu\n",
                  pos, size);
 
+       /* We don't want the framebuffer to be mapped encrypted */
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
        while (size > 0) {
                page = vmalloc_to_pfn((void *)pos);
                if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
index c2ae819a871cb6d8f09412702e46463397f9fc0f..e87ffb3c31a92faeb29ca07b47397f2ad5504120 100644 (file)
@@ -913,16 +913,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
        struct cpuidle_state *state = &drv->states[index];
        unsigned long eax = flg2MWAIT(state->flags);
        unsigned int cstate;
-       int cpu = smp_processor_id();
 
        cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
        /*
-        * leave_mm() to avoid costly and often unnecessary wakeups
-        * for flushing the user TLB's associated with the active mm.
+        * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
+        * will probably flush the TLB.  It's not guaranteed to flush
+        * the TLB, though, so it's not clear that we can do anything
+        * useful with this knowledge.
         */
-       if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
-               leave_mm(cpu);
 
        if (!(lapic_timer_reliable_states & (1 << (cstate))))
                tick_broadcast_enter();
index 354cbd6392cdf261ba657548ed2c208a09ddf50f..4ad7e5e31943db7b1d1d90850fac652fbfeb8ed6 100644 (file)
@@ -575,7 +575,7 @@ static void dump_dte_entry(u16 devid)
 
 static void dump_command(unsigned long phys_addr)
 {
-       struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+       struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
        int i;
 
        for (i = 0; i < 4; ++i)
@@ -919,11 +919,13 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu,
 
 static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
 {
+       u64 paddr = iommu_virt_to_phys((void *)address);
+
        WARN_ON(address & 0x7ULL);
 
        memset(cmd, 0, sizeof(*cmd));
-       cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
-       cmd->data[1] = upper_32_bits(__pa(address));
+       cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
+       cmd->data[1] = upper_32_bits(paddr);
        cmd->data[2] = 1;
        CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
 }
@@ -1383,7 +1385,7 @@ static bool increase_address_space(struct protection_domain *domain,
                return false;
 
        *pte             = PM_LEVEL_PDE(domain->mode,
-                                       virt_to_phys(domain->pt_root));
+                                       iommu_virt_to_phys(domain->pt_root));
        domain->pt_root  = pte;
        domain->mode    += 1;
        domain->updated  = true;
@@ -1420,7 +1422,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
                        if (!page)
                                return NULL;
 
-                       __npte = PM_LEVEL_PDE(level, virt_to_phys(page));
+                       __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
 
                        /* pte could have been changed somewhere. */
                        if (cmpxchg64(pte, __pte, __npte) != __pte) {
@@ -1536,10 +1538,10 @@ static int iommu_map_page(struct protection_domain *dom,
                        return -EBUSY;
 
        if (count > 1) {
-               __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+               __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
                __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
        } else
-               __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
+               __pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC;
 
        if (prot & IOMMU_PROT_IR)
                __pte |= IOMMU_PTE_IR;
@@ -1755,7 +1757,7 @@ static void free_gcr3_tbl_level1(u64 *tbl)
                if (!(tbl[i] & GCR3_VALID))
                        continue;
 
-               ptr = __va(tbl[i] & PAGE_MASK);
+               ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
                free_page((unsigned long)ptr);
        }
@@ -1770,7 +1772,7 @@ static void free_gcr3_tbl_level2(u64 *tbl)
                if (!(tbl[i] & GCR3_VALID))
                        continue;
 
-               ptr = __va(tbl[i] & PAGE_MASK);
+               ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
                free_gcr3_tbl_level1(ptr);
        }
@@ -2049,7 +2051,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
        u64 flags = 0;
 
        if (domain->mode != PAGE_MODE_NONE)
-               pte_root = virt_to_phys(domain->pt_root);
+               pte_root = iommu_virt_to_phys(domain->pt_root);
 
        pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
                    << DEV_ENTRY_MODE_SHIFT;
@@ -2061,7 +2063,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
                flags |= DTE_FLAG_IOTLB;
 
        if (domain->flags & PD_IOMMUV2_MASK) {
-               u64 gcr3 = __pa(domain->gcr3_tbl);
+               u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
                u64 glx  = domain->glx;
                u64 tmp;
 
@@ -3606,10 +3608,10 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
                        if (root == NULL)
                                return NULL;
 
-                       *pte = __pa(root) | GCR3_VALID;
+                       *pte = iommu_virt_to_phys(root) | GCR3_VALID;
                }
 
-               root = __va(*pte & PAGE_MASK);
+               root = iommu_phys_to_virt(*pte & PAGE_MASK);
 
                level -= 1;
        }
@@ -3788,7 +3790,7 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
 
        dte     = amd_iommu_dev_table[devid].data[2];
        dte     &= ~DTE_IRQ_PHYS_ADDR_MASK;
-       dte     |= virt_to_phys(table->table);
+       dte     |= iommu_virt_to_phys(table->table);
        dte     |= DTE_IRQ_REMAP_INTCTL;
        dte     |= DTE_IRQ_TABLE_LEN;
        dte     |= DTE_IRQ_REMAP_ENABLE;
index 372303700566f4f984e6656bd937e2d99bf07cc6..2292a6cece76e02e73411935c58f5d35387f60fc 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/iommu.h>
 #include <linux/kmemleak.h>
 #include <linux/crash_dump.h>
+#include <linux/mem_encrypt.h>
 #include <asm/pci-direct.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -348,7 +349,7 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
 
        BUG_ON(iommu->mmio_base == NULL);
 
-       entry = virt_to_phys(amd_iommu_dev_table);
+       entry = iommu_virt_to_phys(amd_iommu_dev_table);
        entry |= (dev_table_size >> 12) - 1;
        memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
                        &entry, sizeof(entry));
@@ -606,7 +607,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 
        BUG_ON(iommu->cmd_buf == NULL);
 
-       entry = (u64)virt_to_phys(iommu->cmd_buf);
+       entry = iommu_virt_to_phys(iommu->cmd_buf);
        entry |= MMIO_CMD_SIZE_512;
 
        memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -635,7 +636,7 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
 
        BUG_ON(iommu->evt_buf == NULL);
 
-       entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+       entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
 
        memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
                    &entry, sizeof(entry));
@@ -668,7 +669,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
        if (iommu->ppr_log == NULL)
                return;
 
-       entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+       entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
 
        memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
                    &entry, sizeof(entry));
@@ -748,10 +749,10 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
        if (!iommu->ga_log_tail)
                goto err_out;
 
-       entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+       entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
        memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
                    &entry, sizeof(entry));
-       entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+       entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
        memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
                    &entry, sizeof(entry));
        writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
@@ -2564,6 +2565,24 @@ static int __init amd_iommu_init(void)
        return ret;
 }
 
+static bool amd_iommu_sme_check(void)
+{
+       if (!sme_active() || (boot_cpu_data.x86 != 0x17))
+               return true;
+
+       /* For Fam17h, a specific level of support is required */
+       if (boot_cpu_data.microcode >= 0x08001205)
+               return true;
+
+       if ((boot_cpu_data.microcode >= 0x08001126) &&
+           (boot_cpu_data.microcode <= 0x080011ff))
+               return true;
+
+       pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n");
+
+       return false;
+}
+
 /****************************************************************************
  *
  * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -2578,6 +2597,9 @@ int __init amd_iommu_detect(void)
        if (no_iommu || (iommu_detected && !gart_iommu_aperture))
                return -ENODEV;
 
+       if (!amd_iommu_sme_check())
+               return -ENODEV;
+
        ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
        if (ret)
                return ret;
index 466260f8a1df37bb79738d4c8f91b90568cdd7c6..3f12fb2338ea5bbc1850f779fd98fff92808d7c0 100644 (file)
@@ -87,4 +87,14 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
        return !!(iommu->features & f);
 }
 
+static inline u64 iommu_virt_to_phys(void *vaddr)
+{
+       return (u64)__sme_set(virt_to_phys(vaddr));
+}
+
+static inline void *iommu_phys_to_virt(unsigned long paddr)
+{
+       return phys_to_virt(__sme_clr(paddr));
+}
+
 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
index 294a409e283b7ae4b52c59350756983711bba9d1..8591f43c467c9c93aa1c3dcf5d8e64d2f3bcdbd5 100644 (file)
 
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
 #define IOMMU_PROT_MASK 0x03
index 296db7a69c27548c2529c52d532832c39f0db1aa..153b3f3cc795fdd5b7d0e31ff5a18043b738b4c8 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/slab.h>
+#include <linux/io.h>
 
 #include "sfi_core.h"
 
@@ -86,13 +87,13 @@ static struct sfi_table_simple *syst_va __read_mostly;
 /*
  * FW creates and saves the SFI tables in memory. When these tables get
  * used, they may need to be mapped to virtual address space, and the mapping
- * can happen before or after the ioremap() is ready, so a flag is needed
+ * can happen before or after the memremap() is ready, so a flag is needed
 * to indicate this
 */
  */
-static u32 sfi_use_ioremap __read_mostly;
+static u32 sfi_use_memremap __read_mostly;
 
 /*
- * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function
+ * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
  * and introduces section mismatch. So use __ref to make it calm.
  */
 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
@@ -100,10 +101,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
        if (!phys || !size)
                return NULL;
 
-       if (sfi_use_ioremap)
-               return ioremap_cache(phys, size);
+       if (sfi_use_memremap)
+               return memremap(phys, size, MEMREMAP_WB);
        else
-               return early_ioremap(phys, size);
+               return early_memremap(phys, size);
 }
 
 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
@@ -111,10 +112,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
        if (!virt || !size)
                return;
 
-       if (sfi_use_ioremap)
-               iounmap(virt);
+       if (sfi_use_memremap)
+               memunmap(virt);
        else
-               early_iounmap(virt, size);
+               early_memunmap(virt, size);
 }
 
 static void sfi_print_table_header(unsigned long long pa,
@@ -507,8 +508,8 @@ void __init sfi_init_late(void)
        length = syst_va->header.len;
        sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
 
-       /* Use ioremap now after it is ready */
-       sfi_use_ioremap = 1;
+       /* Use memremap now after it is ready */
+       sfi_use_memremap = 1;
        syst_va = sfi_map_memory(syst_pa, length);
 
        sfi_acpi_init();
index 7a42238db446b0093323505b710963ac94a53e15..25e862c487f643f97353abb607caf23ea336a757 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/device.h>
 #include <linux/efi.h>
 #include <linux/fb.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/fb.h>
 
@@ -1396,6 +1397,12 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
        mutex_lock(&info->mm_lock);
        if (fb->fb_mmap) {
                int res;
+
+               /*
+                * The framebuffer needs to be accessed decrypted, be sure
+                * SME protection is removed ahead of the call
+                */
+               vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
                res = fb->fb_mmap(info, vma);
                mutex_unlock(&info->mm_lock);
                return res;
@@ -1421,6 +1428,11 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
        mutex_unlock(&info->mm_lock);
 
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+       /*
+        * The framebuffer needs to be accessed decrypted, be sure
+        * SME protection is removed
+        */
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
        fb_pgprotect(file, vma, start);
 
        return vm_iomap_memory(vma, start, len);
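pgprot_decrypted() clears the SME encryption bit from the vma protection so that user mappings of the framebuffer aperture do not go through an encrypted page-table entry (framebuffer contents are never encrypted). A driver that maps device memory to user space itself would make the same one-line adjustment before the remap; this is a hedged sketch, the mydrv_* name is a placeholder and bounds checking is omitted:

    static int mydrv_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
    {
            unsigned long start = info->fix.smem_start;

            /* framebuffer memory must be accessed decrypted */
            vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

            return vm_iomap_memory(vma, start, info->fix.smem_len);
    }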
index 734ad4db388c6d922fb812391f913cbdda710f12..2edef8d7fa6b8a1fe65bacb8e0d110f55b5e803f 100644 (file)
@@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr,
                            unsigned long size);
 extern void *early_memremap_ro(resource_size_t phys_addr,
                               unsigned long size);
+extern void *early_memremap_prot(resource_size_t phys_addr,
+                                unsigned long size, unsigned long prot_val);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
index 7dfa767dc68012ac52ac81d48e92b3ec79c97311..4d7bb98f41340f52881f78a4d8e4b9dc2f21600f 100644 (file)
@@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
 #endif /* CONFIG_MMU */
 
+/*
+ * No-op macros that just return the current protection value. Defined here
+ * because these macros can be used even if CONFIG_MMU is not defined.
+ */
+#ifndef pgprot_encrypted
+#define pgprot_encrypted(prot) (prot)
+#endif
+
+#ifndef pgprot_decrypted
+#define pgprot_decrypted(prot) (prot)
+#endif
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
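These fallbacks keep pgprot_encrypted()/pgprot_decrypted() available to generic code on architectures (and !CONFIG_MMU builds) that know nothing about memory encryption. An architecture that does support it overrides them; roughly how the x86 side of this series does so, shown only as a sketch built on the __sme_set()/__sme_clr() helpers:

    /* sketch of an arch override (x86-style), in <asm/pgtable_types.h> */
    #define pgprot_encrypted(prot)  __pgprot(__sme_set(pgprot_val(prot)))
    #define pgprot_decrypted(prot)  __pgprot(__sme_clr(pgprot_val(prot)))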
index bdb80c4aef6e13631075b2ad06b39d2788c8eba5..71b86a5d3061dda87c7ab3ee95211c9512adbea0 100644 (file)
 
 #if GCC_VERSION >= 40100
 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+
+#define __nostackprotector     __attribute__((__optimize__("no-stack-protector")))
 #endif
 
 #if GCC_VERSION >= 40300
index eca8ad75e28b054db4657d5e562b3904120b042e..43cac547f773d2af821f69cd03610861d33bc571 100644 (file)
@@ -475,6 +475,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __visible
 #endif
 
+#ifndef __nostackprotector
+# define __nostackprotector
+#endif
+
 /*
  * Assume alignment of return value.
  */
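__nostackprotector exists for code that must run before the stack-protector canary is usable (for example very early SME bring-up running from the identity mapping), where a compiler-inserted canary check could not work. A usage sketch; the function name is invented for illustration:

    /* sketch: opt one early-boot function out of -fstack-protector */
    static void __init __nostackprotector early_mem_setup_example(void)
    {
            /* runs before the boot CPU's stack canary is initialized */
    }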
index 03c0196a6f2474ea4e34e9840638ff80a45370ec..2189c79cde5d5b0f0f76bb6b9e4211e427f8d6d9 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/scatterlist.h>
 #include <linux/kmemcheck.h>
 #include <linux/bug.h>
+#include <linux/mem_encrypt.h>
 
 /**
  * List of possible attributes associated with a DMA mapping. The semantics
@@ -572,6 +573,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
        return 0;
 }
 
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+       if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+               dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
+
 static inline int dma_supported(struct device *dev, u64 mask)
 {
        const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -588,6 +595,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 {
        if (!dev->dma_mask || !dma_supported(dev, mask))
                return -EIO;
+
+       dma_check_mask(dev, mask);
+
        *dev->dma_mask = mask;
        return 0;
 }
@@ -607,6 +617,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
        if (!dma_supported(dev, mask))
                return -EIO;
+
+       dma_check_mask(dev, mask);
+
        dev->coherent_dma_mask = mask;
        return 0;
 }
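dma_check_mask() only warns; it does not fail the mask change. With SME active, sme_get_me_mask() is a single high physical-address bit, so ((mask << 1) - 1) is the highest address a device must reach to avoid bouncing. A worked example, assuming (hypothetically) that the encryption bit is physical address bit 47:

    /* hypothetical driver snippet: a 32-bit-only device under SME */
    static int example_setup_dma(struct device *dev)
    {
            /*
             * sme_get_me_mask() == 1ULL << 47  =>  limit = 0x0000ffffffffffff.
             * DMA_BIT_MASK(32) is far below that, so dma_check_mask() prints
             * the "device will require DMA bounce buffers" warning and the
             * device's DMA goes through SWIOTLB.
             */
            return dma_set_mask(dev, DMA_BIT_MASK(32));
    }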
index 8269bcb8ccf7961bd01f52e39fc4d34d370f25d3..4e47f78430bece2a3015f2fe1a74c88ae339f9e7 100644 (file)
@@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { }
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
                                   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
-extern u32 efi_mem_type (unsigned long phys_addr);
+extern int efi_mem_type(unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int __init efi_uart_console_only (void);
@@ -1091,6 +1091,8 @@ static inline bool efi_enabled(int feature)
        return test_bit(feature, &efi.flags) != 0;
 }
 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused);
+
+extern bool efi_is_table_address(unsigned long phys_addr);
 #else
 static inline bool efi_enabled(int feature)
 {
@@ -1104,6 +1106,11 @@ efi_capsule_pending(int *reset_type)
 {
        return false;
 }
+
+static inline bool efi_is_table_address(unsigned long phys_addr)
+{
+       return false;
+}
 #endif
 
 extern int efi_status_to_err(efi_status_t status);
index 2195d9ea4aaae0c054f04aab2da7cff851d2b997..32e30e8fb9db92cf1472c6188ff3310a610b3c5f 100644 (file)
@@ -157,6 +157,8 @@ enum {
        MEMREMAP_WB = 1 << 0,
        MEMREMAP_WT = 1 << 1,
        MEMREMAP_WC = 1 << 2,
+       MEMREMAP_ENC = 1 << 3,
+       MEMREMAP_DEC = 1 << 4,
 };
 
 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
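MEMREMAP_ENC and MEMREMAP_DEC let a caller request an explicitly encrypted or decrypted mapping instead of inheriting the linear map's attributes; they are consumed by the arch code and by try_ram_remap() later in this series. A hedged sketch of mapping a region that firmware populated unencrypted; fw_phys and fw_size are placeholders:

    /* sketch: map firmware-written (unencrypted) data with the C-bit cleared */
    static void *map_fw_region(resource_size_t fw_phys, size_t fw_size)
    {
            return memremap(fw_phys, fw_size, MEMREMAP_WB | MEMREMAP_DEC);
    }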
index dd056fab9e35c958c7aee14156e1df5640fb2556..2b7590f5483a1fc4474fbecddd099977222531cb 100644 (file)
@@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry)
        return phys_to_virt(boot_phys_to_phys(entry));
 }
 
+#ifndef arch_kexec_post_alloc_pages
+static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; }
+#endif
+
+#ifndef arch_kexec_pre_free_pages
+static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
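The two hooks default to no-ops so generic kexec code can call them unconditionally; an architecture with memory encryption overrides them (and defines the matching macro so the inline stubs are skipped) to flip the control pages' attributes. Roughly what the x86 part of this series does, shown as a simplified sketch using set_memory_decrypted()/set_memory_encrypted() from <asm/set_memory.h>:

    /* sketch of an arch implementation (x86-style) */
    int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
    {
            /* the new kernel will initially access these pages unencrypted */
            return set_memory_decrypted((unsigned long)vaddr, pages);
    }

    void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
    {
            /* restore the encrypted attribute before freeing */
            set_memory_encrypted((unsigned long)vaddr, pages);
    }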
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
new file mode 100644 (file)
index 0000000..1255f09
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __MEM_ENCRYPT_H__
+#define __MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+
+#include <asm/mem_encrypt.h>
+
+#else  /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+#define sme_me_mask    0UL
+
+#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+static inline bool sme_active(void)
+{
+       return !!sme_me_mask;
+}
+
+static inline unsigned long sme_get_me_mask(void)
+{
+       return sme_me_mask;
+}
+
+/*
+ * The __sme_set() and __sme_clr() macros are useful for adding or removing
+ * the encryption mask from a value (e.g. when dealing with pagetable
+ * entries).
+ */
+#define __sme_set(x)           ((unsigned long)(x) | sme_me_mask)
+#define __sme_clr(x)           ((unsigned long)(x) & ~sme_me_mask)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __MEM_ENCRYPT_H__ */
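__sme_set() and __sme_clr() are plain bit operations against sme_me_mask, and both collapse to no-ops when SME is inactive (mask 0). A standalone demonstration of the arithmetic with a hypothetical mask value; the real bit position is reported by CPUID and is not hard-coded anywhere:

    #include <stdio.h>

    int main(void)
    {
            unsigned long sme_me_mask = 1UL << 47;  /* hypothetical C-bit position */
            unsigned long paddr = 0x12345000UL;

            unsigned long enc = paddr | sme_me_mask;   /* __sme_set(paddr) */
            unsigned long dec = enc & ~sme_me_mask;    /* __sme_clr(enc)   */

            printf("encrypted: %#lx\n", enc);          /* 0x800012345000   */
            printf("decrypted: %#lx\n", dec);          /* 0x12345000       */
            return 0;
    }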
index e030a68ead7e211fcd1e19277fd3708e665bf78b..25438b2b6f223fb29989c41e1ce8ff854c425d10 100644 (file)
@@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
+#ifdef arch_unmap_kpfn
+extern void arch_unmap_kpfn(unsigned long pfn);
+#else
+static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
+#endif
+
 #endif
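arch_unmap_kpfn() lets the architecture pull a hardware-poisoned page out of the kernel direct map before memory_failure() handles it (relevant with SME, where a stale encrypted alias could re-trigger the poison). A hypothetical implementation, not taken from this series, might simply mark the direct-map page not-present; the arch would also #define arch_unmap_kpfn so the inline stub above is not used:

    /* hypothetical arch hook: remove the poisoned pfn from the direct map */
    void arch_unmap_kpfn(unsigned long pfn)
    {
            set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
    }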
index 4ee479f2f355b1fa2658b1c2aabbdfc376125651..15e7160751a85a4be938f860257d05595b03105d 100644 (file)
@@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
+extern void __init swiotlb_update_mem_attributes(void);
 
 /*
  * Enumeration for sync targets
index 052481fbe3633f64b420c5bbd6deea3be261e6a9..9789ab7fe85e14f7434be3b0298a9ebf920242e0 100644 (file)
@@ -488,6 +488,8 @@ void __init __weak thread_stack_cache_init(void)
 }
 #endif
 
+void __init __weak mem_encrypt_init(void) { }
+
 /*
  * Set up kernel memory allocators
  */
@@ -641,6 +643,14 @@ asmlinkage __visible void __init start_kernel(void)
         */
        locking_selftest();
 
+       /*
+        * This needs to be called before any devices perform DMA
+        * operations that might use the SWIOTLB bounce buffers. It will
+        * mark the bounce buffers as decrypted so that their usage will
+        * not cause "plain-text" data to be decrypted when accessed.
+        */
+       mem_encrypt_init();
+
 #ifdef CONFIG_BLK_DEV_INITRD
        if (initrd_start && !initrd_below_start_ok &&
            page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
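mem_encrypt_init() is the weak hook overridden by architectures with memory encryption; by this point in start_kernel() the set_memory_*() machinery works, so the already-allocated SWIOTLB buffers can be re-marked decrypted. A simplified sketch of the x86-style override (log text paraphrased):

    /* sketch of an arch override (x86-style) */
    void __init mem_encrypt_init(void)
    {
            if (!sme_me_mask)
                    return;

            /* flip the SWIOTLB bounce buffers to decrypted mappings */
            swiotlb_update_mem_attributes();

            pr_info("AMD Secure Memory Encryption (SME) active\n");
    }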
index 1ae7c41c33c19c54e4b08d33c0c59da78244efba..20fef1a38602d9d0ed6fdb5d359d5604fbafc3dd 100644 (file)
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
        struct page *pages;
 
-       pages = alloc_pages(gfp_mask, order);
+       pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
        if (pages) {
                unsigned int count, i;
 
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
+
+               arch_kexec_post_alloc_pages(page_address(pages), count,
+                                           gfp_mask);
+
+               if (gfp_mask & __GFP_ZERO)
+                       for (i = 0; i < count; i++)
+                               clear_highpage(pages + i);
        }
 
        return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
 
        order = page_private(page);
        count = 1 << order;
+
+       arch_kexec_pre_free_pages(page_address(page), count);
+
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
        __free_pages(page, order);
index 124bed776532d3d2e69f4079e3f2d23da193fd2e..9afdc434fb490a3384d847bc50647fa3dd3ab16a 100644 (file)
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
 }
 #endif
 
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+                                       unsigned long flags)
+{
+       return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+                          unsigned long flags)
 {
        unsigned long pfn = PHYS_PFN(offset);
 
        /* In the simple case just return the existing linear address */
-       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+           arch_memremap_can_ram_remap(offset, size, flags))
                return __va(offset);
+
        return NULL; /* fallback to arch_memremap_wb */
 }
 
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * memremap() - remap an iomem_resource as cacheable memory
  * @offset: iomem resource start address
  * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ *               MEMREMAP_ENC, MEMREMAP_DEC
  *
  * memremap() is "ioremap" for cases where it is known that the resource
  * being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                 * the requested range is potentially in System RAM.
                 */
                if (is_ram == REGION_INTERSECTS)
-                       addr = try_ram_remap(offset, size);
+                       addr = try_ram_remap(offset, size, flags);
                if (!addr)
                        addr = arch_memremap_wb(offset, size);
        }
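arch_memremap_can_ram_remap() lets an architecture veto the linear-map shortcut when the caller's requested encryption attribute differs from how System RAM is already mapped, forcing memremap() to fall back to arch_memremap_wb() and build a fresh mapping. A simplified sketch of such an override, keeping only the flag handling (the real x86 version also inspects boot and EFI data regions):

    /* simplified sketch of an arch override */
    bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
                                     unsigned long flags)
    {
            if (!sme_active())
                    return true;            /* linear map is fine as-is   */
            if (flags & MEMREMAP_ENC)
                    return true;            /* RAM is mapped encrypted    */
            if (flags & MEMREMAP_DEC)
                    return false;           /* need a decrypted mapping   */

            return true;
    }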
index a8d74a733a38b54a912f5e292f0a15a2cfff4a95..8c6c83ef57a43336e0a33a52f691e3323eb8f3f4 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
        return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+                                     phys_addr_t address)
+{
+       return __sme_clr(phys_to_dma(hwdev, address));
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
                                      volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
               bytes >> 20, vstart, vend - 1);
 }
 
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations.  This function allows the architecture to
+ * call SWIOTLB when the operations are possible.  It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+       void *vaddr;
+       unsigned long bytes;
+
+       if (no_iotlb_memory || late_alloc)
+               return;
+
+       vaddr = phys_to_virt(io_tlb_start);
+       bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+       swiotlb_set_mem_attributes(vaddr, bytes);
+       memset(vaddr, 0, bytes);
+
+       vaddr = phys_to_virt(io_tlb_overflow_buffer);
+       bytes = PAGE_ALIGN(io_tlb_overflow);
+       swiotlb_set_mem_attributes(vaddr, bytes);
+       memset(vaddr, 0, bytes);
+}
+
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
        void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
        io_tlb_start = virt_to_phys(tlb);
        io_tlb_end = io_tlb_start + bytes;
 
+       swiotlb_set_mem_attributes(tlb, bytes);
        memset(tlb, 0, bytes);
 
        /*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
        if (!v_overflow_buffer)
                goto cleanup2;
 
+       swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+       memset(v_overflow_buffer, 0, io_tlb_overflow);
        io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
 
        /*
@@ -469,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
        if (no_iotlb_memory)
                panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
+       if (sme_active())
+               pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
        mask = dma_get_seg_boundary(hwdev);
 
        tbl_dma_addr &= mask;
@@ -581,7 +622,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
                return SWIOTLB_MAP_ERROR;
        }
 
-       start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+       start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
        return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
                                      dir, attrs);
 }
@@ -702,7 +743,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                        goto err_warn;
 
                ret = phys_to_virt(paddr);
-               dev_addr = phys_to_dma(hwdev, paddr);
+               dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
 
                /* Confirm address can be DMA'd by device */
                if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +853,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
        map = map_single(dev, phys, size, dir, attrs);
        if (map == SWIOTLB_MAP_ERROR) {
                swiotlb_full(dev, size, dir, 1);
-               return phys_to_dma(dev, io_tlb_overflow_buffer);
+               return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
        }
 
-       dev_addr = phys_to_dma(dev, map);
+       dev_addr = swiotlb_phys_to_dma(dev, map);
 
        /* Ensure that the address returned is DMA'ble */
        if (dma_capable(dev, dev_addr, size))
@@ -824,7 +865,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
        attrs |= DMA_ATTR_SKIP_CPU_SYNC;
        swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
 
-       return phys_to_dma(dev, io_tlb_overflow_buffer);
+       return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -958,7 +999,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
                                sg_dma_len(sgl) = 0;
                                return 0;
                        }
-                       sg->dma_address = phys_to_dma(hwdev, map);
+                       sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
                } else
                        sg->dma_address = dev_addr;
                sg_dma_len(sg) = sg->length;
@@ -1026,7 +1067,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-       return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+       return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
 }
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
@@ -1039,6 +1080,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-       return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+       return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL(swiotlb_dma_supported);
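swiotlb_set_mem_attributes() is the weak hook that swiotlb_update_mem_attributes() and the late-init path rely on; an SME-capable architecture implements it by clearing the encryption attribute on the (page-aligned) bounce-buffer range. A hedged sketch of such an override, using the set_memory_decrypted() helper the x86 side of this series provides:

    /* sketch of an arch override (x86-style) */
    void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
    {
            /* callers pass a page-aligned size */
            set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
    }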
index 6d5717bd7197ba0428c32941df27d90da04264b5..b1dd4a948fc0b3afc375964d23ebcb9f69eaafa3 100644 (file)
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static int after_paging_init __initdata;
 
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+                                                   unsigned long size,
+                                                   pgprot_t prot)
+{
+       return prot;
+}
+
 void __init __weak early_ioremap_shutdown(void)
 {
 }
@@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
 void __init *
 early_memremap(resource_size_t phys_addr, unsigned long size)
 {
-       return (__force void *)__early_ioremap(phys_addr, size,
-                                              FIXMAP_PAGE_NORMAL);
+       pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+                                                    FIXMAP_PAGE_NORMAL);
+
+       return (__force void *)__early_ioremap(phys_addr, size, prot);
 }
 #ifdef FIXMAP_PAGE_RO
 void __init *
 early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 {
-       return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+       pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+                                                    FIXMAP_PAGE_RO);
+
+       return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+                   unsigned long prot_val)
+{
+       return (__force void *)__early_ioremap(phys_addr, size,
+                                              __pgprot(prot_val));
 }
 #endif
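early_memremap_pgprot_adjust() gives the architecture a chance to rewrite the protection that every early_memremap()/early_memremap_ro() call will use; with SME the decision is whether the mapping should carry the encryption bit. A heavily simplified sketch of an override (the real x86 version also special-cases boot data, the initrd and EFI regions):

    /* simplified sketch of an arch override */
    pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
                                                 unsigned long size,
                                                 pgprot_t prot)
    {
            if (!sme_active())
                    return prot;

            /* default policy in this sketch: map early RAM encrypted */
            return __pgprot(__sme_set(pgprot_val(prot)));
    }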
 
index 1cd3b3569af8a79285b75bfdb2485b7de7a69aa8..88366626c0b7c790d92514f32cb3cea701dd0ffb 100644 (file)
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                return 0;
        }
 
+       arch_unmap_kpfn(pfn);
+
        orig_head = hpage = compound_head(p);
        num_poisoned_pages_inc();