Merge branch 'linus' into x86/mm to pick up fixes and to fix conflicts
author Ingo Molnar <mingo@kernel.org>
Sat, 26 Aug 2017 07:19:13 +0000 (09:19 +0200)
committer Ingo Molnar <mingo@kernel.org>
Sat, 26 Aug 2017 07:19:13 +0000 (09:19 +0200)
Conflicts:
arch/x86/kernel/head64.c
arch/x86/mm/mmap.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
119 files changed:
Documentation/admin-guide/kernel-parameters.txt
Documentation/x86/amd-memory-encryption.txt [new file with mode: 0644]
Documentation/x86/protection-keys.txt
Documentation/x86/x86_64/5level-paging.txt [new file with mode: 0644]
arch/ia64/include/asm/acpi.h
arch/ia64/kernel/efi.c
arch/x86/Kconfig
arch/x86/boot/compressed/kaslr.c
arch/x86/boot/compressed/pagetable.c
arch/x86/include/asm/acpi.h
arch/x86/include/asm/cmdline.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/disabled-features.h
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/dmi.h
arch/x86/include/asm/e820/api.h
arch/x86/include/asm/elf.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/init.h
arch/x86/include/asm/io.h
arch/x86/include/asm/kexec.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/mem_encrypt.h [new file with mode: 0644]
arch/x86/include/asm/mmu.h
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/mpx.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/page_64.h
arch/x86/include/asm/page_types.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/realmode.h
arch/x86/include/asm/set_memory.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/vga.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/scattered.c
arch/x86/kernel/e820.c
arch/x86/kernel/espfix_64.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/kdebugfs.c
arch/x86/kernel/ksysfs.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/pci-nommu.c
arch/x86/kernel/pci-swiotlb.c
arch/x86/kernel/process.c
arch/x86/kernel/relocate_kernel_64.S
arch/x86/kernel/setup.c
arch/x86/kernel/sys_x86_64.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/lib/cmdline.c
arch/x86/mm/Makefile
arch/x86/mm/dump_pagetables.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/ident_map.c
arch/x86/mm/init.c
arch/x86/mm/ioremap.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/mem_encrypt.c [new file with mode: 0644]
arch/x86/mm/mem_encrypt_boot.S [new file with mode: 0644]
arch/x86/mm/mmap.c
arch/x86/mm/mpx.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/tlb.c
arch/x86/pci/common.c
arch/x86/platform/efi/efi.c
arch/x86/platform/efi/efi_64.c
arch/x86/realmode/init.c
arch/x86/realmode/rm/trampoline_64.S
arch/x86/xen/Kconfig
arch/x86/xen/enlighten_pv.c
arch/x86/xen/mmu_pv.c
arch/x86/xen/xen-head.S
drivers/acpi/processor_idle.c
drivers/firmware/dmi-sysfs.c
drivers/firmware/efi/efi.c
drivers/firmware/pcdp.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/drm_vm.c
drivers/gpu/drm/ttm/ttm_bo_vm.c
drivers/gpu/drm/udl/udl_fb.c
drivers/idle/intel_idle.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_proto.h
drivers/iommu/amd_iommu_types.h
drivers/sfi/sfi_core.c
drivers/video/fbdev/core/fbmem.c
include/asm-generic/early_ioremap.h
include/asm-generic/pgtable.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/dma-mapping.h
include/linux/efi.h
include/linux/io.h
include/linux/kexec.h
include/linux/mem_encrypt.h [new file with mode: 0644]
include/linux/mm_inline.h
include/linux/swiotlb.h
init/main.c
kernel/kexec_core.c
kernel/memremap.c
lib/swiotlb.c
mm/early_ioremap.c
mm/memory-failure.c

index d9c171ce4190845950e7c14e362265b4d26adc74..372cc66bba23286c485abd8aa178abe2f3119fe7 100644 (file)
                        memory contents and reserves bad memory
                        regions that are detected.
 
+       mem_encrypt=    [X86-64] AMD Secure Memory Encryption (SME) control
+                       Valid arguments: on, off
+                       Default (depends on kernel configuration option):
+                         on  (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+                         off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+                       mem_encrypt=on:         Activate SME
+                       mem_encrypt=off:        Do not activate SME
+
+                       Refer to Documentation/x86/amd-memory-encryption.txt
+                       for details on when memory encryption can be activated.
+
        mem_sleep_default=      [SUSPEND] Default system suspend mode:
                        s2idle  - Suspend-To-Idle
                        shallow - Power-On Suspend or equivalent (if supported)
        nopat           [X86] Disable PAT (page attribute table extension of
                        pagetables) support.
 
+       nopcid          [X86-64] Disable the PCID cpu feature.
+
        norandmaps      Don't use address space randomization.  Equivalent to
                        echo 0 > /proc/sys/kernel/randomize_va_space
 
diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt
new file mode 100644 (file)
index 0000000..f512ab7
--- /dev/null
@@ -0,0 +1,68 @@
+Secure Memory Encryption (SME) is a feature found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables.  A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM.  SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position).  The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note that having the encryption bit set
+in cr3 does not imply that the full hierarchy is encrypted; each page table
+entry in the hierarchy needs to have the encryption bit set to achieve that.
+So, theoretically, you could have the encryption bit set in cr3 so that the
+PGD is encrypted, but not set the encryption bit in the PGD entry for a PUD,
+which would leave the PUD pointed to by that entry unencrypted.
+
+Support for SME can be determined through the CPUID instruction. The CPUID
+function 0x8000001f reports information related to SME:
+
+       0x8000001f[eax]:
+               Bit[0] indicates support for SME
+       0x8000001f[ebx]:
+               Bits[5:0]  pagetable bit number used to activate memory
+                          encryption
+               Bits[11:6] reduction in physical address space, in bits, when
+                          memory encryption is enabled (this only affects
+                          system physical addresses, not guest physical
+                          addresses)
+
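+For illustration, a minimal user-space sketch (assuming GCC's <cpuid.h> is
+available) that queries this CPUID leaf might look like:
+
+        #include <cpuid.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                unsigned int eax, ebx, ecx, edx;
+
+                /* __get_cpuid() fails if leaf 0x8000001f is unavailable */
+                if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx) ||
+                    !(eax & 1)) {
+                        printf("SME not supported\n");
+                        return 1;
+                }
+
+                /* Bits[5:0] of EBX: position of the encryption bit */
+                printf("SME supported, C-bit is pagetable bit %u\n",
+                       ebx & 0x3f);
+                /* Bits[11:6] of EBX: physical address space reduction */
+                printf("address space reduced by %u bits\n",
+                       (ebx >> 6) & 0x3f);
+                return 0;
+        }
+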
+If support for SME is present, MSR 0xc0010010 (MSR_K8_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption:
+
+       0xc0010010:
+               Bit[23]   0 = memory encryption features are disabled
+                         1 = memory encryption features are enabled
+
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system.  If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
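+For illustration, bit 23 of MSR_K8_SYSCFG can also be read from user space
+through the /dev/cpu/*/msr interface (a minimal sketch, assuming the msr
+driver is loaded and the program runs as root):
+
+        #include <fcntl.h>
+        #include <stdint.h>
+        #include <stdio.h>
+        #include <unistd.h>
+
+        int main(void)
+        {
+                uint64_t syscfg;
+                int fd = open("/dev/cpu/0/msr", O_RDONLY);
+
+                /* the MSR address (0xc0010010) is the pread() offset */
+                if (fd < 0 ||
+                    pread(fd, &syscfg, sizeof(syscfg), 0xc0010010) != 8)
+                        return 1;
+
+                printf("memory encryption features %s\n",
+                       (syscfg & (1ULL << 23)) ? "enabled" : "disabled");
+                close(fd);
+                return 0;
+        }
+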
+The state of SME in the Linux kernel can be described as follows:
+       - Supported:
+         The CPU supports SME (determined through CPUID instruction).
+
+       - Enabled:
+         Supported and bit 23 of MSR_K8_SYSCFG is set.
+
+       - Active:
+         Supported, Enabled and the Linux kernel is actively applying
+         the encryption bit to page table entries (the SME mask in the
+         kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support.  If the BIOS
+merely enables SME (sets bit 23 of MSR_K8_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line.  However, if the BIOS
+does not enable SME, then Linux will not be able to activate memory
+encryption, even if it is configured to do so by default or mem_encrypt=on is
+specified on the kernel command line.
index b643045408218669de1af81b9a9a661c035ffc70..fa46dcb347bc1d2ac60901c4621bd3bad81de601 100644 (file)
@@ -34,7 +34,7 @@ with a key.  In this example WRPKRU is wrapped by a C function
 called pkey_set().
 
        int real_prot = PROT_READ|PROT_WRITE;
-       pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+       pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
        ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
        ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
        ... application runs here
@@ -42,9 +42,9 @@ called pkey_set().
 Now, if the application needs to update the data at 'ptr', it can
 gain access, do the update, then remove its write access:
 
-       pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+       pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
        *ptr = foo; // assign something
-       pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+       pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
 
 Now when it frees the memory, it will also free the pkey since it
 is no longer in use:
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
new file mode 100644 (file)
index 0000000..087251a
--- /dev/null
@@ -0,0 +1,64 @@
+== Overview ==
+
+The original x86-64 architecture was limited by 4-level paging to 256 TiB of
+virtual address space and 64 TiB of physical address space. We are already
+bumping into this limit: some vendors offer servers with 64 TiB of memory
+today.
+
+To overcome the limitation, upcoming hardware will introduce support for
+5-level paging. It is a straightforward extension of the current page table
+structure, adding one more layer of translation.
+
+It bumps the limits to 128 PiB of virtual address space and 4 PiB of
+physical address space. This "ought to be enough for anybody" ©.
+
+QEMU 2.9 and later support 5-level paging.
+
+Virtual memory layout for 5-level paging is described in
+Documentation/x86/x86_64/mm.txt
+
+== Enabling 5-level paging ==
+
+CONFIG_X86_5LEVEL=y enables the feature.
+
+So far, a kernel compiled with the option enabled will only be able to boot
+on machines that support the feature -- look for the 'la57' flag in
+/proc/cpuinfo.
+
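+For illustration, the same capability can be checked programmatically: CPUID
+leaf 7, sub-leaf 0 reports LA57 in ECX bit 16 (a minimal sketch, assuming
+GCC's <cpuid.h>):
+
+        #include <cpuid.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                unsigned int eax, ebx, ecx, edx;
+
+                /* leaf 7, sub-leaf 0: structured extended feature flags */
+                if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
+                    (ecx & (1U << 16)))
+                        printf("la57 supported\n");
+                else
+                        printf("la57 not supported\n");
+                return 0;
+        }
+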
+The plan is to implement boot-time switching between 4- and 5-level paging
+in the future.
+
+== User-space and large virtual address space ==
+
+On x86, 5-level paging enables 56-bit userspace virtual address space.
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use the higher bits of pointers to encode their
+own information. With 5-level paging this collides with valid pointers and
+leads to crashes.
+
+To mitigate this, we are not going to allocate virtual address space above
+47 bits by default.
+
+But userspace can ask for an allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47 bits.
+
+If the hint address is set above 47 bits but MAP_FIXED is not specified, we
+try to look for an unmapped area at the specified address. If that area is
+already occupied, we look for an unmapped area in the *full* address space,
+rather than within the 47-bit window.
+
+A high hint address would only affect the allocation in question, but not
+any future mmap()s.
+
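+For illustration, a minimal sketch of requesting a mapping above the 47-bit
+boundary might look like:
+
+        #include <stdio.h>
+        #include <sys/mman.h>
+
+        int main(void)
+        {
+                /* hint above 47 bits; the kernel may then place the
+                 * mapping anywhere in the full address space */
+                void *hint = (void *)(1UL << 48);
+                void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+                if (p != MAP_FAILED)
+                        printf("mapped at %p\n", p);
+                return 0;
+        }
+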
+Specifying a high hint address on an older kernel or on a machine without
+5-level paging support is safe. The hint will be ignored and the kernel will
+fall back to allocating from the 47-bit address space.
+
+This approach makes it easy for an application's memory allocator to take
+advantage of the large address space without manually tracking the allocated
+virtual address space.
+
+One important case we need to handle here is the interaction with MPX.
+MPX (without the MAWA extension) cannot handle addresses above 47 bits, so we
+need to make sure that MPX cannot be enabled if we already have a VMA above
+the boundary, and forbid creating such VMAs once MPX is enabled.
+
index a3d0211970e95e5152edbaeeecd8ab3a42041ac6..c86a947f5368633b86d1a8e5dc7ddd1803982c64 100644 (file)
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
        buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
-#define acpi_unlazy_tlb(x)
-
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)  \
index 121295637d0df831fbbf919983ee9a0cd1a7d366..81416000c5e07c18f16032bb0b9c9f77abc8b883 100644 (file)
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
        return 0;
 }
 
-u32
+int
 efi_mem_type (unsigned long phys_addr)
 {
        efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
 
        if (md)
                return md->type;
-       return 0;
+       return -EINVAL;
 }
 
 u64
index 323cb065be5eda120b44dac79618a13301ece231..e4844e934728c58f054eefaf1375e6b018dc982a 100644 (file)
@@ -327,6 +327,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
        int
+       default 5 if X86_5LEVEL
        default 4 if X86_64
        default 3 if X86_PAE
        default 2
@@ -1399,6 +1400,24 @@ config X86_PAE
          has the cost of more pagetable lookup overhead, and also
          consumes more pagetable space per process.
 
+config X86_5LEVEL
+       bool "Enable 5-level page tables support"
+       depends on X86_64
+       ---help---
+         5-level paging enables access to a larger address space:
+         up to 128 PiB of virtual address space and 4 PiB of
+         physical address space.
+
+         It will be supported by future Intel CPUs.
+
+         Note: a kernel with this option enabled can only be booted
+         on machines that support the feature.
+
+         See Documentation/x86/x86_64/5level-paging.txt for more
+         information.
+
+         Say N if unsure.
+
 config ARCH_PHYS_ADDR_T_64BIT
        def_bool y
        depends on X86_64 || X86_PAE
@@ -1416,6 +1435,35 @@ config X86_DIRECT_GBPAGES
          supports them), so don't confuse the user by printing
          that we have them enabled.
 
+config ARCH_HAS_MEM_ENCRYPT
+       def_bool y
+
+config AMD_MEM_ENCRYPT
+       bool "AMD Secure Memory Encryption (SME) support"
+       depends on X86_64 && CPU_SUP_AMD
+       ---help---
+         Say yes to enable support for the encryption of system memory.
+         This requires an AMD processor that supports Secure Memory
+         Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+       bool "Activate AMD Secure Memory Encryption (SME) by default"
+       default y
+       depends on AMD_MEM_ENCRYPT
+       ---help---
+         Say yes to have system memory encrypted by default if running on
+         an AMD processor that supports Secure Memory Encryption (SME).
+
+         If set to Y, then the encryption of system memory can be
+         deactivated with the mem_encrypt=off command line option.
+
+         If set to N, then the encryption of system memory can be
+         activated with the mem_encrypt=on command line option.
+
+config ARCH_USE_MEMREMAP_PROT
+       def_bool y
+       depends on AMD_MEM_ENCRYPT
+
 # Common NUMA Features
 config NUMA
        bool "Numa Memory Allocation and Scheduler Support"
index 91f27ab970ef74347c1915e7ff6ac5f8a8803b8d..99c7194f7ea626379ac649faa7da83fdb95776bf 100644 (file)
@@ -479,35 +479,31 @@ static unsigned long slots_fetch_random(void)
        return 0;
 }
 
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_mem_region(struct mem_vector *entry,
                               unsigned long minimum,
                               unsigned long image_size)
 {
        struct mem_vector region, overlap;
        struct slot_area slot_area;
        unsigned long start_orig, end;
-       struct boot_e820_entry cur_entry;
-
-       /* Skip non-RAM entries. */
-       if (entry->type != E820_TYPE_RAM)
-               return;
+       struct mem_vector cur_entry;
 
        /* On 32-bit, ignore entries entirely above our maximum. */
-       if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+       if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
                return;
 
        /* Ignore entries entirely below our minimum. */
-       if (entry->addr + entry->size < minimum)
+       if (entry->start + entry->size < minimum)
                return;
 
        /* Ignore entries above memory limit */
-       end = min(entry->size + entry->addr, mem_limit);
-       if (entry->addr >= end)
+       end = min(entry->size + entry->start, mem_limit);
+       if (entry->start >= end)
                return;
-       cur_entry.addr = entry->addr;
-       cur_entry.size = end - entry->addr;
+       cur_entry.start = entry->start;
+       cur_entry.size = end - entry->start;
 
-       region.start = cur_entry.addr;
+       region.start = cur_entry.start;
        region.size = cur_entry.size;
 
        /* Give up if slot area array is full. */
@@ -521,8 +517,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
                /* Potentially raise address to meet alignment needs. */
                region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
-               /* Did we raise the address above this e820 region? */
-               if (region.start > cur_entry.addr + cur_entry.size)
+               /* Did we raise the address above the passed in memory entry? */
+               if (region.start > cur_entry.start + cur_entry.size)
                        return;
 
                /* Reduce size by any delta from the original address. */
@@ -562,12 +558,32 @@ static void process_e820_entry(struct boot_e820_entry *entry,
        }
 }
 
-static unsigned long find_random_phys_addr(unsigned long minimum,
-                                          unsigned long image_size)
+static void process_e820_entries(unsigned long minimum,
+                                unsigned long image_size)
 {
        int i;
-       unsigned long addr;
+       struct mem_vector region;
+       struct boot_e820_entry *entry;
+
+       /* Verify potential e820 positions, appending to slots list. */
+       for (i = 0; i < boot_params->e820_entries; i++) {
+               entry = &boot_params->e820_table[i];
+               /* Skip non-RAM entries. */
+               if (entry->type != E820_TYPE_RAM)
+                       continue;
+               region.start = entry->addr;
+               region.size = entry->size;
+               process_mem_region(&region, minimum, image_size);
+               if (slot_area_index == MAX_SLOT_AREA) {
+                       debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+                       break;
+               }
+       }
+}
 
+static unsigned long find_random_phys_addr(unsigned long minimum,
+                                          unsigned long image_size)
+{
        /* Check if we had too many memmaps. */
        if (memmap_too_large) {
                debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
@@ -577,16 +593,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
        /* Make sure minimum is aligned. */
        minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
 
-       /* Verify potential e820 positions, appending to slots list. */
-       for (i = 0; i < boot_params->e820_entries; i++) {
-               process_e820_entry(&boot_params->e820_table[i], minimum,
-                                  image_size);
-               if (slot_area_index == MAX_SLOT_AREA) {
-                       debug_putstr("Aborted e820 scan (slot_areas full)!\n");
-                       break;
-               }
-       }
-
+       process_e820_entries(minimum, image_size);
        return slots_fetch_random();
 }
 
index 28029be47fbb839f248826b517a9e295f4389395..f1aa43854bed423e7bfccaa84ff66ea91996b731 100644 (file)
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
index 2efc768e43627a48118d28729c380c0fbcaa7681..72d867f6b518e4db5a79a10c924f858a3edb0af8 100644 (file)
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
-#define acpi_unlazy_tlb(x)     leave_mm(x)
-
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
@@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
         * you call efi_mem_attributes() during boot and at runtime,
         * you could theoretically see different attributes.
         *
-        * Since we are yet to see any x86 platforms that require
-        * anything other than PAGE_KERNEL (some arm64 platforms
-        * require the equivalent of PAGE_KERNEL_NOCACHE), return that
-        * until we know differently.
+        * We are yet to see any x86 platforms that require anything
+        * other than PAGE_KERNEL (some ARM64 platforms require the
+        * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+        * is active, the ACPI information will not be encrypted,
+        * so return PAGE_KERNEL_NOENC until we know differently.
         */
-        return PAGE_KERNEL;
+       return PAGE_KERNEL_NOENC;
 }
 #endif
 
index e01f7f7ccb0c5711db4f1fe130938cee36169965..84ae170bc3d0cd73cfa7d08fa6f730bd3106e4ea 100644 (file)
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+                       char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
index 5a28e8e55e36fd2164c0cda175288a7fcc534c4b..66ac08607471c4a6fcdfb919304f2fd3405458ef 100644 (file)
 
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME                ( 7*32+10) /* AMD Secure Memory Encryption */
 
 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
index 5dff775af7cd6456f7177d9ce5888ae78dc6bc10..c10c9128f54e6b7296014a74e7a253a1eedaacd9 100644 (file)
 # define DISABLE_K6_MTRR       (1<<(X86_FEATURE_K6_MTRR & 31))
 # define DISABLE_CYRIX_ARR     (1<<(X86_FEATURE_CYRIX_ARR & 31))
 # define DISABLE_CENTAUR_MCR   (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID          0
 #else
 # define DISABLE_VME           0
 # define DISABLE_K6_MTRR       0
 # define DISABLE_CYRIX_ARR     0
 # define DISABLE_CENTAUR_MCR   0
+# define DISABLE_PCID          (1<<(X86_FEATURE_PCID & 31))
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
 #define DISABLED_MASK1 0
 #define DISABLED_MASK2 0
 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4 0
+#define DISABLED_MASK4 (DISABLE_PCID)
 #define DISABLED_MASK5 0
 #define DISABLED_MASK6 0
 #define DISABLED_MASK7 0
index 398c79889f5c43d384b72238ce025f140debeb0b..1387dafdba2d2c24061adb014354b82174f620d4 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <linux/dma-contiguous.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-       return paddr;
+       return __sme_set(paddr);
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-       return daddr;
+       return __sme_clr(daddr);
 }
 #endif /* CONFIG_X86_DMA_REMAP */
 
index 3c69fed215c56c3203e59d97a6c7e11381c97cf8..a8e15b04565b842def6a1bd1fa4b3b03db756c3c 100644 (file)
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap                early_ioremap
-#define dmi_early_unmap                early_iounmap
-#define dmi_remap              ioremap_cache
-#define dmi_unmap              iounmap
+#define dmi_early_remap                early_memremap
+#define dmi_early_unmap                early_memunmap
+#define dmi_remap(_x, _l)      memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x)          memunmap(_x)
 
 #endif /* _ASM_X86_DMI_H */
index a504adc661a4954ca0d1c5ed04149fece0e042a3..cd266d830e4960fd023aa0411c991a2701db09f9 100644 (file)
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
 extern void e820__reallocate_tables(void);
 extern void e820__register_nosave_regions(unsigned long limit_pfn);
 
+extern int  e820__get_entry_type(u64 start, u64 end);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained inside
  * the ISA region.
index 9aeb91935ce02387d8dae5e2f51bf1750f420a43..a3de31ffb72254199d769afd9266997f563c67d5 100644 (file)
@@ -304,8 +304,8 @@ static inline int mmap_is_ia32(void)
                test_thread_flag(TIF_ADDR32));
 }
 
-extern unsigned long tasksize_32bit(void);
-extern unsigned long tasksize_64bit(void);
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 
 #ifdef CONFIG_X86_32
index b65155cc3760a72b49b680c3f70923ddedf684d2..dcd9fb55e67991821d46602754a392c6f2ed0e06 100644 (file)
@@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx,
 }
 #endif
 
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+                                     unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+                                     unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size);
+
 #include <asm-generic/fixmap.h>
 
 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
index 474eb8c66feeb2c98de2f5d6fe1db84de752c806..05c4aa00cc862e3b1dad1b344b0eddb9d6f44db4 100644 (file)
@@ -7,6 +7,7 @@ struct x86_mapping_info {
        unsigned long page_flag;         /* page flag for PMD or PUD entry */
        unsigned long offset;            /* ident mapping offset */
        bool direct_gbpages;             /* PUD level 1GB page support */
+       unsigned long kernpg_flag;       /* kernel pagetable flag override */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
index 48febf07e828099d0580be40ad82a8baee6ade18..4bc6f459a8b6dd9861f078c794d6c26534db5148 100644 (file)
@@ -381,4 +381,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif
 
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+                                       unsigned long size,
+                                       unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+                                     unsigned long size);
+
 #endif /* _ASM_X86_IO_H */
index 70ef205489f00e53ff568180c4dcbf6fb9e6ded1..942c1f444da88ddeb182e57f582a068c15cb2717 100644 (file)
@@ -147,7 +147,8 @@ unsigned long
 relocate_kernel(unsigned long indirection_page,
                unsigned long page_list,
                unsigned long start_address,
-               unsigned int preserve_context);
+               unsigned int preserve_context,
+               unsigned int sme_active);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
@@ -207,6 +208,14 @@ struct kexec_entry64_regs {
        uint64_t r15;
        uint64_t rip;
 };
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+                                      gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
 #endif
 
 typedef void crash_vmclear_fn(void);
index 87ac4fba6d8e12f07e8a9f191bdb028a1c3e6234..7cbaab523f22dcd91812dc269021d1e884ec0df2 100644 (file)
@@ -1078,7 +1078,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask);
+               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644 (file)
index 0000000..8e618fc
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern unsigned long sme_me_mask;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+                        unsigned long decrypted_kernel_vaddr,
+                        unsigned long kernel_len,
+                        unsigned long encryption_wa,
+                        unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+                             unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+                             unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void __init sme_encrypt_kernel(void);
+void __init sme_enable(struct boot_params *bp);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+
+#else  /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask    0UL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+                                           unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+                                           unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void __init sme_encrypt_kernel(void) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register.  Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoids special case handling of PGD allocations.
+ */
+#define __sme_pa(x)            (__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x)    (__pa_nodebug(x) | sme_me_mask)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __X86_MEM_ENCRYPT_H__ */
index 79b647a7ebd0079b96472e52634c898301d3a63e..bb8c597c2248a9c8341d04813b04e1d42e9c7019 100644 (file)
@@ -3,12 +3,28 @@
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/atomic.h>
 
 /*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
  */
 typedef struct {
+       /*
+        * ctx_id uniquely identifies this mm_struct.  A ctx_id will never
+        * be reused, and zero is not a valid ctx_id.
+        */
+       u64 ctx_id;
+
+       /*
+        * Any code that needs to do any sort of TLB flushing for this
+        * mm will first make its changes to the page tables, then
+        * increment tlb_gen, then flush.  This lets the low-level
+        * flushing code keep track of what needs flushing.
+        *
+        * This is not used on Xen PV.
+        */
+       atomic64_t tlb_gen;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
        struct ldt_struct *ldt;
 #endif
@@ -37,6 +53,11 @@ typedef struct {
 #endif
 } mm_context_t;
 
+#define INIT_MM_CONTEXT(mm)                                            \
+       .context = {                                                    \
+               .ctx_id = 1,                                            \
+       }
+
 void leave_mm(int cpu);
 
 #endif /* _ASM_X86_MMU_H */
index 265c907d7d4c9b8c69e24792a20c5a3dfb6c95ee..d25d9f4abb15a1e06e83e6e4ebac1ef9f2c70c04 100644 (file)
@@ -12,6 +12,9 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
 #ifndef CONFIG_PARAVIRT
 static inline void paravirt_activate_mm(struct mm_struct *prev,
                                        struct mm_struct *next)
@@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+       int cpu = smp_processor_id();
+
+       if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+               cpumask_clear_cpu(cpu, mm_cpumask(mm));
 }
 
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+       atomic64_set(&mm->context.tlb_gen, 0);
+
        #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and always allocated */
@@ -292,6 +300,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
        unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+       if (static_cpu_has(X86_FEATURE_PCID))
+               cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
        /* For now, be very restrictive about when this can be called. */
        VM_WARN_ON(in_nmi() || preemptible());
 
index a0d662be4c5b8545a7b0b8c5c846187ced8d0020..7d7404756bb4a734bcb04c5f57931daff8af73b3 100644 (file)
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 }
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+               unsigned long flags);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
 {
 }
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+               unsigned long len, unsigned long flags)
+{
+       return addr;
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
index 5573c75f8e4ced276c8585b71f0df9b786ea9e90..17f5c12e1afd0c6ddb3fa2e6fc94b1fec52f7d5c 100644 (file)
 #define MSR_K8_TOP_MEM1                        0xc001001a
 #define MSR_K8_TOP_MEM2                        0xc001001d
 #define MSR_K8_SYSCFG                  0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT  23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT      BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
 #define MSR_K8_INT_PENDING_MSG         0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK                0x18000000
index b4a0d43248cf3d6f2c7ef042200c31a25f29f122..b50df06ad251f143e105843ab84ac1a14f40abdf 100644 (file)
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+#ifdef CONFIG_X86_MCE
+#define arch_unmap_kpfn arch_unmap_kpfn
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
index 7bd0099384cac4ed0fa89a4e25c42e4a63ae5f9c..b98ed9d1463098936bcebbae46b59dc00495508c 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
+#include <linux/mem_encrypt.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT             12
@@ -15,7 +16,7 @@
 #define PUD_PAGE_SIZE          (_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK          (~(PUD_PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK                ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#define __PHYSICAL_MASK                ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
 #define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
index 77037b6f1caa22f622f50d67ea6cebd00f76f685..bbeae4a2bd01a3209e6d68f8f2af918eeb17dff2 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_PGTABLE_H
 #define _ASM_X86_PGTABLE_H
 
+#include <linux/mem_encrypt.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))     \
         : (prot))
 
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
+
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -38,6 +48,8 @@ extern struct list_head pgd_list;
 
 extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
+extern pmdval_t early_pmd_flags;
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
        return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+       return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
 static inline int p4d_large(p4d_t p4d)
 {
        /* No 512 GiB pages yet */
@@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pmd_page(pmd)          \
-       pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
+#define pmd_page(pmd)  pfn_to_page(pmd_pfn(pmd))
 
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pud_page(pud)          \
-       pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
+#define pud_page(pud)  pfn_to_page(pud_pfn(pud))
 
 /* Find an entry in the second-level page table.. */
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define p4d_page(p4d)          \
-       pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+#define p4d_page(p4d)  pfn_to_page(p4d_pfn(p4d))
 
 /* Find an entry in the third-level page table.. */
 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pgd_page(pgd)          pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd)  pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
index bf9638e1ee4215d4101d2836d5c3963d59e6dbab..399261ce904ca1df269e5194b8e523d73b8a3f69 100644 (file)
@@ -2,6 +2,8 @@
 #define _ASM_X86_PGTABLE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
 #include <asm/page_types.h>
 
 #define FIRST_USER_ADDRESS     0UL
 
 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
-                        _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
-                        _PAGE_DIRTY)
+#define _PAGE_TABLE_NOENC      (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+                                _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC    (_PAGE_PRESENT | _PAGE_RW |             \
+                                _PAGE_ACCESSED | _PAGE_DIRTY)
 
 /*
  * Set of bits not changed in pte_modify.  The pte's
@@ -159,6 +161,7 @@ enum page_cache_mode {
 
 #define _PAGE_CACHE_MASK       (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
 #define _PAGE_NOCACHE          (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP         (cachemode2protval(_PAGE_CACHE_MODE_WP))
 
 #define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
 #define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -187,22 +190,42 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_VVAR             (__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP               (__PAGE_KERNEL | _PAGE_CACHE_WP)
 
 #define __PAGE_KERNEL_IO               (__PAGE_KERNEL)
 #define __PAGE_KERNEL_IO_NOCACHE       (__PAGE_KERNEL_NOCACHE)
 
-#define PAGE_KERNEL                    __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO                 __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC               __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX                 __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE            __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE              __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC         __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL           __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VVAR               __pgprot(__PAGE_KERNEL_VVAR)
+#ifndef __ASSEMBLY__
+
+#define _PAGE_ENC      (_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
+                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
+                        _PAGE_DIRTY | _PAGE_ENC)
+
+#define __PAGE_KERNEL_ENC      (__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP   (__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC    (__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
+
+#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC      __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX         __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VSYSCALL   __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR       __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO         __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
 
-#define PAGE_KERNEL_IO                 __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE         __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#endif /* __ASSEMBLY__ */
 
 /*         xwr */
 #define __P000 PAGE_NONE
@@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
 #else
 #include <asm-generic/pgtable-nop4d.h>
 
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+       return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
 static inline p4dval_t native_p4d_val(p4d_t p4d)
 {
        return native_pgd_val(p4d.pgd);
index 79aa2f98398d4eaabf45a1493baaa072f11782ec..dc723b64acf0675689c1feb187bfd957480c5d94 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_PROCESSOR_FLAGS_H
 
 #include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_VM86
 #define X86_VM_MASK    X86_EFLAGS_VM
  * CR3_ADDR_MASK is the mask used by read_cr3_pa().
  */
 #ifdef CONFIG_X86_64
-/* Mask off the address space ID bits. */
-#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
-#define CR3_PCID_MASK 0xFFFull
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK  __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK  0xFFFull
+#define CR3_NOFLUSH    BIT_ULL(63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
  * a tiny bit of code size by setting all the bits.
  */
-#define CR3_ADDR_MASK 0xFFFFFFFFull
-#define CR3_PCID_MASK 0ull
+#define CR3_ADDR_MASK  0xFFFFFFFFull
+#define CR3_PCID_MASK  0ull
+#define CR3_NOFLUSH    0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
index 028245e1c42b23d1498643427ebb73be25ded661..c61bab07a84e05a0a21afc9e166db6e37fa15442 100644 (file)
@@ -29,6 +29,7 @@ struct vm86;
 #include <linux/math64.h>
 #include <linux/err.h>
 #include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
 
 /*
  * We handle most unaligned accesses in hardware.  On the other hand
@@ -239,9 +240,14 @@ static inline unsigned long read_cr3_pa(void)
        return __read_cr3() & CR3_ADDR_MASK;
 }
 
+static inline unsigned long native_read_cr3_pa(void)
+{
+       return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
-       write_cr3(__pa(pgdir));
+       write_cr3(__sme_pa(pgdir));
 }
 
 #ifdef CONFIG_X86_32
@@ -802,7 +808,9 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define IA32_PAGE_OFFSET       PAGE_OFFSET
 #define TASK_SIZE              PAGE_OFFSET
+#define TASK_SIZE_LOW          TASK_SIZE
 #define TASK_SIZE_MAX          TASK_SIZE
+#define DEFAULT_MAP_WINDOW     TASK_SIZE
 #define STACK_TOP              TASK_SIZE
 #define STACK_TOP_MAX          STACK_TOP
 
@@ -842,7 +850,9 @@ static inline void spin_lock_prefetch(const void *x)
  * particular problem by preventing anything from being mapped
  * at the maximum canonical address.
  */
-#define TASK_SIZE_MAX  ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX  ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW     ((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
@@ -850,12 +860,14 @@ static inline void spin_lock_prefetch(const void *x)
 #define IA32_PAGE_OFFSET       ((current->personality & ADDR_LIMIT_3GB) ? \
                                        0xc0000000 : 0xFFFFe000)
 
+#define TASK_SIZE_LOW          (test_thread_flag(TIF_ADDR32) ? \
+                                       IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
 #define TASK_SIZE              (test_thread_flag(TIF_ADDR32) ? \
                                        IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 #define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
                                        IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
-#define STACK_TOP              TASK_SIZE
+#define STACK_TOP              TASK_SIZE_LOW
 #define STACK_TOP_MAX          TASK_SIZE_MAX
 
 #define INIT_THREAD  {                                         \
@@ -876,7 +888,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
  * space during mmap's.
  */
 #define __TASK_UNMAPPED_BASE(task_size)        (PAGE_ALIGN(task_size / 3))
-#define TASK_UNMAPPED_BASE             __TASK_UNMAPPED_BASE(TASK_SIZE)
+#define TASK_UNMAPPED_BASE             __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
 
 #define KSTK_EIP(task)         (task_pt_regs(task)->ip)
 
index 230e1903acf07faa831c8d2496a4457aaae9b5ea..90d91520c13ab09b566ca0a599a6170353153e47 100644 (file)
@@ -1,6 +1,15 @@
 #ifndef _ARCH_X86_REALMODE_H
 #define _ARCH_X86_REALMODE_H
 
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT                0
+#define TH_FLAGS_SME_ACTIVE            BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/io.h>
 
@@ -38,6 +47,7 @@ struct trampoline_header {
        u64 start;
        u64 efer;
        u32 cr4;
+       u32 flags;
 #endif
 };
 
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
 void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ARCH_X86_REALMODE_H */
index eaec6c364e42d07f55930a80ef29b9d5c248165d..cd71273ec49d91aacffe3eff8883617ef94d8627 100644 (file)
@@ -11,6 +11,7 @@
  * Executability : eXecutable, NoteXecutable
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
+ * Encryption    : Encrypted, Decrypted
  *
  * Within a category, the attributes are mutually exclusive.
  *
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
 int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
index 50ea3482e1d1d0babfecf7864a6e9307ba6651fe..d23e61dc0640e451d8d1f997fad65af11b30dbfa 100644 (file)
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
        __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
 }
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+       u64 new_tlb_gen;
+
+       /*
+        * Bump the generation count.  This also serves as a full barrier
+        * that synchronizes with switch_mm(): callers are required to order
+        * their read of mm_cpumask after their writes to the paging
+        * structures.
+        */
+       smp_mb__before_atomic();
+       new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+       smp_mb__after_atomic();
+
+       return new_tlb_gen;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+struct tlb_context {
+       u64 ctx_id;
+       u64 tlb_gen;
+};
+
 struct tlb_state {
        /*
         * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -73,13 +101,35 @@ struct tlb_state {
         * mode even if we've already switched back to swapper_pg_dir.
         */
        struct mm_struct *loaded_mm;
-       int state;
+       u16 loaded_mm_asid;
+       u16 next_asid;
 
        /*
         * Access to this CR4 shadow and to H/W CR4 is protected by
         * disabling interrupts when modifying either one.
         */
        unsigned long cr4;
+
+       /*
+        * This is a list of all contexts that might exist in the TLB.
+        * There is one per ASID that we use, and the ASID (what the
+        * CPU calls PCID) is the index into ctxts.
+        *
+        * For each context, ctx_id indicates which mm the TLB's user
+        * entries came from.  As an invariant, the TLB will never
+        * contain entries that were out-of-date as of the time when that
+        * mm reached the tlb_gen in the list.
+        *
+        * To be clear, this means that it's legal for the TLB code to
+        * flush the TLB without updating tlb_gen.  This can happen
+        * (for now, at least) due to paravirt remote flushes.
+        *
+        * NB: context 0 is a bit special, since it's also used by
+        * various bits of init code.  This is fine -- code that
+        * isn't aware of PCID will end up harmlessly flushing
+        * context 0.
+        */
+       struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void)
                __flush_tlb_global();
        else
                __flush_tlb();
+
+       /*
+        * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+        * we'd end up flushing kernel translations for the current ASID but
+        * we might fail to flush kernel translations for other cached ASIDs.
+        *
+        * To avoid this issue, we force PCID off if PGE is off.
+        */
 }
 
 static inline void __flush_tlb_one(unsigned long addr)
@@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-       struct mm_struct *mm;
-       unsigned long start;
-       unsigned long end;
+       /*
+        * We support several kinds of flushes.
+        *
+        * - Fully flush a single mm.  .mm will be set, .end will be
+        *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+        *   which the IPI sender is trying to catch us up.
+        *
+        * - Partially flush a single mm.  .mm will be set, .start and
+        *   .end will indicate the range, and .new_tlb_gen will be set
+        *   such that the changes between generation .new_tlb_gen-1 and
+        *   .new_tlb_gen are entirely contained in the indicated range.
+        *
+        * - Fully flush all mms whose tlb_gens have been updated.  .mm
+        *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+        *   will be zero.
+        */
+       struct mm_struct        *mm;
+       unsigned long           start;
+       unsigned long           end;
+       u64                     new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
@@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info);
 
-#define TLBSTATE_OK    1
-#define TLBSTATE_LAZY  2
-
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
                                        struct mm_struct *mm)
 {
+       inc_mm_tlb_gen(mm);
        cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
index c4b9dc2f67c5f6f7a095dd7d4a3a6bb7fd7b3975..9f42beefc67a3d5d5242e054425c713ca394553e 100644 (file)
@@ -7,12 +7,24 @@
 #ifndef _ASM_X86_VGA_H
 #define _ASM_X86_VGA_H
 
+#include <asm/set_memory.h>
+
 /*
  *     On the PC, we can just recalculate addresses and then
  *     access the videoram directly without any black magic.
+ *     To support memory encryption however, we need to access
+ *     the videoram as decrypted memory.
  */
 
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s)                                      \
+({                                                             \
+       unsigned long start = (unsigned long)phys_to_virt(x);   \
+                                                               \
+       if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))                 \
+               set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
+                                                               \
+       start;                                                  \
+})
 
 #define vga_readb(x) (*(x))
 #define vga_writeb(x, y) (*(y) = (x))
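
For illustration only (not part of the patch; the window address and size are the conventional legacy VGA values and the example_ name is made up), a caller of the reworked macro would see something like:

        static void example_poke_legacy_vga(void)
        {
                /* 0xa0000 / 64K is the usual legacy VGA window */
                unsigned long vram = VGA_MAP_MEM(0xa0000, 0x10000);

                /*
                 * With CONFIG_AMD_MEM_ENCRYPT and SME active, the window is
                 * now mapped decrypted, so the byte written below is what the
                 * display hardware actually sees rather than its ciphertext.
                 */
                vga_writeb(0x41, (u8 *)vram);
        }
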
index 7491e73d92530bf868a9b91b2c71c892bcbdb94a..97bb2caf342879ba365582d3bd7b1caab5f77dfe 100644 (file)
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
 #define        ACPI_INVALID_GSI                INT_MIN
 
 /*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
  * with sanity checks for phys == 0 and size == 0.
  */
 char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
        if (!phys || !size)
                return NULL;
 
-       return early_ioremap(phys, size);
+       return early_memremap(phys, size);
 }
 
 void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
        if (!map || !size)
                return;
 
-       early_iounmap(map, size);
+       early_memunmap(map, size);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
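
The switch in acpi/boot.c from early_ioremap() to early_memremap() matters under SME: the ACPI tables live in ordinary RAM, so they need the regular (encryption-aware) memory attributes rather than a device mapping. A rough caller-side sketch, purely illustrative (the example_ name is made up; real users go through the ACPI core):

        static void __init example_peek_acpi_table(unsigned long phys)
        {
                struct acpi_table_header *hdr;

                hdr = (struct acpi_table_header *)__acpi_map_table(phys, sizeof(*hdr));
                if (!hdr)
                        return;

                pr_info("ACPI table %.4s, length %u\n", hdr->signature, hdr->length);

                __acpi_unmap_table((char *)hdr, sizeof(*hdr));
        }
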
index 3b9e220621f83c8a5161e8b57b297233370f72ea..110ca5d2bb872a7f15cffe4349ffab74b03a4c86 100644 (file)
@@ -548,8 +548,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
+       u32 dummy;
+
        early_init_amd_mc(c);
 
+       rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
        /*
         * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
         * with P/T states and does not stop in deep C-states
@@ -612,6 +616,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
         */
        if (cpu_has_amd_erratum(c, amd_erratum_400))
                set_cpu_bug(c, X86_BUG_AMD_E400);
+
+       /*
+        * BIOS support is required for SME. If BIOS has enabled SME then
+        * adjust x86_phys_bits by the SME physical address space reduction
+        * value. If BIOS has not enabled SME then don't advertise the
+        * feature (set in scattered.c). Also, since the SME support requires
+        * long mode, don't advertise the feature under CONFIG_X86_32.
+        */
+       if (cpu_has(c, X86_FEATURE_SME)) {
+               u64 msr;
+
+               /* Check if SME is enabled */
+               rdmsrl(MSR_K8_SYSCFG, msr);
+               if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+                       c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+                       if (IS_ENABLED(CONFIG_X86_32))
+                               clear_cpu_cap(c, X86_FEATURE_SME);
+               } else {
+                       clear_cpu_cap(c, X86_FEATURE_SME);
+               }
+       }
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -730,8 +755,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd(struct cpuinfo_x86 *c)
 {
-       u32 dummy;
-
        early_init_amd(c);
 
        /*
@@ -793,8 +816,6 @@ static void init_amd(struct cpuinfo_x86 *c)
        if (c->x86 > 0x11)
                set_cpu_cap(c, X86_FEATURE_ARAT);
 
-       rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
        /* 3DNow or LM implies PREFETCHW */
        if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
                if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
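
The new early_init_amd() block above keys off CPUID leaf 0x8000001f together with the SYSCFG MSR: EBX[5:0] of that leaf gives the position of the encryption (C) bit, commonly 47, and EBX[11:6] the number of physical address bits lost to it. Condensed to a sketch (illustrative only; the example_ name is made up and the full logic is the hunk above plus sme_enable() in the new mem_encrypt code):

        static void example_sme_phys_bits(struct cpuinfo_x86 *c)
        {
                u32 ebx = cpuid_ebx(0x8000001f);
                u64 syscfg;

                rdmsrl(MSR_K8_SYSCFG, syscfg);
                if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT))
                        return;                         /* BIOS left SME disabled */

                /* ebx & 0x3f       : bit position of the C-bit              */
                /* (ebx >> 6) & 0x3f: physical address space reduction, bits */
                c->x86_phys_bits -= (ebx >> 6) & 0x3f;
        }
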
index 0af86d9242da0f6882f1f5252dfa659038c627ac..db684880d74ae47fbff37888ff31dd13b9a4653b 100644 (file)
 
 void __init check_bugs(void)
 {
+#ifdef CONFIG_X86_32
+       /*
+        * Regardless of whether PCID is enumerated, the SDM says
+        * that it can't be enabled in 32-bit mode.
+        */
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
        identify_boot_cpu();
 
        if (!IS_ENABLED(CONFIG_SMP)) {
index c8b39870f33e8d5579eb1b30ecd24ace23de9d86..b95cd94ca97bc191121e87bd5c0471d0ad8de494 100644 (file)
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+       /* require an exact match without trailing characters */
+       if (strlen(s))
+               return 0;
+
+       /* do not emit a message if the feature is not present */
+       if (!boot_cpu_has(X86_FEATURE_PCID))
+               return 1;
+
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
+       pr_info("nopcid: PCID feature disabled\n");
+       return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
 static int __init x86_noinvpcid_setup(char *s)
 {
        /* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
        }
 }
 
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+       if (cpu_has(c, X86_FEATURE_PCID)) {
+               if (cpu_has(c, X86_FEATURE_PGE)) {
+                       cr4_set_bits(X86_CR4_PCIDE);
+               } else {
+                       /*
+                        * flush_tlb_all(), as currently implemented, won't
+                        * work if PCID is on but PGE is not.  Since that
+                        * combination doesn't exist on real hardware, there's
+                        * no reason to try to fully support it, but it's
+                        * polite to avoid corrupting data if we're on
+                        * an improperly configured VM.
+                        */
+                       clear_cpu_cap(c, X86_FEATURE_PCID);
+               }
+       }
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
        setup_smep(c);
        setup_smap(c);
 
+       /* Set up PCID */
+       setup_pcid(c);
+
        /*
         * The vendor-specific functions might have changed features.
         * Now we do "generic changes."
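
Taken together, the two hunks above mean CR4.PCIDE is set only when the CPU enumerates both PCID and PGE and the user has not booted with nopcid (64-bit only). A small sketch of how later code can test the outcome (illustrative; the example_ name is made up):

        static bool example_pcid_enabled(void)
        {
                /* True only if PCID is enumerated, PGE exists and "nopcid"
                 * was not passed on the kernel command line. */
                return boot_cpu_has(X86_FEATURE_PCID) &&
                       (__read_cr4() & X86_CR4_PCIDE);
        }
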
index 6dde0497efc7514b9981bd2d33014a6686c65677..3b413065c61308104794db2efe6ab939b39411a1 100644 (file)
@@ -51,6 +51,7 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/reboot.h>
+#include <asm/set_memory.h>
 
 #include "mce-internal.h"
 
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
        return ret;
 }
 
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+       unsigned long decoy_addr;
+
+       /*
+        * Unmap this page from the kernel 1:1 mappings to make sure
+        * we don't log more errors because of speculative access to
+        * the page.
+        * We would like to just call:
+        *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+        * but doing that would radically increase the odds of a
+        * speculative access to the poison page because we'd have
+        * the virtual address of the kernel 1:1 mapping sitting
+        * around in registers.
+        * Instead we get tricky.  We create a non-canonical address
+        * that looks just like the one we want, but has bit 63 flipped.
+        * This relies on set_memory_np() not checking whether we passed
+        * a legal address.
+        */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+       if (set_memory_np(decoy_addr, 1))
+               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
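
The decoy address built above keeps the canonical 1:1 virtual address of the poisoned page out of registers while still letting set_memory_np() find the right PTE, since the page-table walk only consumes the low address bits. A worked example with an illustrative pfn and the default (non-KASLR) 4-level PAGE_OFFSET:

        /*
         * Illustration only (assumes PAGE_OFFSET = 0xffff880000000000):
         *
         *   pfn                  = 0x12345
         *   1:1 mapping address  = PAGE_OFFSET + (pfn << PAGE_SHIFT)
         *                        = 0xffff880012345000
         *   decoy_addr           = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63))
         *                        = 0x7fff880012345000
         *
         * Bit 63 is clear while bits 62..47 are set, so the decoy is
         * non-canonical and useless to a speculative load, yet set_memory_np()
         * still resolves it to the same PTE as the real 1:1 address.
         */
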
index 23c23508c0125e50caf69a6959aaf000151cf572..05459ad3db46e2139b7d97514899d398c321c541 100644 (file)
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
        { X86_FEATURE_HW_PSTATE,        CPUID_EDX,  7, 0x80000007, 0 },
        { X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
        { X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+       { X86_FEATURE_SME,              CPUID_EAX,  0, 0x8000001f, 0 },
        { 0, 0, 0, 0, 0 }
 };
 
index 532da61d605ccc2271067fdf67bac83c454616aa..71c11ad5643e80059d4f262002fc9620044b594b 100644 (file)
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
  * Note: this function only works correctly once the E820 table is sorted and
  * not-overlapping (at least for the range specified), which is the case normally.
  */
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+                                            enum e820_type type)
 {
        int i;
 
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
                 * coverage of the desired range exists:
                 */
                if (start >= end)
-                       return 1;
+                       return entry;
        }
-       return 0;
+
+       return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+       return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+       struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+       return entry ? entry->type : -EINVAL;
 }
 
 /*
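
e820__get_entry_type() reuses the coverage walk with a type of 0 (no type filtering) and reports the covering entry's type, or -EINVAL when the range is not fully covered. A one-line usage sketch (illustrative; the example_ name is made up, and the new SME/setup_data code is the intended consumer):

        static bool example_range_is_ram(u64 paddr, u64 size)
        {
                /* True when all of [paddr, paddr + size) is ordinary RAM */
                return e820__get_entry_type(paddr, paddr + size) == E820_TYPE_RAM;
        }
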
index 6b91e2eb8d3f8a5b8ad1a57c7a2a47d0a3b4440d..9c4e7ba6870c142921cfbbd07b8bbf45285e5c07 100644 (file)
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
 
        pte_p = pte_offset_kernel(&pmd, addr);
        stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
-       pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+       pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
        for (n = 0; n < ESPFIX_PTE_CLONES; n++)
                set_pte(&pte_p[n*PTE_STRIDE], pte);
 
index 9ba79543d9ee9f1ee19bce38955467afa04b125f..6a193b93fd952d59b4bca8a2071859edf8bcfcb6 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/start_kernel.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -33,7 +34,6 @@
 /*
  * Manage page tables very early on.
  */
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
        return ptr - (void *)_text + (void *)physaddr;
 }
 
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+                                 struct boot_params *bp)
 {
        unsigned long load_delta, *p;
+       unsigned long pgtable_flags;
        pgdval_t *pgd;
        p4dval_t *p4d;
        pudval_t *pud;
@@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
        if (load_delta & ~PMD_PAGE_MASK)
                for (;;);
 
+       /* Activate Secure Memory Encryption (SME) if supported and enabled */
+       sme_enable(bp);
+
+       /* Include the SME encryption mask in the fixup value */
+       load_delta += sme_get_me_mask();
+
        /* Fixup the physical addresses in the page table */
 
        pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr)
         * creates a bunch of nonsense entries but that is fine --
         * it avoids problems around wraparound.
         */
+
        next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
        pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
        pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
 
+       pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-               pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
-               pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+               pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+               pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
 
                i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
-               p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-               p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+               p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+               p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
        } else {
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-               pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-               pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+               pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+               pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
        }
 
        i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
-       pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
-       pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+       pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+       pud[i + 1] = (pudval_t)pmd + pgtable_flags;
 
        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+       pmd_entry += sme_get_me_mask();
        pmd_entry +=  physaddr;
 
        for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
                        pmd[i] += load_delta;
        }
 
-       /* Fixup phys_base */
+       /*
+        * Fixup phys_base - remove the memory encryption mask to obtain
+        * the true physical address.
+        */
        p = fixup_pointer(&phys_base, physaddr);
-       *p += load_delta;
+       *p += load_delta - sme_get_me_mask();
+
+       /* Encrypt the kernel (if SME is active) */
+       sme_encrypt_kernel();
+
+       /*
+        * Return the SME encryption mask (if SME is active) to be used as a
+        * modifier for the initial pgdir entry programmed into CR3.
+        */
+       return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+       /*
+        * Return the SME encryption mask (if SME is active) to be used as a
+        * modifier for the initial pgdir entry programmed into CR3.
+        */
+       return sme_get_me_mask();
 }
 
 /* Wipe all early page tables except for the kernel symbol map */
@@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void)
 {
        memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
        next_early_pgt = 0;
-       write_cr3(__pa_nodebug(early_top_pgt));
+       write_cr3(__sme_pa_nodebug(early_top_pgt));
 }
 
 /* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
 {
        unsigned long physaddr = address - __PAGE_OFFSET;
        pgdval_t pgd, *pgd_p;
        p4dval_t p4d, *p4d_p;
        pudval_t pud, *pud_p;
-       pmdval_t pmd, *pmd_p;
+       pmdval_t *pmd_p;
 
        /* Invalid address or early pgt is done ?  */
        if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -216,12 +249,21 @@ again:
                memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
        }
-       pmd = (physaddr & PMD_MASK) + early_pmd_flags;
        pmd_p[pmd_index(address)] = pmd;
 
        return 0;
 }
 
+int __init early_make_pgtable(unsigned long address)
+{
+       unsigned long physaddr = address - __PAGE_OFFSET;
+       pmdval_t pmd;
+
+       pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+       return __early_make_pgtable(address, pmd);
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
        char * command_line;
        unsigned long cmd_line_ptr;
 
+       /*
+        * If SME is active, this will create decrypted mappings of the
+        * boot data in advance of the copy operations.
+        */
+       sme_map_bootdata(real_mode_data);
+
        memcpy(&boot_params, real_mode_data, sizeof boot_params);
        sanitize_boot_params(&boot_params);
        cmd_line_ptr = get_cmd_line_ptr();
@@ -251,6 +299,14 @@ static void __init copy_bootdata(char *real_mode_data)
                command_line = __va(cmd_line_ptr);
                memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
        }
+
+       /*
+        * The old boot data is no longer needed and won't be reserved,
+        * freeing up that memory for use by the system. If SME is active,
+        * we need to remove the mappings that were created so that the
+        * memory doesn't remain mapped as decrypted.
+        */
+       sme_unmap_bootdata(real_mode_data);
 }
 
 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
@@ -280,6 +336,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
        clear_page(init_top_pgt);
 
+       /*
+        * SME support may update early_pmd_flags to include the memory
+        * encryption mask, so it needs to be called before anything
+        * that may generate a page fault.
+        */
+       sme_early_init();
+
        kasan_early_init();
 
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
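
__startup_64() now mixes sme_get_me_mask() into every early page-table entry (via pgtable_flags), into load_delta, and into its return value so that head_64.S can add the same mask into the initial CR3 value. A worked example with illustrative numbers (a C-bit at position 47 is typical but not guaranteed):

        /*
         * Illustration only, assuming the C-bit is physical bit 47 and a page
         * table page sits at physical 0x1000000:
         *
         *   sme_get_me_mask()             = 1ULL << 47     = 0x0000800000000000
         *   _KERNPG_TABLE_NOENC           (P|RW|A|D)       = 0x0000000000000063
         *   pgtable_flags = NOENC + mask                   = 0x0000800000000063
         *   pgd/p4d/pud entry = 0x1000000 + pgtable_flags  = 0x0000800001000063
         *
         * With SME inactive the mask is 0 and these collapse to the old
         * unencrypted values, so one code path serves both cases.
         */
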
index 6225550883dfe1e98bcf35ac5ed870e334b34399..513cbb012eccc51f18ac16f9dac74a33abaa4c16 100644 (file)
@@ -73,12 +73,19 @@ startup_64:
        /* Sanitize CPU configuration */
        call verify_cpu
 
+       /*
+        * Perform pagetable fixups. Additionally, if SME is active, encrypt
+        * the kernel and retrieve the modifier (SME encryption mask if SME
+        * is active) to be added to the initial pgdir entry that will be
+        * programmed into CR3.
+        */
        leaq    _text(%rip), %rdi
        pushq   %rsi
        call    __startup_64
        popq    %rsi
 
-       movq    $(early_top_pgt - __START_KERNEL_map), %rax
+       /* Form the CR3 value being sure to include the CR3 modifier */
+       addq    $(early_top_pgt - __START_KERNEL_map), %rax
        jmp 1f
 ENTRY(secondary_startup_64)
        /*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
        /* Sanitize CPU configuration */
        call verify_cpu
 
-       movq    $(init_top_pgt - __START_KERNEL_map), %rax
+       /*
+        * Retrieve the modifier (SME encryption mask if SME is active) to be
+        * added to the initial pgdir entry that will be programmed into CR3.
+        */
+       pushq   %rsi
+       call    __startup_secondary_64
+       popq    %rsi
+
+       /* Form the CR3 value being sure to include the CR3 modifier */
+       addq    $(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
        /* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
 NEXT_PAGE(early_top_pgt)
        .fill   511,8,0
 #ifdef CONFIG_X86_5LEVEL
-       .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
        .fill   512,8,0
 #else
 NEXT_PAGE(init_top_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level3_ident_pgt)
-       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .fill   511, 8, 0
 NEXT_PAGE(level2_ident_pgt)
        /* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
 #ifdef CONFIG_X86_5LEVEL
 NEXT_PAGE(level4_kernel_pgt)
        .fill   511,8,0
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
        /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level2_kernel_pgt)
        /*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
 
 NEXT_PAGE(level2_fixmap_pgt)
        .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
        .fill   5,8,0
 
index 38b64587b31be5611a763df6dafe8434db2a66b5..fd6f8fbbe6f2a05d061c3e0196c313293ae8ad08 100644 (file)
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
        struct setup_data_node *node = file->private_data;
        unsigned long remain;
        loff_t pos = *ppos;
-       struct page *pg;
        void *p;
        u64 pa;
 
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
                count = node->len - pos;
 
        pa = node->paddr + sizeof(struct setup_data) + pos;
-       pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
-       if (PageHighMem(pg)) {
-               p = ioremap_cache(pa, count);
-               if (!p)
-                       return -ENXIO;
-       } else
-               p = __va(pa);
+       p = memremap(pa, count, MEMREMAP_WB);
+       if (!p)
+               return -ENOMEM;
 
        remain = copy_to_user(user_buf, p, count);
 
-       if (PageHighMem(pg))
-               iounmap(p);
+       memunmap(p);
 
        if (remain)
                return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
        struct setup_data *data;
        int error;
        struct dentry *d;
-       struct page *pg;
        u64 pa_data;
        int no = 0;
 
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                        goto err_dir;
                }
 
-               pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
-               if (PageHighMem(pg)) {
-                       data = ioremap_cache(pa_data, sizeof(*data));
-                       if (!data) {
-                               kfree(node);
-                               error = -ENXIO;
-                               goto err_dir;
-                       }
-               } else
-                       data = __va(pa_data);
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+               if (!data) {
+                       kfree(node);
+                       error = -ENOMEM;
+                       goto err_dir;
+               }
 
                node->paddr = pa_data;
                node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                error = create_setup_data_node(d, no, node);
                pa_data = data->next;
 
-               if (PageHighMem(pg))
-                       iounmap(data);
+               memunmap(data);
                if (error)
                        goto err_dir;
                no++;
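
Replacing the PageHighMem()/ioremap_cache()/__va() special-casing with memremap(..., MEMREMAP_WB) leaves one path that also does the right thing when the boot data is SME-encrypted. Reduced to a sketch (illustrative; the example_ name is made up):

        static int example_copy_setup_data(u64 paddr, void *dst, size_t len)
        {
                void *p = memremap(paddr, len, MEMREMAP_WB);

                if (!p)
                        return -ENOMEM;

                memcpy(dst, p, len);
                memunmap(p);

                return 0;
        }
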
index 06e1ff5562c0b4a5a28f3c052ed74b545c7ccca0..4b0592ca9e47b332d0ce67f8bcf5f555653587b2 100644 (file)
@@ -16,8 +16,8 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 
-#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
                        *paddr = pa_data;
                        return 0;
                }
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
 
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
                i++;
        }
        return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
        u64 pa_data = boot_params.hdr.setup_data;
 
        while (pa_data) {
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
                if (nr == i) {
                        *size = data->len;
-                       iounmap(data);
+                       memunmap(data);
                        return 0;
                }
 
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
                i++;
        }
        return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
        ret = get_setup_data_paddr(nr, &paddr);
        if (ret)
                return ret;
-       data = ioremap_cache(paddr, sizeof(*data));
+       data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
        if (!data)
                return -ENOMEM;
 
        ret = sprintf(buf, "0x%x\n", data->type);
-       iounmap(data);
+       memunmap(data);
        return ret;
 }
 
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
        ret = get_setup_data_paddr(nr, &paddr);
        if (ret)
                return ret;
-       data = ioremap_cache(paddr, sizeof(*data));
+       data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
        if (!data)
                return -ENOMEM;
 
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
                goto out;
 
        ret = count;
-       p = ioremap_cache(paddr + sizeof(*data), data->len);
+       p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
        if (!p) {
                ret = -ENOMEM;
                goto out;
        }
        memcpy(buf, p + off, count);
-       iounmap(p);
+       memunmap(p);
 out:
-       iounmap(data);
+       memunmap(data);
        return ret;
 }
 
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
        *nr = 0;
        while (pa_data) {
                *nr += 1;
-               data = ioremap_cache(pa_data, sizeof(*data));
+               data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data) {
                        ret = -ENOMEM;
                        goto out;
                }
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
        }
 
 out:
index cb0a30473c2310b76695c73ec6fad3cd1e7b051f..1f790cf9d38fe0e10e46eaf9b5bef945d25a9370 100644 (file)
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);
-       set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+       set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
        return 0;
 err:
        free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
                .alloc_pgt_page = alloc_pgt_page,
                .context        = image,
                .page_flag      = __PAGE_KERNEL_LARGE_EXEC,
+               .kernpg_flag    = _KERNPG_TABLE_NOENC,
        };
        unsigned long mstart, mend;
        pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
-                                      image->preserve_context);
+                                      image->preserve_context,
+                                      sme_active());
 
 #ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
 {
        kexec_mark_crashkres(false);
 }
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+       /*
+        * If SME is active we need to be sure that kexec pages are
+        * not encrypted because when we boot to the new kernel the
+        * pages won't be accessed encrypted (initially).
+        */
+       return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+       /*
+        * If SME is active we need to reset the pages back to being
+        * an encrypted mapping before freeing them.
+        */
+       set_memory_encrypted((unsigned long)vaddr, pages);
+}
index 0d904d759ff1d42c97e57a7ef5e126bd961b8b03..5cbb3177ed17270b993a4c1a686282ce80608ed1 100644 (file)
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
        }
 }
 
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
 
 static unsigned long __init get_mpc_size(unsigned long physptr)
 {
        struct mpc_table *mpc;
        unsigned long size;
 
-       mpc = early_ioremap(physptr, PAGE_SIZE);
+       mpc = early_memremap(physptr, PAGE_SIZE);
        size = mpc->length;
-       early_iounmap(mpc, PAGE_SIZE);
+       early_memunmap(mpc, PAGE_SIZE);
        apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
 
        return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
        unsigned long size;
 
        size = get_mpc_size(mpf->physptr);
-       mpc = early_ioremap(mpf->physptr, size);
+       mpc = early_memremap(mpf->physptr, size);
+
        /*
         * Read the physical hardware table.  Anything here will
         * override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #endif
                pr_err("BIOS bug, MP table errors detected!...\n");
                pr_cont("... disabling SMP support. (tell your hw vendor)\n");
-               early_iounmap(mpc, size);
+               early_memunmap(mpc, size);
                return -1;
        }
-       early_iounmap(mpc, size);
+       early_memunmap(mpc, size);
 
        if (early)
                return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
  */
 void __init default_get_smp_config(unsigned int early)
 {
-       struct mpf_intel *mpf = mpf_found;
+       struct mpf_intel *mpf;
 
        if (!smp_found_config)
                return;
 
-       if (!mpf)
+       if (!mpf_base)
                return;
 
        if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
        if (acpi_lapic && acpi_ioapic)
                return;
 
+       mpf = early_memremap(mpf_base, sizeof(*mpf));
+       if (!mpf) {
+               pr_err("MPTABLE: error mapping MP table\n");
+               return;
+       }
+
        pr_info("Intel MultiProcessor Specification v1.%d\n",
                mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
        /*
         * Now see if we need to read further.
         */
-       if (mpf->feature1 != 0) {
+       if (mpf->feature1) {
                if (early) {
                        /*
                         * local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
                construct_default_ISA_mptable(mpf->feature1);
 
        } else if (mpf->physptr) {
-               if (check_physptr(mpf, early))
+               if (check_physptr(mpf, early)) {
+                       early_memunmap(mpf, sizeof(*mpf));
                        return;
+               }
        } else
                BUG();
 
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
        /*
         * Only use the first configuration found.
         */
+
+       early_memunmap(mpf, sizeof(*mpf));
 }
 
 static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
 
 static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
-       unsigned int *bp = phys_to_virt(base);
+       unsigned int *bp;
        struct mpf_intel *mpf;
-       unsigned long mem;
+       int ret = 0;
 
        apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
                    base, base + length - 1);
        BUILD_BUG_ON(sizeof(*mpf) != 16);
 
        while (length > 0) {
+               bp = early_memremap(base, length);
                mpf = (struct mpf_intel *)bp;
                if ((*bp == SMP_MAGIC_IDENT) &&
                    (mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #ifdef CONFIG_X86_LOCAL_APIC
                        smp_found_config = 1;
 #endif
-                       mpf_found = mpf;
+                       mpf_base = base;
 
-                       pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-                               (unsigned long long) virt_to_phys(mpf),
-                               (unsigned long long) virt_to_phys(mpf) +
-                               sizeof(*mpf) - 1, mpf);
+                       pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+                               base, base + sizeof(*mpf) - 1, mpf);
 
-                       mem = virt_to_phys(mpf);
-                       memblock_reserve(mem, sizeof(*mpf));
+                       memblock_reserve(base, sizeof(*mpf));
                        if (mpf->physptr)
                                smp_reserve_memory(mpf);
 
-                       return 1;
+                       ret = 1;
                }
-               bp += 4;
+               early_memunmap(bp, length);
+
+               if (ret)
+                       break;
+
+               base += 16;
                length -= 16;
        }
-       return 0;
+       return ret;
 }
 
 void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
        char oem[10];
        struct mpf_intel *mpf;
        struct mpc_table *mpc, *mpc_new;
+       unsigned long size;
 
        if (!enable_update_mptable)
                return 0;
 
-       mpf = mpf_found;
-       if (!mpf)
+       if (!mpf_base)
                return 0;
 
+       mpf = early_memremap(mpf_base, sizeof(*mpf));
+       if (!mpf) {
+               pr_err("MPTABLE: mpf early_memremap() failed\n");
+               return 0;
+       }
+
        /*
         * Now see if we need to go further.
         */
-       if (mpf->feature1 != 0)
-               return 0;
+       if (mpf->feature1)
+               goto do_unmap_mpf;
 
        if (!mpf->physptr)
-               return 0;
+               goto do_unmap_mpf;
 
-       mpc = phys_to_virt(mpf->physptr);
+       size = get_mpc_size(mpf->physptr);
+       mpc = early_memremap(mpf->physptr, size);
+       if (!mpc) {
+               pr_err("MPTABLE: mpc early_memremap() failed\n");
+               goto do_unmap_mpf;
+       }
 
        if (!smp_check_mpc(mpc, oem, str))
-               return 0;
+               goto do_unmap_mpc;
 
-       pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+       pr_info("mpf: %llx\n", (u64)mpf_base);
        pr_info("physptr: %x\n", mpf->physptr);
 
        if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
                new = mpf_checksum((unsigned char *)mpc, mpc->length);
                if (old == new) {
                        pr_info("mpc is readonly, please try alloc_mptable instead\n");
-                       return 0;
+                       goto do_unmap_mpc;
                }
                pr_info("use in-position replacing\n");
        } else {
+               mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+               if (!mpc_new) {
+                       pr_err("MPTABLE: new mpc early_memremap() failed\n");
+                       goto do_unmap_mpc;
+               }
                mpf->physptr = mpc_new_phys;
-               mpc_new = phys_to_virt(mpc_new_phys);
                memcpy(mpc_new, mpc, mpc->length);
+               early_memunmap(mpc, size);
                mpc = mpc_new;
+               size = mpc_new_length;
                /* check if we can modify that */
                if (mpc_new_phys - mpf->physptr) {
                        struct mpf_intel *mpf_new;
                        /* steal 16 bytes from [0, 1k) */
+                       mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+                       if (!mpf_new) {
+                               pr_err("MPTABLE: new mpf early_memremap() failed\n");
+                               goto do_unmap_mpc;
+                       }
                        pr_info("mpf new: %x\n", 0x400 - 16);
-                       mpf_new = phys_to_virt(0x400 - 16);
                        memcpy(mpf_new, mpf, 16);
+                       early_memunmap(mpf, sizeof(*mpf));
                        mpf = mpf_new;
                        mpf->physptr = mpc_new_phys;
                }
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
         */
        replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
 
+do_unmap_mpc:
+       early_memunmap(mpc, size);
+
+do_unmap_mpf:
+       early_memunmap(mpf, sizeof(*mpf));
+
        return 0;
 }
 
index 5e16d3f2959468f6d05aaa1be5eef2f8b945abb1..0accc2404b9214d1b318577668f103bc41bc8d35 100644 (file)
@@ -93,9 +93,12 @@ again:
        if (gfpflags_allow_blocking(flag)) {
                page = dma_alloc_from_contiguous(dev, count, get_order(size),
                                                 flag);
-               if (page && page_to_phys(page) + size > dma_mask) {
-                       dma_release_from_contiguous(dev, page, count);
-                       page = NULL;
+               if (page) {
+                       addr = phys_to_dma(dev, page_to_phys(page));
+                       if (addr + size > dma_mask) {
+                               dma_release_from_contiguous(dev, page, count);
+                               page = NULL;
+                       }
                }
        }
        /* fallback */
@@ -104,7 +107,7 @@ again:
        if (!page)
                return NULL;
 
-       addr = page_to_phys(page);
+       addr = phys_to_dma(dev, page_to_phys(page));
        if (addr + size > dma_mask) {
                __free_pages(page, get_order(size));
 
index a6d404087fe3285f65de353613153abde4a698a0..4fc3cb60ea11a546b08ae8c2f3fdbb9007dee670 100644 (file)
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
                                 enum dma_data_direction dir,
                                 unsigned long attrs)
 {
-       dma_addr_t bus = page_to_phys(page) + offset;
+       dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
        WARN_ON(size == 0);
        if (!check_addr("map_single", dev, bus, size))
                return NOMMU_MAPPING_ERROR;
index 1e23577e17cf10f87d584e4cdc38f4691db57862..677077510e308ebfabb56d8072a2afed439f23c8 100644 (file)
@@ -6,12 +6,14 @@
 #include <linux/swiotlb.h>
 #include <linux/bootmem.h>
 #include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 #include <asm/xen/swiotlb-xen.h>
 #include <asm/iommu_table.h>
+
 int swiotlb __read_mostly;
 
 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
                  pci_swiotlb_late_init);
 
 /*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
  */
 int __init pci_swiotlb_detect_4gb(void)
 {
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
        if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
                swiotlb = 1;
 #endif
+
+       /*
+        * If SME is active then swiotlb will be set to 1 so that bounce
+        * buffers are allocated and used for devices that do not support
+        * the addressing range required for the encryption mask.
+        */
+       if (sme_active())
+               swiotlb = 1;
+
        return swiotlb;
 }
 IOMMU_INIT(pci_swiotlb_detect_4gb,
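
Forcing swiotlb on when SME is active guarantees a decrypted bounce pool exists, so drivers keep using the ordinary streaming DMA API unchanged and the bounce happens underneath them for devices that cannot address the encryption bit. Sketch of the unchanged driver side (illustrative; the example_ name is made up):

        static int example_send_buffer(struct device *dev, void *buf, size_t len)
        {
                dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

                if (dma_mapping_error(dev, handle))
                        return -ENOMEM;

                /* ... hand 'handle' to the device and wait for completion ... */

                dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);

                return 0;
        }
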
index 3ca198080ea9294486ae9a1121e7815dfba7cb19..bd6b85fac66696da70e316656ad6f0d51291f8aa 100644 (file)
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
        return ret;
 }
 #endif
+
 void stop_this_cpu(void *dummy)
 {
        local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
-       for (;;)
-               halt();
+       for (;;) {
+               /*
+                * Use wbinvd followed by hlt to stop the processor. This
+                * provides support for kexec on a processor that supports
+                * SME. With kexec, going from SME inactive to SME active
+                * requires clearing cache entries so that addresses without
+                * the encryption bit set don't corrupt the same physical
+                * address that has the encryption bit set when caches are
+                * flushed. To achieve this a wbinvd is performed followed by
+                * a hlt. Even if the processor is not in the kexec/SME
+                * scenario this only adds a wbinvd to a halting processor.
+                */
+               asm volatile("wbinvd; hlt" : : : "memory");
+       }
 }
 
 /*
index 98111b38ebfd6eb9949242c5aae7b18bbbdb4489..307d3bac5f04ece485ac1fe42226ee111c0c6e85 100644 (file)
@@ -47,6 +47,7 @@ relocate_kernel:
         * %rsi page_list
         * %rdx start address
         * %rcx preserve_context
+        * %r8  sme_active
         */
 
        /* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
        pushq $0
        popfq
 
+       /* Save SME active flag */
+       movq    %r8, %r12
+
        /*
         * get physical address of control page now
         * this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
        /* Flush the TLB (needed?) */
        movq    %r9, %cr3
 
+       /*
+        * If SME is active, there could be old encrypted cache line
+        * entries that will conflict with the now unencrypted memory
+        * used by kexec. Flush the caches before copying the kernel.
+        */
+       testq   %r12, %r12
+       jz 1f
+       wbinvd
+1:
+
        movq    %rcx, %r11
        call    swap_pages
 
index 3486d04988000b05344a590ce9ab8c86e96d0ec2..0bfe0c1628f638a0ae7b1ab69e9414485fbae147 100644 (file)
@@ -69,6 +69,7 @@
 #include <linux/crash_dump.h>
 #include <linux/tboot.h>
 #include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
 
 #include <linux/usb/xhci-dbgp.h>
 #include <video/edid.h>
@@ -374,6 +375,14 @@ static void __init reserve_initrd(void)
            !ramdisk_image || !ramdisk_size)
                return;         /* No initrd provided by bootloader */
 
+       /*
+        * If SME is active, this memory will be marked encrypted by the
+        * kernel when it is accessed (including relocation). However, the
+        * ramdisk image was loaded decrypted by the bootloader, so make
+        * sure that it is encrypted before accessing it.
+        */
+       sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
        initrd_start = 0;
 
        mapped_size = memblock_mem_size(max_pfn_mapped);
index 213ddf3e937d800577514a3677f742813298555e..73e4d28112f8a14a1d741ab664b3f76ea0bf6479 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
+#include <asm/mpx.h>
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
        return error;
 }
 
-static void find_start_end(unsigned long flags, unsigned long *begin,
-                          unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+               unsigned long *begin, unsigned long *end)
 {
        if (!in_compat_syscall() && (flags & MAP_32BIT)) {
                /* This is usually used needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
        }
 
        *begin  = get_mmap_base(1);
-       *end    = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+       if (in_compat_syscall())
+               *end = task_size_32bit();
+       else
+               *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
 }
 
 unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
        struct vm_unmapped_area_info info;
        unsigned long begin, end;
 
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        if (flags & MAP_FIXED)
                return addr;
 
-       find_start_end(flags, &begin, &end);
+       find_start_end(addr, flags, &begin, &end);
 
        if (len > end)
                return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        unsigned long addr = addr0;
        struct vm_unmapped_area_info info;
 
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = get_mmap_base(0);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        *
+        * !in_compat_syscall() check to avoid high addresses for x32.
+        */
+       if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+               info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
        info.align_mask = 0;
        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
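
The hint check above is the user-visible contract for 5-level paging: a process stays inside the 47-bit DEFAULT_MAP_WINDOW unless it passes an mmap() hint above it, in which case the search may use the full address space. A user-space sketch of opting in (the hint value is illustrative):

        #include <stdio.h>
        #include <sys/mman.h>

        int main(void)
        {
                void *hint = (void *)(1UL << 47);       /* above DEFAULT_MAP_WINDOW */
                void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED)
                        return 1;

                /* On a 5-level-paging kernel this may now sit above 47 bits */
                printf("mapped at %p\n", p);
                munmap(p, 4096);

                return 0;
        }
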
index 9b1dd114956a8bcb9e724bd4df792460f68a0943..ccb70b8d16ccd24545f0d71c92c3c28f5f54992e 100644 (file)
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
 
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
 #define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 #define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
                                            * PT32_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
-                       | shadow_x_mask | shadow_nx_mask)
+                       | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
 
 /*
  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
  */
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask)
+               u64 acc_track_mask, u64 me_mask)
 {
        BUG_ON(!dirty_mask != !accessed_mask);
        BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
        shadow_x_mask = x_mask;
        shadow_present_mask = p_mask;
        shadow_acc_track_mask = acc_track_mask;
+       shadow_me_mask = me_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
        BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
        spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask;
+              shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (sp_ad_disabled(sp))
                spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                pte_access &= ~ACC_WRITE_MASK;
 
        spte |= (u64)pfn << PAGE_SHIFT;
+       spte |= shadow_me_mask;
 
        if (pte_access & ACC_WRITE_MASK) {
 
index d7d248a000dd6772681f3f5541e344f9677a2d1d..3cc725590ab9fd848c6f228e7c152d12e24a1569 100644 (file)
@@ -48,7 +48,7 @@
 
 static inline u64 rsvd_bits(int s, int e)
 {
-       return ((1ULL << (e - s + 1)) - 1) << s;
+       return __sme_clr(((1ULL << (e - s + 1)) - 1) << s);
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
index 56ba05312759d3ed4568e546492d9d8bfad05b71..099ff08b4aff9c7ca4a1ea0a3a14a02c3891f506 100644 (file)
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
-       phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
-       phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
-       phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+       phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+       phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+       phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
 
        vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
        vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
                set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       control->iopm_base_pa = iopm_base;
-       control->msrpm_base_pa = __pa(svm->msrpm);
+       control->iopm_base_pa = __sme_set(iopm_base);
+       control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
        control->int_ctl = V_INTR_MASKING_MASK;
 
        init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        new_entry = READ_ONCE(*entry);
-       new_entry = (page_to_phys(svm->avic_backing_page) &
-                    AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-                    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+       new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+                             AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+                             AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
        WRITE_ONCE(*entry, new_entry);
 
        svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
        svm->vmcb = page_address(page);
        clear_page(svm->vmcb);
-       svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+       svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
        svm->asid_generation = 0;
        init_vmcb(svm);
 
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
        __free_page(virt_to_page(svm->nested.hsave));
        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -2335,7 +2335,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
        u64 pdpte;
        int ret;
 
-       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
                                       offset_in_page(cr3) + index * 8, 8);
        if (ret)
                return 0;
@@ -2347,7 +2347,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.nested_cr3 = root;
+       svm->vmcb->control.nested_cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_NPT);
        svm_flush_tlb(vcpu);
 }
@@ -2878,7 +2878,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
                svm->nested.msrpm[p] = svm->msrpm[p] | value;
        }
 
-       svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+       svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
 
        return true;
 }
@@ -4511,7 +4511,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
        pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
                 irq.vector);
        *svm = to_svm(vcpu);
-       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
        vcpu_info->vector = irq.vector;
 
        return 0;
@@ -4562,7 +4562,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                        struct amd_iommu_pi_data pi;
 
                        /* Try to enable guest_mode in IRTE */
-                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+                                           AVIC_HPA_MASK);
                        pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
                                                     svm->vcpu.vcpu_id);
                        pi.is_guest_mode = true;
@@ -5011,7 +5012,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->save.cr3 = root;
+       svm->vmcb->save.cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_CR);
        svm_flush_tlb(vcpu);
 }
@@ -5020,7 +5021,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.nested_cr3 = root;
+       svm->vmcb->control.nested_cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_NPT);
 
        /* Also sync guest cr3 here in case we live migrate */
index 9b21b12230354e334900e6536b7612285f75b7e3..416d5ed320b605ef361b96db01c53ba6d09bff7a 100644 (file)
@@ -6563,7 +6563,7 @@ void vmx_enable_tdp(void)
                enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
                0ull, VMX_EPT_EXECUTABLE_MASK,
                cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK);
+               VMX_EPT_RWX_MASK, 0ull);
 
        ept_set_mmio_spte_mask();
        kvm_enable_tdp();
index d734aa8c5b4f7290e365badd00ea962fd0af9acd..eda4bdbd7e5e1cb722de3c2c00fd6ed2323aa8a0 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
 
@@ -6116,7 +6117,7 @@ int kvm_arch_init(void *opaque)
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0);
+                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
index 5cc78bf572325fb1c5b5d6f854bfe878c55dfeb3..3261abb21ef4f5e0d9c9239cc86651ee96b0a74a 100644 (file)
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
        return 0;       /* Buffer overrun */
 }
 
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of whether it was
+ * truncated to fit in the buffer), or -1 if the option was not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+                     const char *option, char *buffer, int bufsize)
+{
+       char c;
+       int pos = 0, len = -1;
+       const char *opptr = NULL;
+       char *bufptr = buffer;
+       enum {
+               st_wordstart = 0,       /* Start of word/after whitespace */
+               st_wordcmp,     /* Comparing this word */
+               st_wordskip,    /* Miscompare, skip */
+               st_bufcpy,      /* Copying this to buffer */
+       } state = st_wordstart;
+
+       if (!cmdline)
+               return -1;      /* No command line */
+
+       /*
+        * This 'pos' check ensures we do not overrun
+        * a non-NULL-terminated 'cmdline'
+        */
+       while (pos++ < max_cmdline_size) {
+               c = *(char *)cmdline++;
+               if (!c)
+                       break;
+
+               switch (state) {
+               case st_wordstart:
+                       if (myisspace(c))
+                               break;
+
+                       state = st_wordcmp;
+                       opptr = option;
+                       /* fall through */
+
+               case st_wordcmp:
+                       if ((c == '=') && !*opptr) {
+                               /*
+                                * We matched all the way to the end of the
+                                * option we were looking for, prepare to
+                                * copy the argument.
+                                */
+                               len = 0;
+                               bufptr = buffer;
+                               state = st_bufcpy;
+                               break;
+                       } else if (c == *opptr++) {
+                               /*
+                                * We are currently matching, so continue
+                                * to the next character on the cmdline.
+                                */
+                               break;
+                       }
+                       state = st_wordskip;
+                       /* fall through */
+
+               case st_wordskip:
+                       if (myisspace(c))
+                               state = st_wordstart;
+                       break;
+
+               case st_bufcpy:
+                       if (myisspace(c)) {
+                               state = st_wordstart;
+                       } else {
+                               /*
+                                * Increment len, but don't overrun the
+                                * supplied buffer and leave room for the
+                                * NULL terminator.
+                                */
+                               if (++len < bufsize)
+                                       *bufptr++ = c;
+                       }
+                       break;
+               }
+       }
+
+       if (bufsize)
+               *bufptr = '\0';
+
+       return len;
+}
+
 int cmdline_find_option_bool(const char *cmdline, const char *option)
 {
        return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
 }
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+                       int bufsize)
+{
+       return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+                                    buffer, bufsize);
+}
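A minimal usage sketch for the new helper (the real caller is sme_enable() in arch/x86/mm/mem_encrypt.c below; the buffer size here is only illustrative):

	char buffer[16];
	int len;

	/* Look for mem_encrypt=<value> on the kernel command line */
	len = cmdline_find_option(boot_command_line, "mem_encrypt",
				  buffer, sizeof(buffer));
	if (len > 0 && !strncmp(buffer, "on", sizeof(buffer)))
		/* the option was present and set to "on" */;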
index 0fbdcb64f9f836c0556ca0abcca4c673aea143f0..72bf8c01c6e3a58254cc915aded88eea8146f41a 100644 (file)
@@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX)   += mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt_boot.o
index 0470826d2bdca2b04bbcba902ae42f8aed728cb5..5e3ac6fe6c9e32ed1906f4f9bf736310a7193c7d 100644 (file)
  */
 
 #include <linux/debugfs.h>
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 
-#include <asm/kasan.h>
 #include <asm/pgtable.h>
 
 /*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 {
        pgprotval_t pr = pgprot_val(prot);
        static const char * const level_name[] =
-               { "cr3", "pgd", "pud", "pmd", "pte" };
+               { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
        if (!pgprot_val(prot)) {
                /* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
                        pt_dump_cont_printf(m, dmsg, "    ");
 
                /* Bit 7 has a different meaning on level 3 vs 4 */
-               if (level <= 3 && pr & _PAGE_PSE)
+               if (level <= 4 && pr & _PAGE_PSE)
                        pt_dump_cont_printf(m, dmsg, "PSE ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
-               if ((level == 4 && pr & _PAGE_PAT) ||
-                   ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+               if ((level == 5 && pr & _PAGE_PAT) ||
+                   ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
                        pt_dump_cont_printf(m, dmsg, "PAT ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
  */
 static unsigned long normalize_addr(unsigned long u)
 {
-#ifdef CONFIG_X86_64
-       return (signed long)(u << 16) >> 16;
-#else
-       return u;
-#endif
+       int shift;
+       if (!IS_ENABLED(CONFIG_X86_64))
+               return u;
+
+       shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+       return (signed long)(u << shift) >> shift;
 }
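As a worked example of the sign extension above: with 4-level paging __VIRTUAL_MASK_SHIFT is 47, so shift is 16 and an index that decodes to 0x0000800000000000 is extended to the canonical address 0xffff800000000000; with 5-level paging (__VIRTUAL_MASK_SHIFT == 56) the shift becomes 7 and the same expression covers the wider address space.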
 
 /*
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
        for (i = 0; i < PTRS_PER_PTE; i++) {
                prot = pte_flags(*start);
                st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
-               note_page(m, st, __pgprot(prot), 4);
+               note_page(m, st, __pgprot(prot), 5);
                start++;
        }
 }
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for the KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page, we can call note_page()
+ * right away without walking through the lower-level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+                               void *pt)
+{
+       if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+           __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+           __pa(pt) == __pa(kasan_zero_pud)) {
+               pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+               note_page(m, st, __pgprot(prot), 5);
+               return true;
+       }
+       return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+                               void *pt)
+{
+       return false;
+}
+#endif
 
 #if PTRS_PER_PMD > 1
 
 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
        int i;
-       pmd_t *start;
+       pmd_t *start, *pmd_start;
        pgprotval_t prot;
 
-       start = (pmd_t *)pud_page_vaddr(addr);
+       pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
        for (i = 0; i < PTRS_PER_PMD; i++) {
                st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
                if (!pmd_none(*start)) {
                        if (pmd_large(*start) || !pmd_present(*start)) {
                                prot = pmd_flags(*start);
-                               note_page(m, st, __pgprot(prot), 3);
-                       } else {
+                               note_page(m, st, __pgprot(prot), 4);
+                       } else if (!kasan_page_table(m, st, pmd_start)) {
                                walk_pte_level(m, st, *start,
                                               P + i * PMD_LEVEL_MULT);
                        }
                } else
-                       note_page(m, st, __pgprot(0), 3);
+                       note_page(m, st, __pgprot(0), 4);
                start++;
        }
 }
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 
 #if PTRS_PER_PUD > 1
 
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
-       return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
        int i;
-       pud_t *start;
+       pud_t *start, *pud_start;
        pgprotval_t prot;
        pud_t *prev_pud = NULL;
 
-       start = (pud_t *)p4d_page_vaddr(addr);
+       pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
        for (i = 0; i < PTRS_PER_PUD; i++) {
                st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
-               if (!pud_none(*start) &&
-                   !pud_already_checked(prev_pud, start, st->check_wx)) {
+               if (!pud_none(*start)) {
                        if (pud_large(*start) || !pud_present(*start)) {
                                prot = pud_flags(*start);
-                               note_page(m, st, __pgprot(prot), 2);
-                       } else {
+                               note_page(m, st, __pgprot(prot), 3);
+                       } else if (!kasan_page_table(m, st, pud_start)) {
                                walk_pmd_level(m, st, *start,
                                               P + i * PUD_LEVEL_MULT);
                        }
                } else
-                       note_page(m, st, __pgprot(0), 2);
+                       note_page(m, st, __pgprot(0), 3);
 
                prev_pud = start;
                start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
 {
        int i;
-       p4d_t *start;
+       p4d_t *start, *p4d_start;
        pgprotval_t prot;
 
-       start = (p4d_t *)pgd_page_vaddr(addr);
+       p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
 
        for (i = 0; i < PTRS_PER_P4D; i++) {
                st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
                        if (p4d_large(*start) || !p4d_present(*start)) {
                                prot = p4d_flags(*start);
                                note_page(m, st, __pgprot(prot), 2);
-                       } else {
+                       } else if (!kasan_page_table(m, st, p4d_start)) {
                                walk_pud_level(m, st, *start,
                                               P + i * P4D_LEVEL_MULT);
                        }
index 2824607df1081fe38a96d5e8af70e7c295e43ed5..6d06cf33e3de54ab6daa49910458c2cbb79c848b 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/elf.h>
+#include <asm/mpx.h>
 
 #if 0  /* This is just for testing */
 struct page *
@@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
        info.flags = 0;
        info.length = len;
        info.low_limit = get_mmap_base(1);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        */
        info.high_limit = in_compat_syscall() ?
-               tasksize_32bit() : tasksize_64bit();
+               task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-               unsigned long addr0, unsigned long len,
+               unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
        struct vm_unmapped_area_info info;
-       unsigned long addr;
 
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = get_mmap_base(0);
+
+       /*
+        * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+        * in the full address space.
+        */
+       if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+               info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);
@@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
-               info.high_limit = TASK_SIZE;
+               info.high_limit = TASK_SIZE_LOW;
                addr = vm_unmapped_area(&info);
        }
 
@@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
        if (len & ~huge_page_mask(h))
                return -EINVAL;
+
+       addr = mpx_unmapped_area_check(addr, len, flags);
+       if (IS_ERR_VALUE(addr))
+               return addr;
+
        if (len > TASK_SIZE)
                return -ENOMEM;
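For context, passing a hint address above DEFAULT_MAP_WINDOW is how a process opts into the full 56-bit address space on 5-level paging kernels. A hypothetical user-space caller might look like this (illustrative only):

	#include <sys/mman.h>

	static void *map_above_47bit(void)
	{
		void *hint = (void *)(1UL << 47);	/* above DEFAULT_MAP_WINDOW */

		return mmap(hint, 2UL << 20, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	}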
 
index adab1595f4bd89ba0729db70dd21d619d93e829d..31cea988fa36c5571687d1e18e2d5c914271b0f8 100644 (file)
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
                if (!pmd)
                        return -ENOMEM;
                ident_pmd_init(info, pmd, addr, next);
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+               set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
        }
 
        return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
                if (!pud)
                        return -ENOMEM;
                ident_pud_init(info, pud, addr, next);
-               set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+               set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
        }
 
        return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
        unsigned long next;
        int result;
 
+       /* Set the default pagetable flags if not supplied */
+       if (!info->kernpg_flag)
+               info->kernpg_flag = _KERNPG_TABLE;
+
        for (; addr < end; addr = next) {
                pgd_t *pgd = pgd_page + pgd_index(addr);
                p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
                if (result)
                        return result;
                if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-                       set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+                       set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
                } else {
                        /*
                         * With p4d folded, pgd is equal to p4d.
                         * The pgd entry has to point to the pud page table in this case.
                         */
                        pud_t *pud = pud_offset(p4d, 0);
-                       set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+                       set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
                }
        }
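With the new kernpg_flag field a caller can build identity mappings whose table entries omit the encryption bit. A sketch of such a caller, assuming the _KERNPG_TABLE_NOENC definition added elsewhere in this series and a caller-provided page allocator:

	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,		/* caller-provided allocator */
		.context	= ctx,				/* cookie passed to the allocator */
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,		/* table entries without _PAGE_ENC */
	};

	ret = kernel_ident_mapping_init(&info, pgd, start, end);

Callers that leave kernpg_flag at zero keep getting _KERNPG_TABLE, so existing users are unaffected.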
 
index bf3f1065d6addb88b898ba3a86089cccff6ed15e..7777ccc0e9f979dc76cc9d520885eea02114223e 100644 (file)
@@ -815,7 +815,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
        .loaded_mm = &init_mm,
-       .state = 0,
+       .next_asid = 1,
        .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
index 4c1b5fd0c7ad5512a231f556de46df459b97a058..34f0e1847dd64bc82a10679b9896c3d8886aa330 100644 (file)
@@ -13,6 +13,8 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -21,6 +23,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
+#include <asm/setup.h>
 
 #include "physaddr.h"
 
@@ -105,12 +108,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
                return NULL;
        }
 
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (is_ISA_range(phys_addr, last_addr))
-               return (__force void __iomem *)phys_to_virt(phys_addr);
-
        /*
         * Don't allow anybody to remap normal RAM that we're using..
         */
@@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr)
                return;
 
        /*
-        * __ioremap special-cases the PCI/ISA range by not instantiating a
-        * vm_area and by simply returning an address into the kernel mapping
-        * of ISA space.   So handle that here.
+        * The PCI/ISA range special-casing was removed from __ioremap()
+        * so this check, in theory, can be removed. However, there are
+        * cases where iounmap() is called for addresses not obtained via
+        * ioremap() (vga16fb for example). Add a warning so that these
+        * cases can be caught and fixed.
         */
        if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
-           (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+           (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+               WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
                return;
+       }
 
        addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
        unsigned long offset = phys & ~PAGE_MASK;
        void *vaddr;
 
-       /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
-       if (page_is_ram(start >> PAGE_SHIFT))
-               return __va(phys);
+       /* memremap() maps if RAM, otherwise falls back to ioremap() */
+       vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
 
-       vaddr = ioremap_cache(start, PAGE_SIZE);
-       /* Only add the offset on success and return NULL if the ioremap() failed: */
+       /* Only add the offset on success and return NULL if memremap() failed */
        if (vaddr)
                vaddr += offset;
 
@@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 {
-       if (page_is_ram(phys >> PAGE_SHIFT))
-               return;
+       memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted.  If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+                                         unsigned long size)
+{
+       int is_pmem;
+
+       /*
+        * Check if the address is part of a persistent memory region.
+        * This check covers areas added by E820, EFI and ACPI.
+        */
+       is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+                                   IORES_DESC_PERSISTENT_MEMORY);
+       if (is_pmem != REGION_DISJOINT)
+               return true;
+
+       /*
+        * Check if the non-volatile attribute is set for an EFI
+        * reserved area.
+        */
+       if (efi_enabled(EFI_BOOT)) {
+               switch (efi_mem_type(phys_addr)) {
+               case EFI_RESERVED_TYPE:
+                       if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+                               return true;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* Check if the address is outside kernel usable area */
+       switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+       case E820_TYPE_RESERVED:
+       case E820_TYPE_ACPI:
+       case E820_TYPE_NVS:
+       case E820_TYPE_UNUSABLE:
+       case E820_TYPE_PRAM:
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+                                unsigned long size)
+{
+       u64 paddr;
+
+       /* Check if the address is part of EFI boot/runtime data */
+       if (!efi_enabled(EFI_BOOT))
+               return false;
+
+       paddr = boot_params.efi_info.efi_memmap_hi;
+       paddr <<= 32;
+       paddr |= boot_params.efi_info.efi_memmap;
+       if (phys_addr == paddr)
+               return true;
+
+       paddr = boot_params.efi_info.efi_systab_hi;
+       paddr <<= 32;
+       paddr |= boot_params.efi_info.efi_systab;
+       if (phys_addr == paddr)
+               return true;
+
+       if (efi_is_table_address(phys_addr))
+               return true;
+
+       switch (efi_mem_type(phys_addr)) {
+       case EFI_BOOT_SERVICES_DATA:
+       case EFI_RUNTIME_SERVICES_DATA:
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+                                  unsigned long size)
+{
+       struct setup_data *data;
+       u64 paddr, paddr_next;
+
+       paddr = boot_params.hdr.setup_data;
+       while (paddr) {
+               unsigned int len;
+
+               if (phys_addr == paddr)
+                       return true;
+
+               data = memremap(paddr, sizeof(*data),
+                               MEMREMAP_WB | MEMREMAP_DEC);
+
+               paddr_next = data->next;
+               len = data->len;
+
+               memunmap(data);
+
+               if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+                       return true;
+
+               paddr = paddr_next;
+       }
+
+       return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+                                               unsigned long size)
+{
+       struct setup_data *data;
+       u64 paddr, paddr_next;
+
+       paddr = boot_params.hdr.setup_data;
+       while (paddr) {
+               unsigned int len;
+
+               if (phys_addr == paddr)
+                       return true;
+
+               data = early_memremap_decrypted(paddr, sizeof(*data));
+
+               paddr_next = data->next;
+               len = data->len;
+
+               early_memunmap(data, sizeof(*data));
+
+               if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+                       return true;
+
+               paddr = paddr_next;
+       }
+
+       return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+                                unsigned long flags)
+{
+       if (!sme_active())
+               return true;
+
+       if (flags & MEMREMAP_ENC)
+               return true;
+
+       if (flags & MEMREMAP_DEC)
+               return false;
+
+       if (memremap_is_setup_data(phys_addr, size) ||
+           memremap_is_efi_data(phys_addr, size) ||
+           memremap_should_map_decrypted(phys_addr, size))
+               return false;
+
+       return true;
+}
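Callers that know better than these heuristics can force the mapping type with the MEMREMAP_ENC/MEMREMAP_DEC flags, e.g. (sketch):

	/* Force a decrypted mapping regardless of the checks above */
	void *vaddr = memremap(phys_addr, size, MEMREMAP_WB | MEMREMAP_DEC);

	if (vaddr) {
		/* ... access the unencrypted data ... */
		memunmap(vaddr);
	}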
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+                                            unsigned long size,
+                                            pgprot_t prot)
+{
+       if (!sme_active())
+               return prot;
+
+       if (early_memremap_is_setup_data(phys_addr, size) ||
+           memremap_is_efi_data(phys_addr, size) ||
+           memremap_should_map_decrypted(phys_addr, size))
+               prot = pgprot_decrypted(prot);
+       else
+               prot = pgprot_encrypted(prot);
+
+       return prot;
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+       return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+                                     unsigned long size)
+{
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protection - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size)
+{
+       /* Be sure the write-protect PAT entry is set for write-protect */
+       if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+               return NULL;
+
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+                                     unsigned long size)
+{
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and with write-protection - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+                                        unsigned long size)
+{
+       /* Be sure the write-protect PAT entry is set for write-protect */
+       if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+               return NULL;
 
-       iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+       return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
 }
+#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
index 02c9d75534091a0cf06b78716a990c41847cb6e4..bc84b73684b7e134a910b193c0103942e2cd0a19 100644 (file)
@@ -11,8 +11,8 @@
 #include <asm/e820/types.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/pgtable.h>
 
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static int __init map_range(struct range *range)
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
 void __init kasan_early_init(void)
 {
        int i;
-       pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+       pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
        pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
        pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
        p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
         */
        memset(kasan_zero_page, 0, PAGE_SIZE);
        for (i = 0; i < PTRS_PER_PTE; i++) {
-               pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+               pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
                set_pte(&kasan_zero_pte[i], pte);
        }
        /* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644 (file)
index 0000000..0fbd092
--- /dev/null
@@ -0,0 +1,593 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[]  __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+unsigned long sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL_GPL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+                                      unsigned long size, bool enc)
+{
+       void *src, *dst;
+       size_t len;
+
+       if (!sme_me_mask)
+               return;
+
+       local_flush_tlb();
+       wbinvd();
+
+       /*
+        * There are a limited number of early mapping slots, so map (at most)
+        * one page at a time.
+        */
+       while (size) {
+               len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+               /*
+                * Create mappings for the current and desired format of
+                * the memory. Use a write-protected mapping for the source.
+                */
+               src = enc ? early_memremap_decrypted_wp(paddr, len) :
+                           early_memremap_encrypted_wp(paddr, len);
+
+               dst = enc ? early_memremap_encrypted(paddr, len) :
+                           early_memremap_decrypted(paddr, len);
+
+               /*
+                * If a mapping can't be obtained to perform the operation,
+                * then eventual access of that area in the desired mode
+                * will cause a crash.
+                */
+               BUG_ON(!src || !dst);
+
+               /*
+                * Use a temporary buffer, of cache-line multiple size, to
+                * avoid data corruption as documented in the APM.
+                */
+               memcpy(sme_early_buffer, src, len);
+               memcpy(dst, sme_early_buffer, len);
+
+               early_memunmap(dst, len);
+               early_memunmap(src, len);
+
+               paddr += len;
+               size -= len;
+       }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+       __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+       __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+                                            bool map)
+{
+       unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+       pmdval_t pmd_flags, pmd;
+
+       /* Use early_pmd_flags but remove the encryption mask */
+       pmd_flags = __sme_clr(early_pmd_flags);
+
+       do {
+               pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+               __early_make_pgtable((unsigned long)vaddr, pmd);
+
+               vaddr += PMD_SIZE;
+               paddr += PMD_SIZE;
+               size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+       } while (size);
+
+       __native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+       struct boot_params *boot_data;
+       unsigned long cmdline_paddr;
+
+       if (!sme_active())
+               return;
+
+       /* Get the command line address before unmapping the real_mode_data */
+       boot_data = (struct boot_params *)real_mode_data;
+       cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+       __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+       if (!cmdline_paddr)
+               return;
+
+       __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+       struct boot_params *boot_data;
+       unsigned long cmdline_paddr;
+
+       if (!sme_active())
+               return;
+
+       __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+       /* Get the command line address after mapping the real_mode_data */
+       boot_data = (struct boot_params *)real_mode_data;
+       cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+       if (!cmdline_paddr)
+               return;
+
+       __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+       unsigned int i;
+
+       if (!sme_me_mask)
+               return;
+
+       early_pmd_flags = __sme_set(early_pmd_flags);
+
+       __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+       /* Update the protection map with memory encryption mask */
+       for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+               protection_map[i] = pgprot_encrypted(protection_map[i]);
+}
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+       if (!sme_me_mask)
+               return;
+
+       /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+       swiotlb_update_mem_attributes();
+
+       pr_info("AMD Secure Memory Encryption (SME) active\n");
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+       WARN(PAGE_ALIGN(size) != size,
+            "size is not page-aligned (%#lx)\n", size);
+
+       /* Make the SWIOTLB buffer area decrypted */
+       set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+                                unsigned long end)
+{
+       unsigned long pgd_start, pgd_end, pgd_size;
+       pgd_t *pgd_p;
+
+       pgd_start = start & PGDIR_MASK;
+       pgd_end = end & PGDIR_MASK;
+
+       pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+       pgd_size *= sizeof(pgd_t);
+
+       pgd_p = pgd_base + pgd_index(start);
+
+       memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS      _KERNPG_TABLE_NOENC
+#define P4D_FLAGS      _KERNPG_TABLE_NOENC
+#define PUD_FLAGS      _KERNPG_TABLE_NOENC
+#define PMD_FLAGS      (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+                                    unsigned long vaddr, pmdval_t pmd_val)
+{
+       pgd_t *pgd_p;
+       p4d_t *p4d_p;
+       pud_t *pud_p;
+       pmd_t *pmd_p;
+
+       pgd_p = pgd_base + pgd_index(vaddr);
+       if (native_pgd_val(*pgd_p)) {
+               if (IS_ENABLED(CONFIG_X86_5LEVEL))
+                       p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+               else
+                       pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+       } else {
+               pgd_t pgd;
+
+               if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+                       p4d_p = pgtable_area;
+                       memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+                       pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+                       pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+               } else {
+                       pud_p = pgtable_area;
+                       memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+                       pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+                       pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+               }
+               native_set_pgd(pgd_p, pgd);
+       }
+
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_p += p4d_index(vaddr);
+               if (native_p4d_val(*p4d_p)) {
+                       pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+               } else {
+                       p4d_t p4d;
+
+                       pud_p = pgtable_area;
+                       memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+                       pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+                       p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+                       native_set_p4d(p4d_p, p4d);
+               }
+       }
+
+       pud_p += pud_index(vaddr);
+       if (native_pud_val(*pud_p)) {
+               if (native_pud_val(*pud_p) & _PAGE_PSE)
+                       goto out;
+
+               pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+       } else {
+               pud_t pud;
+
+               pmd_p = pgtable_area;
+               memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+               pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+               pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+               native_set_pud(pud_p, pud);
+       }
+
+       pmd_p += pmd_index(vaddr);
+       if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+               native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+       return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+       unsigned long p4d_size, pud_size, pmd_size;
+       unsigned long total;
+
+       /*
+        * Perform a relatively simplistic calculation of the pagetable
+        * entries that are needed. The mappings will be covered by 2MB
+        * PMD entries, so we can conservatively calculate the required
+        * number of P4D, PUD and PMD structures needed to perform the
+        * mappings. Incrementing the count for each covers the case where
+        * the addresses cross entries.
+        */
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+               p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+               pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       } else {
+               p4d_size = 0;
+               pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       }
+       pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+       pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+       total = p4d_size + pud_size + pmd_size;
+
+       /*
+        * Now calculate the added pagetable structures needed to populate
+        * the new pagetables.
+        */
+       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+               p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+               p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+               pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       } else {
+               p4d_size = 0;
+               pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+               pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+       }
+       pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+       pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+       total += p4d_size + pud_size + pmd_size;
+
+       return total;
+}
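As a rough worked example with 4-level paging and a mapping range below 1GB: the first pass yields two PUD pages and two PMD pages (16KB), the second pass adds one more of each (8KB), so sme_pgtable_calc() returns about 24KB, which sme_encrypt_kernel() then doubles to cover both the encrypted and the decrypted mapping.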
+
+void __init sme_encrypt_kernel(void)
+{
+       unsigned long workarea_start, workarea_end, workarea_len;
+       unsigned long execute_start, execute_end, execute_len;
+       unsigned long kernel_start, kernel_end, kernel_len;
+       unsigned long pgtable_area_len;
+       unsigned long paddr, pmd_flags;
+       unsigned long decrypted_base;
+       void *pgtable_area;
+       pgd_t *pgd;
+
+       if (!sme_active())
+               return;
+
+       /*
+        * Prepare for encrypting the kernel by building new pagetables with
+        * the necessary attributes needed to encrypt the kernel in place.
+        *
+        *   One range of virtual addresses will map the memory occupied
+        *   by the kernel as encrypted.
+        *
+        *   Another range of virtual addresses will map the memory occupied
+        *   by the kernel as decrypted and write-protected.
+        *
+        *     The use of the write-protect attribute will prevent any of the
+        *     memory from being cached.
+        */
+
+       /* Physical addresses give us the identity-mapped virtual addresses */
+       kernel_start = __pa_symbol(_text);
+       kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+       kernel_len = kernel_end - kernel_start;
+
+       /* Set the encryption workarea to be immediately after the kernel */
+       workarea_start = kernel_end;
+
+       /*
+        * Calculate the number of workarea bytes needed:
+        *   executable encryption area size:
+        *     stack page (PAGE_SIZE)
+        *     encryption routine page (PAGE_SIZE)
+        *     intermediate copy buffer (PMD_PAGE_SIZE)
+        *   pagetable structures for the encryption of the kernel
+        *   pagetable structures for workarea (in case not currently mapped)
+        */
+       execute_start = workarea_start;
+       execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+       execute_len = execute_end - execute_start;
+
+       /*
+        * One PGD for both encrypted and decrypted mappings and a set of
+        * PUDs and PMDs for each of the encrypted and decrypted mappings.
+        */
+       pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+       pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+       /* PUDs and PMDs needed in the current pagetables for the workarea */
+       pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+       /*
+        * The total workarea includes the executable encryption area and
+        * the pagetable area.
+        */
+       workarea_len = execute_len + pgtable_area_len;
+       workarea_end = workarea_start + workarea_len;
+
+       /*
+        * Set the address to the start of where newly created pagetable
+        * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+        * structures are created when the workarea is added to the current
+        * pagetables and when the new encrypted and decrypted kernel
+        * mappings are populated.
+        */
+       pgtable_area = (void *)execute_end;
+
+       /*
+        * Make sure the current pagetable structure has entries for
+        * addressing the workarea.
+        */
+       pgd = (pgd_t *)native_read_cr3_pa();
+       paddr = workarea_start;
+       while (paddr < workarea_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + PMD_FLAGS);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Flush the TLB - no globals so cr3 is enough */
+       native_write_cr3(__native_read_cr3());
+
+       /*
+        * A new pagetable structure is being built to allow for the kernel
+        * to be encrypted. It starts with an empty PGD that will then be
+        * populated with new PUDs and PMDs as the encrypted and decrypted
+        * kernel mappings are created.
+        */
+       pgd = pgtable_area;
+       memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+       pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+       /* Add encrypted kernel (identity) mappings */
+       pmd_flags = PMD_FLAGS | _PAGE_ENC;
+       paddr = kernel_start;
+       while (paddr < kernel_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + pmd_flags);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /*
+        * A different PGD index/entry must be used to get different
+        * pagetable entries for the decrypted mapping. Choose the next
+        * PGD index and convert it to a virtual address to be used as
+        * the base of the mapping.
+        */
+       decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+       decrypted_base <<= PGDIR_SHIFT;
+
+       /* Add decrypted, write-protected kernel (non-identity) mappings */
+       pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+       paddr = kernel_start;
+       while (paddr < kernel_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr + decrypted_base,
+                                               paddr + pmd_flags);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Add decrypted workarea mappings to both kernel mappings */
+       paddr = workarea_start;
+       while (paddr < workarea_end) {
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr,
+                                               paddr + PMD_FLAGS);
+
+               pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+                                               paddr + decrypted_base,
+                                               paddr + PMD_FLAGS);
+
+               paddr += PMD_PAGE_SIZE;
+       }
+
+       /* Perform the encryption */
+       sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+                           kernel_len, workarea_start, (unsigned long)pgd);
+
+       /*
+        * At this point we are running encrypted.  Remove the mappings for
+        * the decrypted areas - all that is needed for this is to remove
+        * the PGD entry/entries.
+        */
+       sme_clear_pgd(pgd, kernel_start + decrypted_base,
+                     kernel_end + decrypted_base);
+
+       sme_clear_pgd(pgd, workarea_start + decrypted_base,
+                     workarea_end + decrypted_base);
+
+       /* Flush the TLB - no globals so cr3 is enough */
+       native_write_cr3(__native_read_cr3());
+}
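The physical layout that sme_encrypt_kernel() sets up therefore looks roughly like this (not to scale):

	kernel_start          kernel_end == workarea_start              execute_end        workarea_end
	|------ kernel image ------|- stack | routine | 2MB copy buffer -|- new pagetables -|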
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+       const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+       unsigned int eax, ebx, ecx, edx;
+       bool active_by_default;
+       unsigned long me_mask;
+       char buffer[16];
+       u64 msr;
+
+       /* Check for the SME support leaf */
+       eax = 0x80000000;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+       if (eax < 0x8000001f)
+               return;
+
+       /*
+        * Check for the SME feature:
+        *   CPUID Fn8000_001F[EAX] - Bit 0
+        *     Secure Memory Encryption support
+        *   CPUID Fn8000_001F[EBX] - Bits 5:0
+        *     Pagetable bit position used to indicate encryption
+        */
+       eax = 0x8000001f;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+       if (!(eax & 1))
+               return;
+
+       me_mask = 1UL << (ebx & 0x3f);
+
+       /* Check if SME is enabled */
+       msr = __rdmsr(MSR_K8_SYSCFG);
+       if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+               return;
+
+       /*
+        * Fixups have not been applied to phys_base yet and we're running
+        * identity mapped, so we must obtain the address of the SME command
+        * line argument data using rip-relative addressing.
+        */
+       asm ("lea sme_cmdline_arg(%%rip), %0"
+            : "=r" (cmdline_arg)
+            : "p" (sme_cmdline_arg));
+       asm ("lea sme_cmdline_on(%%rip), %0"
+            : "=r" (cmdline_on)
+            : "p" (sme_cmdline_on));
+       asm ("lea sme_cmdline_off(%%rip), %0"
+            : "=r" (cmdline_off)
+            : "p" (sme_cmdline_off));
+
+       if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+               active_by_default = true;
+       else
+               active_by_default = false;
+
+       cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+                                    ((u64)bp->ext_cmd_line_ptr << 32));
+
+       cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+       if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+               sme_me_mask = me_mask;
+       else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+               sme_me_mask = 0;
+       else
+               sme_me_mask = active_by_default ? me_mask : 0;
+}
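The net effect is that SME can be controlled from the boot loader's kernel command line, for example:

	mem_encrypt=on		# activate SME even if it is not active by default
	mem_encrypt=off		# keep SME off even with CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y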
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644 (file)
index 0000000..b327e04
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/frame.h>
+
+       .text
+       .code64
+ENTRY(sme_encrypt_execute)
+
+       /*
+        * Entry parameters:
+        *   RDI - virtual address for the encrypted kernel mapping
+        *   RSI - virtual address for the decrypted kernel mapping
+        *   RDX - length of kernel
+        *   RCX - virtual address of the encryption workarea, including:
+        *     - stack page (PAGE_SIZE)
+        *     - encryption routine page (PAGE_SIZE)
+        *     - intermediate copy buffer (PMD_PAGE_SIZE)
+        *    R8 - physical address of the pagetables to use for encryption
+        */
+
+       FRAME_BEGIN                     /* RBP now has original stack pointer */
+
+       /* Set up a one page stack in the non-encrypted memory area */
+       movq    %rcx, %rax              /* Workarea stack page */
+       leaq    PAGE_SIZE(%rax), %rsp   /* Set new stack pointer */
+       addq    $PAGE_SIZE, %rax        /* Workarea encryption routine */
+
+       push    %r12
+       movq    %rdi, %r10              /* Encrypted kernel */
+       movq    %rsi, %r11              /* Decrypted kernel */
+       movq    %rdx, %r12              /* Kernel length */
+
+       /* Copy encryption routine into the workarea */
+       movq    %rax, %rdi                              /* Workarea encryption routine */
+       leaq    __enc_copy(%rip), %rsi                  /* Encryption routine */
+       movq    $(.L__enc_copy_end - __enc_copy), %rcx  /* Encryption routine length */
+       rep     movsb
+
+       /* Setup registers for call */
+       movq    %r10, %rdi              /* Encrypted kernel */
+       movq    %r11, %rsi              /* Decrypted kernel */
+       movq    %r8, %rdx               /* Pagetables used for encryption */
+       movq    %r12, %rcx              /* Kernel length */
+       movq    %rax, %r8               /* Workarea encryption routine */
+       addq    $PAGE_SIZE, %r8         /* Workarea intermediate copy buffer */
+
+       call    *%rax                   /* Call the encryption routine */
+
+       pop     %r12
+
+       movq    %rbp, %rsp              /* Restore original stack pointer */
+       FRAME_END
+
+       ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt the kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+       /* Enable the new page tables */
+       mov     %rdx, %cr3
+
+       /* Flush any global TLBs */
+       mov     %cr4, %rdx
+       andq    $~X86_CR4_PGE, %rdx
+       mov     %rdx, %cr4
+       orq     $X86_CR4_PGE, %rdx
+       mov     %rdx, %cr4
+
+       /* Set the PAT register PA5 entry to write-protect */
+       push    %rcx
+       movl    $MSR_IA32_CR_PAT, %ecx
+       rdmsr
+       push    %rdx                    /* Save original PAT value */
+       andl    $0xffff00ff, %edx       /* Clear PA5 */
+       orl     $0x00000500, %edx       /* Set PA5 to WP */
+       wrmsr
+       pop     %rdx                    /* RDX contains original PAT value */
+       pop     %rcx
+
+       movq    %rcx, %r9               /* Save kernel length */
+       movq    %rdi, %r10              /* Save encrypted kernel address */
+       movq    %rsi, %r11              /* Save decrypted kernel address */
+
+       wbinvd                          /* Invalidate any cache entries */
+
+       /* Copy/encrypt 2MB at a time */
+1:
+       movq    %r11, %rsi              /* Source - decrypted kernel */
+       movq    %r8, %rdi               /* Dest   - intermediate copy buffer */
+       movq    $PMD_PAGE_SIZE, %rcx    /* 2MB length */
+       rep     movsb
+
+       movq    %r8, %rsi               /* Source - intermediate copy buffer */
+       movq    %r10, %rdi              /* Dest   - encrypted kernel */
+       movq    $PMD_PAGE_SIZE, %rcx    /* 2MB length */
+       rep     movsb
+
+       addq    $PMD_PAGE_SIZE, %r11
+       addq    $PMD_PAGE_SIZE, %r10
+       subq    $PMD_PAGE_SIZE, %r9     /* Kernel length decrement */
+       jnz     1b                      /* Kernel length not zero? */
+
+       /* Restore PAT register */
+       push    %rdx                    /* Save original PAT value */
+       movl    $MSR_IA32_CR_PAT, %ecx
+       rdmsr
+       pop     %rdx                    /* Restore original PAT value */
+       wrmsr
+
+       ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
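
The comment block in __enc_copy above compresses a lot of mechanism into a few sentences. Purely as a reading aid, the copy loop can be written as the C sketch below; the function name and the enc/dec/buf/len parameters are illustrative (they mirror the R10/R11/R8/R9 register assignments) and the sketch assumes, as the assembly does, that the length is a multiple of PMD_PAGE_SIZE.

#include <linux/string.h>
#include <asm/page_types.h>

/*
 * The same physical range is mapped twice: 'dec' without the C-bit and
 * 'enc' with it.  Bouncing each 2MB chunk through a scratch buffer in
 * the workarea reads the data through the decrypted mapping and writes
 * it back through the encrypted mapping, encrypting it "in place".
 */
static void __enc_copy_sketch(void *enc, void *dec, void *buf,
			      unsigned long len)
{
	while (len) {
		memcpy(buf, dec, PMD_PAGE_SIZE);  /* read via decrypted mapping  */
		memcpy(enc, buf, PMD_PAGE_SIZE);  /* write via encrypted mapping */
		enc += PMD_PAGE_SIZE;
		dec += PMD_PAGE_SIZE;
		len -= PMD_PAGE_SIZE;
	}
}
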
index a88cfbfbd0781a4d20c398d7ca1b30f50f05be43..a9967982684649155cfcdc921d5247c8fbfe70d6 100644 (file)
@@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = {
        .flags = -1,
 };
 
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
 {
        return IA32_PAGE_OFFSET;
 }
 
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
 {
-       return TASK_SIZE_MAX;
+       return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
 }
 
 static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
        unsigned long max = 0;
        if (current->flags & PF_RANDOMIZE) {
-               max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+               max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
                max <<= PAGE_SHIFT;
        }
 
@@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
        arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
-                       arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+                       arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /*
@@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
         * mmap_base, the compat syscall uses mmap_compat_base.
         */
        arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
-                       arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+                       arch_rnd(mmap32_rnd_bits), task_size_32bit());
 #endif
 }
 
index 1c34b767c84ca650386f1b550aa5e3f13d24d5ce..9ceaa955d2bacc317582b510753a91f64eca401b 100644 (file)
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
         */
        bd_base = mpx_get_bounds_dir();
        down_write(&mm->mmap_sem);
+
+       /* MPX doesn't support addresses above 47 bits yet. */
+       if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+               pr_warn_once("%s (%d): MPX cannot handle addresses "
+                               "above 47-bits. Disabling.",
+                               current->comm, current->pid);
+               ret = -ENXIO;
+               goto out;
+       }
        mm->context.bd_addr = bd_base;
        if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
                ret = -ENXIO;
-
+out:
        up_write(&mm->mmap_sem);
        return ret;
 }
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
        if (ret)
                force_sig(SIGSEGV, current);
 }
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+               unsigned long flags)
+{
+       if (!kernel_managing_mpx_tables(current->mm))
+               return addr;
+       if (addr + len <= DEFAULT_MAP_WINDOW)
+               return addr;
+       if (flags & MAP_FIXED)
+               return -ENOMEM;
+
+       /*
+        * Requested len is larger than the whole area we're allowed to map in.
+        * Resetting hinting address wouldn't do much good -- fail early.
+        */
+       if (len > DEFAULT_MAP_WINDOW)
+               return -ENOMEM;
+
+       /* Look for unmap area within DEFAULT_MAP_WINDOW */
+       return 0;
+}
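
mpx_unmapped_area_check() has three distinct outcomes: pass the hint address through, fail with -ENOMEM, or return 0 to force the search below DEFAULT_MAP_WINDOW. Below is a hedged sketch of how a get_unmapped_area path might consume those outcomes; the caller, its name, and the omitted VMA search are illustrative and not taken from this diff.

#include <linux/err.h>
#include <asm/mpx.h>

static unsigned long get_area_sketch(unsigned long addr, unsigned long len,
				     unsigned long flags)
{
	/* Let MPX veto or clear hint addresses above the 47-bit window. */
	addr = mpx_unmapped_area_check(addr, len, flags);
	if (IS_ERR_VALUE(addr))
		return addr;	/* e.g. -ENOMEM for MAP_FIXED above 47 bits */

	/*
	 * addr is now either the original (acceptable) hint or 0, meaning
	 * "search anywhere below DEFAULT_MAP_WINDOW"; the normal unmapped
	 * area search would run here.
	 */
	return addr;
}
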
index 757b0bcdf712dfb1e73527c603ab5d5b05f5bcae..dfb7d657cf4322b0dedcd0bb63c1058bd090b4ea 100644 (file)
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
                                        __pgprot(0), 1, 0, NULL);
 }
 
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+       struct cpa_data cpa;
+       unsigned long start;
+       int ret;
+
+       /* Nothing to do if the SME is not active */
+       if (!sme_active())
+               return 0;
+
+       /* Should not be working on unaligned addresses */
+       if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+               addr &= PAGE_MASK;
+
+       start = addr;
+
+       memset(&cpa, 0, sizeof(cpa));
+       cpa.vaddr = &addr;
+       cpa.numpages = numpages;
+       cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+       cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+       cpa.pgd = init_mm.pgd;
+
+       /* Must avoid aliasing mappings in the highmem code */
+       kmap_flush_unused();
+       vm_unmap_aliases();
+
+       /*
+        * Before changing the encryption attribute, we need to flush caches.
+        */
+       if (static_cpu_has(X86_FEATURE_CLFLUSH))
+               cpa_flush_range(start, numpages, 1);
+       else
+               cpa_flush_all(1);
+
+       ret = __change_page_attr_set_clr(&cpa, 1);
+
+       /*
+        * After changing the encryption attribute, we need to flush TLBs
+        * again in case any speculative TLB caching occurred (but no need
+        * to flush caches again).  We could just use cpa_flush_all(), but
+        * in case TLB flushing gets optimized in the cpa_flush_range()
+        * path, use the same logic as above.
+        */
+       if (static_cpu_has(X86_FEATURE_CLFLUSH))
+               cpa_flush_range(start, numpages, 0);
+       else
+               cpa_flush_all(0);
+
+       return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+       return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+       return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
 int set_pages_uc(struct page *page, int numpages)
 {
        unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
        if (!(page_flags & _PAGE_RW))
                cpa.mask_clr = __pgprot(_PAGE_RW);
 
+       if (!(page_flags & _PAGE_ENC))
+               cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
        cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
 
        retval = __change_page_attr_set_clr(&cpa, 0);
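
set_memory_encrypted()/set_memory_decrypted() operate on page-aligned kernel mappings and are no-ops when SME is inactive. The sketch below is a hedged usage example, not taken from this patch: the function name and the buffer handling around the two calls are illustrative, while __get_free_pages(), free_pages() and the set_memory_*() interfaces are the real ones.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

/*
 * Strip the C-bit from a kernel buffer so a consumer that cannot use
 * encrypted memory (firmware, a device, another processor before SME
 * is enabled on it) sees consistent data, then restore it afterwards.
 */
static int share_in_the_clear_sketch(unsigned int order)
{
	unsigned long vaddr = __get_free_pages(GFP_KERNEL, order);
	int npages = 1 << order;
	int ret;

	if (!vaddr)
		return -ENOMEM;

	ret = set_memory_decrypted(vaddr, npages);
	if (ret)
		goto out;

	/* ... hand the decrypted buffer to the consumer here ... */

	ret = set_memory_encrypted(vaddr, npages);
out:
	free_pages(vaddr, order);
	return ret;
}
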
index 45979502f64b13e98b31dc862f4ac86652b44592..fe7d57a8fb6003a15e609bfd7481060005e429e4 100644 (file)
@@ -293,7 +293,7 @@ void init_cache_modes(void)
  * pat_init - Initialize PAT MSR and PAT table
  *
  * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
  *
  * This function must be called on all CPUs using the specific sequence of
  * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
                 *      011    3    UC : _PAGE_CACHE_MODE_UC
                 *      100    4    WB : Reserved
-                *      101    5    WC : Reserved
+                *      101    5    WP : _PAGE_CACHE_MODE_WP
                 *      110    6    UC-: Reserved
                 *      111    7    WT : _PAGE_CACHE_MODE_WT
                 *
@@ -360,7 +360,7 @@ void pat_init(void)
                 * corresponding types in the presence of PAT errata.
                 */
                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-                     PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+                     PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
        }
 
        if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
 {
+       if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+               vma_prot = pgprot_decrypted(vma_prot);
+
        return vma_prot;
 }
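
The new PA5 = WP slot is the entry the SME boot code relies on when it reprograms the PAT by hand. As a worked example: PA5 occupies MSR bits 47:40, i.e. bits 15:8 of the high dword that RDMSR returns in EDX, and the encoding 0x05 selects write-protect, which is exactly the mask/OR pair used in __enc_copy earlier in this diff. The macro and helper names below are illustrative only.

#include <linux/types.h>

#define PAT_ENTRY_WP		0x05	/* PAT encoding for write-protect      */
#define PAT_PA5_EDX_SHIFT	8	/* PA5 = MSR bits 47:40 = EDX bits 15:8 */

/* Return the high PAT dword with PA5 forced to WP, as __enc_copy does. */
static inline u32 pat_high_pa5_wp(u32 edx)
{
	return (edx & 0xffff00ff) | (PAT_ENTRY_WP << PAT_PA5_EDX_SHIFT);
}
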
 
index 014d07a800535f0320108effe4c9b087c6eb1e88..ce104b962a1704f9950b600c1fc19b464e03359e 100644 (file)
  *     Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+                           u16 *new_asid, bool *need_flush)
+{
+       u16 asid;
+
+       if (!static_cpu_has(X86_FEATURE_PCID)) {
+               *new_asid = 0;
+               *need_flush = true;
+               return;
+       }
+
+       for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+               if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+                   next->context.ctx_id)
+                       continue;
+
+               *new_asid = asid;
+               *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+                              next_tlb_gen);
+               return;
+       }
+
+       /*
+        * We don't currently own an ASID slot on this CPU.
+        * Allocate a slot.
+        */
+       *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+       if (*new_asid >= TLB_NR_DYN_ASIDS) {
+               *new_asid = 0;
+               this_cpu_write(cpu_tlbstate.next_asid, 1);
+       }
+       *need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,12 +79,11 @@ void leave_mm(int cpu)
        if (loaded_mm == &init_mm)
                return;
 
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-               BUG();
+       /* Warn if we're not lazy. */
+       WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
 
        switch_mm(NULL, &init_mm, NULL);
 }
-EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
@@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
 {
-       unsigned cpu = smp_processor_id();
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+       u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       unsigned cpu = smp_processor_id();
+       u64 next_tlb_gen;
 
        /*
-        * NB: The scheduler will call us with prev == next when
-        * switching from lazy TLB mode to normal mode if active_mm
-        * isn't changing.  When this happens, there is no guarantee
-        * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+        * NB: The scheduler will call us with prev == next when switching
+        * from lazy TLB mode to normal mode if active_mm isn't changing.
+        * When this happens, we don't assume that CR3 (and hence
+        * cpu_tlbstate.loaded_mm) matches next.
         *
         * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
         */
 
-       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+       /* We don't want flush_tlb_func_* to run concurrently with us. */
+       if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+               WARN_ON_ONCE(!irqs_disabled());
+
+       /*
+        * Verify that CR3 is what we think it is.  This will catch
+        * hypothetical buggy code that directly switches to swapper_pg_dir
+        * without going through leave_mm() / switch_mm_irqs_off() or that
+        * does something like write_cr3(read_cr3_pa()).
+        */
+       VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
        if (real_prev == next) {
-               /*
-                * There's nothing to do: we always keep the per-mm control
-                * regs in sync with cpu_tlbstate.loaded_mm.  Just
-                * sanity-check mm_cpumask.
-                */
-               if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
-                       cpumask_set_cpu(cpu, mm_cpumask(next));
-               return;
-       }
+               VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+                         next->context.ctx_id);
+
+               if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+                       /*
+                        * There's nothing to do: we weren't lazy, and we
+                        * aren't changing our mm.  We don't need to flush
+                        * anything, nor do we need to update CR3, CR4, or
+                        * LDTR.
+                        */
+                       return;
+               }
+
+               /* Resume remote flushes and then read tlb_gen. */
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+               if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+                   next_tlb_gen) {
+                       /*
+                        * Ideally, we'd have a flush_tlb() variant that
+                        * takes the known CR3 value as input.  This would
+                        * be faster on Xen PV and on hypothetical CPUs
+                        * on which INVPCID is fast.
+                        */
+                       this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
+                                      next_tlb_gen);
+                       write_cr3(__sme_pa(next->pgd) | prev_asid);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+                                       TLB_FLUSH_ALL);
+               }
 
-       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /*
-                * If our current stack is in vmalloc space and isn't
-                * mapped in the new pgd, we'll double-fault.  Forcibly
-                * map it.
+                * We just exited lazy mode, which means that CR4 and/or LDTR
+                * may be stale.  (Changes to the required CR4 and LDTR states
+                * are not reflected in tlb_gen.)
                 */
-               unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-               pgd_t *pgd = next->pgd + stack_pgd_index;
-
-               if (unlikely(pgd_none(*pgd)))
-                       set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-       }
+       } else {
+               u16 new_asid;
+               bool need_flush;
+
+               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                       /*
+                        * If our current stack is in vmalloc space and isn't
+                        * mapped in the new pgd, we'll double-fault.  Forcibly
+                        * map it.
+                        */
+                       unsigned int index = pgd_index(current_stack_pointer());
+                       pgd_t *pgd = next->pgd + index;
+
+                       if (unlikely(pgd_none(*pgd)))
+                               set_pgd(pgd, init_mm.pgd[index]);
+               }
 
-       this_cpu_write(cpu_tlbstate.loaded_mm, next);
+               /* Stop remote flushes for the previous mm */
+               if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+                       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
-       WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-       cpumask_set_cpu(cpu, mm_cpumask(next));
+               VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 
-       /*
-        * Re-load page tables.
-        *
-        * This logic has an ordering constraint:
-        *
-        *  CPU 0: Write to a PTE for 'next'
-        *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-        *  CPU 1: set bit 1 in next's mm_cpumask
-        *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-        *
-        * We need to prevent an outcome in which CPU 1 observes
-        * the new PTE value and CPU 0 observes bit 1 clear in
-        * mm_cpumask.  (If that occurs, then the IPI will never
-        * be sent, and CPU 0's TLB will contain a stale entry.)
-        *
-        * The bad outcome can occur if either CPU's load is
-        * reordered before that CPU's store, so both CPUs must
-        * execute full barriers to prevent this from happening.
-        *
-        * Thus, switch_mm needs a full barrier between the
-        * store to mm_cpumask and any operation that could load
-        * from next->pgd.  TLB fills are special and can happen
-        * due to instruction fetches or for no reason at all,
-        * and neither LOCK nor MFENCE orders them.
-        * Fortunately, load_cr3() is serializing and gives the
-        * ordering guarantee we need.
-        */
-       load_cr3(next->pgd);
-
-       /*
-        * This gets called via leave_mm() in the idle path where RCU
-        * functions differently.  Tracing normally uses RCU, so we have to
-        * call the tracepoint specially here.
-        */
-       trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+               /*
+                * Start remote flushes and then read tlb_gen.
+                */
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+               choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+               if (need_flush) {
+                       this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+                       this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+                       write_cr3(__sme_pa(next->pgd) | new_asid);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+                                       TLB_FLUSH_ALL);
+               } else {
+                       /* The new ASID is already up to date. */
+                       write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+               }
 
-       /* Stop flush ipis for the previous mm */
-       WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-                    real_prev != &init_mm);
-       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+               this_cpu_write(cpu_tlbstate.loaded_mm, next);
+               this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+       }
 
-       /* Load per-mm CR4 and LDTR state */
        load_mm_cr4(next);
        switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen.  We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
                                  bool local, enum tlb_flush_reason reason)
 {
+       /*
+        * We have three different tlb_gen values in here.  They are:
+        *
+        * - mm_tlb_gen:     the latest generation.
+        * - local_tlb_gen:  the generation that this CPU has already caught
+        *                   up to.
+        * - f->new_tlb_gen: the generation that the requester of the flush
+        *                   wants us to catch up to.
+        */
+       struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+       u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+       u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());
 
-       if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
-               leave_mm(smp_processor_id());
+       VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+                  loaded_mm->context.ctx_id);
+
+       if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+               /*
+                * We're in lazy mode -- don't flush.  We can get here on
+                * remote flushes due to races and on local flushes if a
+                * kernel thread coincidentally flushes the mm it's lazily
+                * still using.
+                */
                return;
        }
 
-       if (f->end == TLB_FLUSH_ALL) {
-               local_flush_tlb();
-               if (local)
-                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-               trace_tlb_flush(reason, TLB_FLUSH_ALL);
-       } else {
+       if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+               /*
+                * There's nothing to do: we're already up to date.  This can
+                * happen if two concurrent flushes happen -- the first flush to
+                * be handled can catch us all the way up, leaving no work for
+                * the second flush.
+                */
+               trace_tlb_flush(reason, 0);
+               return;
+       }
+
+       WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+       WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+       /*
+        * If we get to this point, we know that our TLB is out of date.
+        * This does not strictly imply that we need to flush (it's
+        * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+        * going to need to flush in the very near future, so we might
+        * as well get it over with.
+        *
+        * The only question is whether to do a full or partial flush.
+        *
+        * We do a partial flush if requested and two extra conditions
+        * are met:
+        *
+        * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
+        *    we've always done all needed flushes to catch up to
+        *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
+        *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+        *    us up to date for tlb_gen 3 is the partial flush we're
+        *    processing.
+        *
+        *    As an example of why this check is needed, suppose that there
+        *    are two concurrent flushes.  The first is a full flush that
+        *    changes context.tlb_gen from 1 to 2.  The second is a partial
+        *    flush that changes context.tlb_gen from 2 to 3.  If they get
+        *    processed on this CPU in reverse order, we'll see
+        *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+        *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+        *    3, we'd break the invariant: we'd update local_tlb_gen above
+        *    1 without the full flush that's needed for tlb_gen 2.
+        *
+        * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
+        *    Partial TLB flushes are not all that much cheaper than full TLB
+        *    flushes, so it seems unlikely that it would be a performance win
+        *    to do a partial flush if that won't bring our TLB fully up to
+        *    date.  By doing a full flush instead, we can increase
+        *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+        *    avoid another flush in the very near future.
+        */
+       if (f->end != TLB_FLUSH_ALL &&
+           f->new_tlb_gen == local_tlb_gen + 1 &&
+           f->new_tlb_gen == mm_tlb_gen) {
+               /* Partial flush */
                unsigned long addr;
                unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
                addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_single(addr);
@@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
                if (local)
                        count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
                trace_tlb_flush(reason, nr_pages);
+       } else {
+               /* Full flush. */
+               local_flush_tlb();
+               if (local)
+                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+               trace_tlb_flush(reason, TLB_FLUSH_ALL);
        }
+
+       /* Both paths above update our state to mm_tlb_gen. */
+       this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                                (info->end - info->start) >> PAGE_SHIFT);
 
        if (is_uv_system()) {
+               /*
+                * This whole special case is confused.  UV has a "Broadcast
+                * Assist Unit", which seems to be a fancy way to send IPIs.
+                * Back when x86 used an explicit TLB flush IPI, UV was
+                * optimized to use its own mechanism.  These days, x86 uses
+                * smp_call_function_many(), but UV still uses a manual IPI,
+                * and that IPI's action is out of date -- it does a manual
+                * flush instead of calling flush_tlb_func_remote().  This
+                * means that the percpu tlb_gen variables won't be updated
+                * and we'll do pointless flushes on future context switches.
+                *
+                * Rather than hooking native_flush_tlb_others() here, I think
+                * that UV should be updated so that smp_call_function_many(),
+                * etc, are optimal on UV.
+                */
                unsigned int cpu;
 
                cpu = smp_processor_id();
@@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
        cpu = get_cpu();
 
-       /* Synchronize with switch_mm. */
-       smp_mb();
+       /* This is also a barrier that synchronizes with switch_mm(). */
+       info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), &info);
+
        put_cpu();
 }
 
@@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info)
 {
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-               leave_mm(smp_processor_id());
 }
 
 void flush_tlb_all(void)
@@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
        if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&batch->cpumask, &info);
+
        cpumask_clear(&batch->cpumask);
 
        put_cpu();
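
The long comment in flush_tlb_func_common() boils down to a three-way comparison of generation counters. Purely as a reading aid, the partial-flush condition can be restated as the helper below; the function does not exist in the patch, but the fields it reads (f->end, f->new_tlb_gen) and the two generation values are the ones used above.

#include <linux/types.h>
#include <asm/tlbflush.h>

/* Reading aid only: mirrors the condition tested in flush_tlb_func_common(). */
static bool want_partial_flush(const struct flush_tlb_info *f,
			       u64 local_tlb_gen, u64 mm_tlb_gen)
{
	return f->end != TLB_FLUSH_ALL &&		/* a ranged request...       */
	       f->new_tlb_gen == local_tlb_gen + 1 &&	/* ...that is the next step  */
	       f->new_tlb_gen == mm_tlb_gen;		/* ...and the latest one     */
}
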
index dbe2132b0ed4cabf19119a5fe0ffa4d89480745d..7a5350d08cef711a14c29cf1f8fcedb70ecc7465 100644 (file)
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
 
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
-               data = ioremap(pa_data, sizeof(*rom));
+               data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;
 
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
                        }
                }
                pa_data = data->next;
-               iounmap(data);
+               memunmap(data);
        }
        set_dma_domain_ops(dev);
        set_dev_domain_options(dev);
index f084d8718ac4990fca3c4cbf004d91551b3a8e76..6217b23e85f6ce3824583f6b49fe6c0f46c0ccf9 100644 (file)
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
 /*
  * Convenience functions to obtain memory types and attributes
  */
-u32 efi_mem_type(unsigned long phys_addr)
+int efi_mem_type(unsigned long phys_addr)
 {
        efi_memory_desc_t *md;
 
        if (!efi_enabled(EFI_MEMMAP))
-               return 0;
+               return -ENOTSUPP;
 
        for_each_efi_memory_desc(md) {
                if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
                                  (md->num_pages << EFI_PAGE_SHIFT))))
                        return md->type;
        }
-       return 0;
+       return -EINVAL;
 }
 
 static int __init arch_parse_efi_cmdline(char *str)
index 9bf72f5bfedb6c86c452642ca8635c0b370e64e5..12e83888e5b96714a4ad7bd8b9cc84844dd91ded 100644 (file)
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-       unsigned long pfn, text;
+       unsigned long pfn, text, pf;
        struct page *page;
        unsigned npages;
        pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
        if (efi_enabled(EFI_OLD_MEMMAP))
                return 0;
 
-       efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+       /*
+        * Since the PGD is encrypted, set the encryption mask so that when
+        * this value is loaded into cr3 the PGD will be decrypted during
+        * the pagetable walk.
+        */
+       efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
        pgd = efi_pgd;
 
        /*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
         * phys_efi_set_virtual_address_map().
         */
        pfn = pa_memmap >> PAGE_SHIFT;
-       if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
+       pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+       if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
                pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
                return 1;
        }
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
        text = __pa(_text);
        pfn = text >> PAGE_SHIFT;
 
-       if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
+       pf = _PAGE_RW | _PAGE_ENC;
+       if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
                pr_err("Failed to map kernel text 1:1\n");
                return 1;
        }
index cd4be19c36dc611f482ad5291d27365db47a92c5..1f71980fc5e0fa9983c1687aa4ff81f02eb0b989 100644 (file)
@@ -1,6 +1,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/set_memory.h>
 #include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
 
        base = (unsigned char *)real_mode_header;
 
+       /*
+        * If SME is active, the trampoline area will need to be in
+        * decrypted memory in order to bring up other processors
+        * successfully.
+        */
+       set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
        memcpy(base, real_mode_blob, size);
 
        phys_base = __pa(base);
@@ -100,6 +108,10 @@ static void __init setup_real_mode(void)
        trampoline_cr4_features = &trampoline_header->cr4;
        *trampoline_cr4_features = mmu_cr4_features;
 
+       trampoline_header->flags = 0;
+       if (sme_active())
+               trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
        trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
        trampoline_pgd[0] = trampoline_pgd_entry.pgd;
        trampoline_pgd[511] = init_top_pgt[511].pgd;
index dac7b20d2f9de40f0244f623e8560c304394c178..614fd7064d0a21366c3f27721e610038fda1e9b0 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include <asm/realmode.h>
 #include "realmode.h"
 
        .text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
        movl    %edx, %fs
        movl    %edx, %gs
 
+       /*
+        * Check for memory encryption support. This is a safety net in
+        * case BIOS hasn't done the necessary step of setting the bit in
+        * the MSR for this AP. If SME is active and we've gotten this far
+        * then it is safe for us to set the MSR bit and continue. If we
+        * don't we'll eventually crash trying to execute encrypted
+        * instructions.
+        */
+       bt      $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+       jnc     .Ldone
+       movl    $MSR_K8_SYSCFG, %ecx
+       rdmsr
+       bts     $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+       jc      .Ldone
+
+       /*
+        * Memory encryption is enabled but the SME enable bit for this
+        * CPU has not been set.  It is safe to set it, so do so.
+        */
+       wrmsr
+.Ldone:
+
        movl    pa_tr_cr4, %eax
        movl    %eax, %cr4              # Enable PAE mode
 
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
        tr_start:               .space  8
        GLOBAL(tr_efer)         .space  8
        GLOBAL(tr_cr4)          .space  4
+       GLOBAL(tr_flags)        .space  4
 END(trampoline_header)
 
 #include "trampoline_common.S"
index 027987638e9800cc4cf651a352e6718c9ac26eb4..1ecd419811a2b30a88d1fccc140829292e184035 100644 (file)
@@ -17,6 +17,9 @@ config XEN_PV
        bool "Xen PV guest support"
        default y
        depends on XEN
+       # XEN_PV is not ready to work with 5-level paging.
+       # Changes to hypervisor are also required.
+       depends on !X86_5LEVEL
        select XEN_HAVE_PVMMU
        select XEN_HAVE_VPMU
        help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
 config XEN_PVH
        bool "Support for running as a PVH guest"
        depends on XEN && XEN_PVHVM && ACPI
+       # Pre-built page tables are not ready to handle 5-level paging.
+       depends on !X86_5LEVEL
        def_bool n
index 811e4ddb3f37484180c0099e92b4eb351515d5bc..df1921751aa5cc03f9b0b54f1fa559dfc2d64b28 100644 (file)
@@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void)
        setup_clear_cpu_cap(X86_FEATURE_MTRR);
        setup_clear_cpu_cap(X86_FEATURE_ACC);
        setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+       setup_clear_cpu_cap(X86_FEATURE_SME);
+
+       /*
+        * Xen PV would need some work to support PCID: CR3 handling as well
+        * as xen_flush_tlb_others() would need updating.
+        */
+       setup_clear_cpu_cap(X86_FEATURE_PCID);
 
        if (!xen_initial_domain())
                setup_clear_cpu_cap(X86_FEATURE_ACPI);
index cab28cf2cffbb78d09325c46413f65381ec6c5e3..e437714750f8fccbaa424cfc03a1e46a5053a010 100644 (file)
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
        /* Get the "official" set of cpus referring to our pagetable. */
        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
                for_each_online_cpu(cpu) {
-                       if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
-                           && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+                       if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
                                continue;
                        smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
                }
                return;
        }
-       cpumask_copy(mask, mm_cpumask(mm));
 
        /*
         * It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
         * look at its actual current cr3 value, and force it to flush
         * if needed.
         */
+       cpumask_clear(mask);
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpumask_set_cpu(cpu, mask);
index 72a8e6adebe6c09da07303bafac194236f2df7c6..a7525e95d53fe9d35ce31349d6fb45dc112f9418 100644 (file)
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
 #else
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
        /* Map the p2m table to a 512GB-aligned user address. */
-       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
+       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad (PUD_SIZE * PTRS_PER_PUD))
 #endif
 #ifdef CONFIG_XEN_PV
        ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
index 5c8aa9cf62d70de12b240e62887aaf59311e4a99..fe3d2a40f3111bea5440f00ee25ba92df0ef3e64 100644 (file)
@@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 static void acpi_idle_enter_bm(struct acpi_processor *pr,
                               struct acpi_processor_cx *cx, bool timer_bc)
 {
-       acpi_unlazy_tlb(smp_processor_id());
-
        /*
         * Must be done before busmaster disable as we might need to
         * access HPET !
index ef76e5eecf0b090d7488f0b73b9cb4d6248da09f..d5de6ee8466d51532d041ae5ff9b07dfd2d03ca5 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <asm/dmi.h>
 
 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
                              the top entry type is only 8 bits */
@@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
        u8 __iomem *mapped;
        ssize_t wrote = 0;
 
-       mapped = ioremap(sel->access_method_address, sel->area_length);
+       mapped = dmi_remap(sel->access_method_address, sel->area_length);
        if (!mapped)
                return -EIO;
 
@@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
                wrote++;
        }
 
-       iounmap(mapped);
+       dmi_unmap(mapped);
        return wrote;
 }
 
index 045d6d311bde2defc5ebabaf229ff7f71a94f60f..69d4d130e055c28c50155934628a40808e248c00 100644 (file)
@@ -55,6 +55,25 @@ struct efi __read_mostly efi = {
 };
 EXPORT_SYMBOL(efi);
 
+static unsigned long *efi_tables[] = {
+       &efi.mps,
+       &efi.acpi,
+       &efi.acpi20,
+       &efi.smbios,
+       &efi.smbios3,
+       &efi.sal_systab,
+       &efi.boot_info,
+       &efi.hcdp,
+       &efi.uga,
+       &efi.uv_systab,
+       &efi.fw_vendor,
+       &efi.runtime,
+       &efi.config_table,
+       &efi.esrt,
+       &efi.properties_table,
+       &efi.mem_attr_table,
+};
+
 static bool disable_runtime;
 static int __init setup_noefi(char *arg)
 {
@@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status)
        return err;
 }
 
+bool efi_is_table_address(unsigned long phys_addr)
+{
+       unsigned int i;
+
+       if (phys_addr == EFI_INVALID_TABLE_ADDR)
+               return false;
+
+       for (i = 0; i < ARRAY_SIZE(efi_tables); i++)
+               if (*(efi_tables[i]) == phys_addr)
+                       return true;
+
+       return false;
+}
+
 #ifdef CONFIG_KEXEC
 static int update_efi_random_seed(struct notifier_block *nb,
                                  unsigned long code, void *unused)
index 75273a2516039116da1d706d5f44565cadc2b41a..e83d6aec0c1376bc263352f9feec7ab40cb31c78 100644 (file)
@@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline)
        if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
                return -ENODEV;
 
-       pcdp = early_ioremap(efi.hcdp, 4096);
+       pcdp = early_memremap(efi.hcdp, 4096);
        printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp);
 
        if (strstr(cmdline, "console=hcdp")) {
@@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline)
        }
 
 out:
-       early_iounmap(pcdp, 4096);
+       early_memunmap(pcdp, 4096);
        return rc;
 }
index 8dc11064253d9e5ed58f8c817a471b36c25c5951..7a61a07ac4de97643199b09cbdaf185e80b9edaf 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/pagemap.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 #include <drm/drmP.h>
 #include <drm/drm_vma_manager.h>
 #include <drm/drm_gem.h>
@@ -928,6 +929,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size,
        vma->vm_ops = dev->driver->gem_vm_ops;
        vma->vm_private_data = obj;
        vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
        /* Take a ref for this mapping of the object, so that the fault
         * handler can dereference the mmap offset's pointer to the object.
index 1170b3209a1269aff7c1cfa3692b0fe73b413bb8..ed4bcbfd60864ca46064620cd1c9b83e0b1c8c25 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/efi.h>
 #include <linux/slab.h>
 #endif
+#include <linux/mem_encrypt.h>
 #include <asm/pgtable.h>
 #include "drm_internal.h"
 #include "drm_legacy.h"
@@ -58,6 +59,9 @@ static pgprot_t drm_io_prot(struct drm_local_map *map,
 {
        pgprot_t tmp = vm_get_page_prot(vma->vm_flags);
 
+       /* We don't want graphics memory to be mapped encrypted */
+       tmp = pgprot_decrypted(tmp);
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
        if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
                tmp = pgprot_noncached(tmp);
index b442d12f2f7d64819faff9eace7bdc51fa199f89..84fb009d4eb045eeafb1e731faf418107613c69e 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
 
 #define TTM_BO_VM_NUM_PREFAULT 16
 
@@ -230,9 +231,11 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
         * first page.
         */
        for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
-               if (bo->mem.bus.is_iomem)
+               if (bo->mem.bus.is_iomem) {
+                       /* Iomem should not be marked encrypted */
+                       cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
                        pfn = bdev->driver->io_mem_pfn(bo, page_offset);
-               else {
+               } else {
                        page = ttm->pages[page_offset];
                        if (unlikely(!page && i == 0)) {
                                retval = VM_FAULT_OOM;
index 4a650036256444ed84d1d2ef4e349236d5eb5f26..92e1690e28de90faf99d3fb0d2eec7308aa9d693 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/fb.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 
 #include <drm/drmP.h>
 #include <drm/drm_crtc.h>
@@ -169,6 +170,9 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        pr_notice("mmap() framebuffer addr:%lu size:%lu\n",
                  pos, size);
 
+       /* We don't want the framebuffer to be mapped encrypted */
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
        while (size > 0) {
                page = vmalloc_to_pfn((void *)pos);
                if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
index c2ae819a871cb6d8f09412702e46463397f9fc0f..e87ffb3c31a92faeb29ca07b47397f2ad5504120 100644 (file)
@@ -913,16 +913,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
        struct cpuidle_state *state = &drv->states[index];
        unsigned long eax = flg2MWAIT(state->flags);
        unsigned int cstate;
-       int cpu = smp_processor_id();
 
        cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
        /*
-        * leave_mm() to avoid costly and often unnecessary wakeups
-        * for flushing the user TLB's associated with the active mm.
+        * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
+        * will probably flush the TLB.  It's not guaranteed to flush
+        * the TLB, though, so it's not clear that we can do anything
+        * useful with this knowledge.
         */
-       if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
-               leave_mm(cpu);
 
        if (!(lapic_timer_reliable_states & (1 << (cstate))))
                tick_broadcast_enter();
index 354cbd6392cdf261ba657548ed2c208a09ddf50f..4ad7e5e31943db7b1d1d90850fac652fbfeb8ed6 100644 (file)
@@ -575,7 +575,7 @@ static void dump_dte_entry(u16 devid)
 
 static void dump_command(unsigned long phys_addr)
 {
-       struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+       struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
        int i;
 
        for (i = 0; i < 4; ++i)
@@ -919,11 +919,13 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu,
 
 static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
 {
+       u64 paddr = iommu_virt_to_phys((void *)address);
+
        WARN_ON(address & 0x7ULL);
 
        memset(cmd, 0, sizeof(*cmd));
-       cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
-       cmd->data[1] = upper_32_bits(__pa(address));
+       cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
+       cmd->data[1] = upper_32_bits(paddr);
        cmd->data[2] = 1;
        CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
 }
@@ -1383,7 +1385,7 @@ static bool increase_address_space(struct protection_domain *domain,
                return false;
 
        *pte             = PM_LEVEL_PDE(domain->mode,
-                                       virt_to_phys(domain->pt_root));
+                                       iommu_virt_to_phys(domain->pt_root));
        domain->pt_root  = pte;
        domain->mode    += 1;
        domain->updated  = true;
@@ -1420,7 +1422,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
                        if (!page)
                                return NULL;
 
-                       __npte = PM_LEVEL_PDE(level, virt_to_phys(page));
+                       __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
 
                        /* pte could have been changed somewhere. */
                        if (cmpxchg64(pte, __pte, __npte) != __pte) {
@@ -1536,10 +1538,10 @@ static int iommu_map_page(struct protection_domain *dom,
                        return -EBUSY;
 
        if (count > 1) {
-               __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+               __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
                __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
        } else
-               __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
+               __pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC;
 
        if (prot & IOMMU_PROT_IR)
                __pte |= IOMMU_PTE_IR;
@@ -1755,7 +1757,7 @@ static void free_gcr3_tbl_level1(u64 *tbl)
                if (!(tbl[i] & GCR3_VALID))
                        continue;
 
-               ptr = __va(tbl[i] & PAGE_MASK);
+               ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
                free_page((unsigned long)ptr);
        }
@@ -1770,7 +1772,7 @@ static void free_gcr3_tbl_level2(u64 *tbl)
                if (!(tbl[i] & GCR3_VALID))
                        continue;
 
-               ptr = __va(tbl[i] & PAGE_MASK);
+               ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
                free_gcr3_tbl_level1(ptr);
        }
@@ -2049,7 +2051,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
        u64 flags = 0;
 
        if (domain->mode != PAGE_MODE_NONE)
-               pte_root = virt_to_phys(domain->pt_root);
+               pte_root = iommu_virt_to_phys(domain->pt_root);
 
        pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
                    << DEV_ENTRY_MODE_SHIFT;
@@ -2061,7 +2063,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
                flags |= DTE_FLAG_IOTLB;
 
        if (domain->flags & PD_IOMMUV2_MASK) {
-               u64 gcr3 = __pa(domain->gcr3_tbl);
+               u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
                u64 glx  = domain->glx;
                u64 tmp;
 
@@ -3606,10 +3608,10 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
                        if (root == NULL)
                                return NULL;
 
-                       *pte = __pa(root) | GCR3_VALID;
+                       *pte = iommu_virt_to_phys(root) | GCR3_VALID;
                }
 
-               root = __va(*pte & PAGE_MASK);
+               root = iommu_phys_to_virt(*pte & PAGE_MASK);
 
                level -= 1;
        }
@@ -3788,7 +3790,7 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
 
        dte     = amd_iommu_dev_table[devid].data[2];
        dte     &= ~DTE_IRQ_PHYS_ADDR_MASK;
-       dte     |= virt_to_phys(table->table);
+       dte     |= iommu_virt_to_phys(table->table);
        dte     |= DTE_IRQ_REMAP_INTCTL;
        dte     |= DTE_IRQ_TABLE_LEN;
        dte     |= DTE_IRQ_REMAP_ENABLE;
index 372303700566f4f984e6656bd937e2d99bf07cc6..2292a6cece76e02e73411935c58f5d35387f60fc 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/iommu.h>
 #include <linux/kmemleak.h>
 #include <linux/crash_dump.h>
+#include <linux/mem_encrypt.h>
 #include <asm/pci-direct.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -348,7 +349,7 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
 
        BUG_ON(iommu->mmio_base == NULL);
 
-       entry = virt_to_phys(amd_iommu_dev_table);
+       entry = iommu_virt_to_phys(amd_iommu_dev_table);
        entry |= (dev_table_size >> 12) - 1;
        memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
                        &entry, sizeof(entry));
@@ -606,7 +607,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 
        BUG_ON(iommu->cmd_buf == NULL);
 
-       entry = (u64)virt_to_phys(iommu->cmd_buf);
+       entry = iommu_virt_to_phys(iommu->cmd_buf);
        entry |= MMIO_CMD_SIZE_512;
 
        memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -635,7 +636,7 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
 
        BUG_ON(iommu->evt_buf == NULL);
 
-       entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+       entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
 
        memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
                    &entry, sizeof(entry));
@@ -668,7 +669,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
        if (iommu->ppr_log == NULL)
                return;
 
-       entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+       entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
 
        memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
                    &entry, sizeof(entry));
@@ -748,10 +749,10 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
        if (!iommu->ga_log_tail)
                goto err_out;
 
-       entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+       entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
        memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
                    &entry, sizeof(entry));
-       entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+       entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
        memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
                    &entry, sizeof(entry));
        writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
@@ -2564,6 +2565,24 @@ static int __init amd_iommu_init(void)
        return ret;
 }
 
+static bool amd_iommu_sme_check(void)
+{
+       if (!sme_active() || (boot_cpu_data.x86 != 0x17))
+               return true;
+
+       /* For Fam17h, a specific level of support is required */
+       if (boot_cpu_data.microcode >= 0x08001205)
+               return true;
+
+       if ((boot_cpu_data.microcode >= 0x08001126) &&
+           (boot_cpu_data.microcode <= 0x080011ff))
+               return true;
+
+       pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n");
+
+       return false;
+}
+
 /****************************************************************************
  *
  * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -2578,6 +2597,9 @@ int __init amd_iommu_detect(void)
        if (no_iommu || (iommu_detected && !gart_iommu_aperture))
                return -ENODEV;
 
+       if (!amd_iommu_sme_check())
+               return -ENODEV;
+
        ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
        if (ret)
                return ret;
index 466260f8a1df37bb79738d4c8f91b90568cdd7c6..3f12fb2338ea5bbc1850f779fd98fff92808d7c0 100644 (file)
@@ -87,4 +87,14 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
        return !!(iommu->features & f);
 }
 
+static inline u64 iommu_virt_to_phys(void *vaddr)
+{
+       return (u64)__sme_set(virt_to_phys(vaddr));
+}
+
+static inline void *iommu_phys_to_virt(unsigned long paddr)
+{
+       return phys_to_virt(__sme_clr(paddr));
+}
+
 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
index 294a409e283b7ae4b52c59350756983711bba9d1..8591f43c467c9c93aa1c3dcf5d8e64d2f3bcdbd5 100644 (file)
 
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
 #define IOMMU_PROT_MASK 0x03
index 296db7a69c27548c2529c52d532832c39f0db1aa..153b3f3cc795fdd5b7d0e31ff5a18043b738b4c8 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/slab.h>
+#include <linux/io.h>
 
 #include "sfi_core.h"
 
@@ -86,13 +87,13 @@ static struct sfi_table_simple *syst_va __read_mostly;
 /*
  * FW creates and saves the SFI tables in memory. When these tables get
  * used, they may need to be mapped to virtual address space, and the mapping
- * can happen before or after the ioremap() is ready, so a flag is needed
+ * can happen before or after the memremap() is ready, so a flag is needed
 * to indicate this
 */
  */
-static u32 sfi_use_ioremap __read_mostly;
+static u32 sfi_use_memremap __read_mostly;
 
 /*
- * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function
+ * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
  * and introduces section mismatch. So use __ref to make it calm.
  */
 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
@@ -100,10 +101,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
        if (!phys || !size)
                return NULL;
 
-       if (sfi_use_ioremap)
-               return ioremap_cache(phys, size);
+       if (sfi_use_memremap)
+               return memremap(phys, size, MEMREMAP_WB);
        else
-               return early_ioremap(phys, size);
+               return early_memremap(phys, size);
 }
 
 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
@@ -111,10 +112,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
        if (!virt || !size)
                return;
 
-       if (sfi_use_ioremap)
-               iounmap(virt);
+       if (sfi_use_memremap)
+               memunmap(virt);
        else
-               early_iounmap(virt, size);
+               early_memunmap(virt, size);
 }
 
 static void sfi_print_table_header(unsigned long long pa,
@@ -507,8 +508,8 @@ void __init sfi_init_late(void)
        length = syst_va->header.len;
        sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
 
-       /* Use ioremap now after it is ready */
-       sfi_use_ioremap = 1;
+       /* Use memremap now after it is ready */
+       sfi_use_memremap = 1;
        syst_va = sfi_map_memory(syst_pa, length);
 
        sfi_acpi_init();
index 7a42238db446b0093323505b710963ac94a53e15..25e862c487f643f97353abb607caf23ea336a757 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/device.h>
 #include <linux/efi.h>
 #include <linux/fb.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/fb.h>
 
@@ -1396,6 +1397,12 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
        mutex_lock(&info->mm_lock);
        if (fb->fb_mmap) {
                int res;
+
+               /*
+                * The framebuffer needs to be accessed decrypted, be sure
+                * SME protection is removed ahead of the call
+                */
+               vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
                res = fb->fb_mmap(info, vma);
                mutex_unlock(&info->mm_lock);
                return res;
@@ -1421,6 +1428,11 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
        mutex_unlock(&info->mm_lock);
 
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+       /*
+        * The framebuffer needs to be accessed decrypted, be sure
+        * SME protection is removed
+        */
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
        fb_pgprotect(file, vma, start);
 
        return vm_iomap_memory(vma, start, len);
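pgprot_decrypted() clears the SME encryption bit from the vma protection so that user mappings of the framebuffer aperture do not go through an encrypted page-table entry (framebuffer contents are never encrypted). A driver that maps device memory to user space itself would make the same one-line adjustment before the remap; this is a hedged sketch, the mydrv_* name is a placeholder and bounds checking is omitted:

    static int mydrv_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
    {
            unsigned long start = info->fix.smem_start;

            /* framebuffer memory must be accessed decrypted */
            vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

            return vm_iomap_memory(vma, start, info->fix.smem_len);
    }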
index 734ad4db388c6d922fb812391f913cbdda710f12..2edef8d7fa6b8a1fe65bacb8e0d110f55b5e803f 100644 (file)
@@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr,
                            unsigned long size);
 extern void *early_memremap_ro(resource_size_t phys_addr,
                               unsigned long size);
+extern void *early_memremap_prot(resource_size_t phys_addr,
+                                unsigned long size, unsigned long prot_val);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
index 7dfa767dc68012ac52ac81d48e92b3ec79c97311..4d7bb98f41340f52881f78a4d8e4b9dc2f21600f 100644 (file)
@@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
 #endif /* CONFIG_MMU */
 
+/*
+ * No-op macros that just return the current protection value. Defined here
+ * because these macros can be used even if CONFIG_MMU is not defined.
+ */
+#ifndef pgprot_encrypted
+#define pgprot_encrypted(prot) (prot)
+#endif
+
+#ifndef pgprot_decrypted
+#define pgprot_decrypted(prot) (prot)
+#endif
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
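These fallbacks keep pgprot_encrypted()/pgprot_decrypted() available to generic code on architectures (and !CONFIG_MMU builds) that know nothing about memory encryption. An architecture that does support it overrides them; roughly how the x86 side of this series does so, shown only as a sketch built on the __sme_set()/__sme_clr() helpers:

    /* sketch of an arch override (x86-style), in <asm/pgtable_types.h> */
    #define pgprot_encrypted(prot)  __pgprot(__sme_set(pgprot_val(prot)))
    #define pgprot_decrypted(prot)  __pgprot(__sme_clr(pgprot_val(prot)))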
index bdb80c4aef6e13631075b2ad06b39d2788c8eba5..71b86a5d3061dda87c7ab3ee95211c9512adbea0 100644 (file)
 
 #if GCC_VERSION >= 40100
 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+
+#define __nostackprotector     __attribute__((__optimize__("no-stack-protector")))
 #endif
 
 #if GCC_VERSION >= 40300
index eca8ad75e28b054db4657d5e562b3904120b042e..43cac547f773d2af821f69cd03610861d33bc571 100644 (file)
@@ -475,6 +475,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __visible
 #endif
 
+#ifndef __nostackprotector
+# define __nostackprotector
+#endif
+
 /*
  * Assume alignment of return value.
  */
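__nostackprotector exists for code that must run before the stack-protector canary is usable (for example very early SME bring-up running from the identity mapping), where a compiler-inserted canary check could not work. A usage sketch; the function name is invented for illustration:

    /* sketch: opt one early-boot function out of -fstack-protector */
    static void __init __nostackprotector early_mem_setup_example(void)
    {
            /* runs before the boot CPU's stack canary is initialized */
    }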
index 03c0196a6f2474ea4e34e9840638ff80a45370ec..2189c79cde5d5b0f0f76bb6b9e4211e427f8d6d9 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/scatterlist.h>
 #include <linux/kmemcheck.h>
 #include <linux/bug.h>
+#include <linux/mem_encrypt.h>
 
 /**
  * List of possible attributes associated with a DMA mapping. The semantics
@@ -572,6 +573,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
        return 0;
 }
 
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+       if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+               dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
+
 static inline int dma_supported(struct device *dev, u64 mask)
 {
        const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -588,6 +595,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 {
        if (!dev->dma_mask || !dma_supported(dev, mask))
                return -EIO;
+
+       dma_check_mask(dev, mask);
+
        *dev->dma_mask = mask;
        return 0;
 }
@@ -607,6 +617,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
        if (!dma_supported(dev, mask))
                return -EIO;
+
+       dma_check_mask(dev, mask);
+
        dev->coherent_dma_mask = mask;
        return 0;
 }
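dma_check_mask() only warns; it does not fail the mask change. With SME active, sme_get_me_mask() is a single high physical-address bit, so ((mask << 1) - 1) is the highest address a device must reach to avoid bouncing. A worked example, assuming (hypothetically) that the encryption bit is physical address bit 47:

    /* hypothetical driver snippet: a 32-bit-only device under SME */
    static int example_setup_dma(struct device *dev)
    {
            /*
             * sme_get_me_mask() == 1ULL << 47  =>  limit = 0x0000ffffffffffff.
             * DMA_BIT_MASK(32) is far below that, so dma_check_mask() prints
             * the "device will require DMA bounce buffers" warning and the
             * device's DMA goes through SWIOTLB.
             */
            return dma_set_mask(dev, DMA_BIT_MASK(32));
    }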
index 8269bcb8ccf7961bd01f52e39fc4d34d370f25d3..4e47f78430bece2a3015f2fe1a74c88ae339f9e7 100644 (file)
@@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { }
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
                                   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
-extern u32 efi_mem_type (unsigned long phys_addr);
+extern int efi_mem_type(unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int __init efi_uart_console_only (void);
@@ -1091,6 +1091,8 @@ static inline bool efi_enabled(int feature)
        return test_bit(feature, &efi.flags) != 0;
 }
 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused);
+
+extern bool efi_is_table_address(unsigned long phys_addr);
 #else
 static inline bool efi_enabled(int feature)
 {
@@ -1104,6 +1106,11 @@ efi_capsule_pending(int *reset_type)
 {
        return false;
 }
+
+static inline bool efi_is_table_address(unsigned long phys_addr)
+{
+       return false;
+}
 #endif
 
 extern int efi_status_to_err(efi_status_t status);
index 2195d9ea4aaae0c054f04aab2da7cff851d2b997..32e30e8fb9db92cf1472c6188ff3310a610b3c5f 100644 (file)
@@ -157,6 +157,8 @@ enum {
        MEMREMAP_WB = 1 << 0,
        MEMREMAP_WT = 1 << 1,
        MEMREMAP_WC = 1 << 2,
+       MEMREMAP_ENC = 1 << 3,
+       MEMREMAP_DEC = 1 << 4,
 };
 
 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
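MEMREMAP_ENC and MEMREMAP_DEC let a caller request an explicitly encrypted or decrypted mapping instead of inheriting the linear map's attributes; they are consumed by the arch code and by try_ram_remap() later in this series. A hedged sketch of mapping a region that firmware populated unencrypted; fw_phys and fw_size are placeholders:

    /* sketch: map firmware-written (unencrypted) data with the C-bit cleared */
    static void *map_fw_region(resource_size_t fw_phys, size_t fw_size)
    {
            return memremap(fw_phys, fw_size, MEMREMAP_WB | MEMREMAP_DEC);
    }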
index dd056fab9e35c958c7aee14156e1df5640fb2556..2b7590f5483a1fc4474fbecddd099977222531cb 100644 (file)
@@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry)
        return phys_to_virt(boot_phys_to_phys(entry));
 }
 
+#ifndef arch_kexec_post_alloc_pages
+static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; }
+#endif
+
+#ifndef arch_kexec_pre_free_pages
+static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
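The two hooks default to no-ops so generic kexec code can call them unconditionally; an architecture with memory encryption overrides them (and defines the matching macro so the inline stubs are skipped) to flip the control pages' attributes. Roughly what the x86 part of this series does, shown as a simplified sketch using set_memory_decrypted()/set_memory_encrypted() from <asm/set_memory.h>:

    /* sketch of an arch implementation (x86-style) */
    int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
    {
            /* the new kernel will initially access these pages unencrypted */
            return set_memory_decrypted((unsigned long)vaddr, pages);
    }

    void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
    {
            /* restore the encrypted attribute before freeing */
            set_memory_encrypted((unsigned long)vaddr, pages);
    }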
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
new file mode 100644 (file)
index 0000000..1255f09
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __MEM_ENCRYPT_H__
+#define __MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+
+#include <asm/mem_encrypt.h>
+
+#else  /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+#define sme_me_mask    0UL
+
+#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+static inline bool sme_active(void)
+{
+       return !!sme_me_mask;
+}
+
+static inline unsigned long sme_get_me_mask(void)
+{
+       return sme_me_mask;
+}
+
+/*
+ * The __sme_set() and __sme_clr() macros are useful for adding or removing
+ * the encryption mask from a value (e.g. when dealing with pagetable
+ * entries).
+ */
+#define __sme_set(x)           ((unsigned long)(x) | sme_me_mask)
+#define __sme_clr(x)           ((unsigned long)(x) & ~sme_me_mask)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __MEM_ENCRYPT_H__ */
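__sme_set() and __sme_clr() are plain bit operations against sme_me_mask, and both collapse to no-ops when SME is inactive (mask 0). A standalone demonstration of the arithmetic with a hypothetical mask value; the real bit position is reported by CPUID and is not hard-coded anywhere:

    #include <stdio.h>

    int main(void)
    {
            unsigned long sme_me_mask = 1UL << 47;  /* hypothetical C-bit position */
            unsigned long paddr = 0x12345000UL;

            unsigned long enc = paddr | sme_me_mask;   /* __sme_set(paddr) */
            unsigned long dec = enc & ~sme_me_mask;    /* __sme_clr(enc)   */

            printf("encrypted: %#lx\n", enc);          /* 0x800012345000   */
            printf("decrypted: %#lx\n", dec);          /* 0x12345000       */
            return 0;
    }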
index e030a68ead7e211fcd1e19277fd3708e665bf78b..25438b2b6f223fb29989c41e1ce8ff854c425d10 100644 (file)
@@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
+#ifdef arch_unmap_kpfn
+extern void arch_unmap_kpfn(unsigned long pfn);
+#else
+static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
+#endif
+
 #endif
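arch_unmap_kpfn() lets the architecture pull a hardware-poisoned page out of the kernel direct map before memory_failure() handles it (relevant with SME, where a stale encrypted alias could re-trigger the poison). A hypothetical implementation, not taken from this series, might simply mark the direct-map page not-present; the arch would also #define arch_unmap_kpfn so the inline stub above is not used:

    /* hypothetical arch hook: remove the poisoned pfn from the direct map */
    void arch_unmap_kpfn(unsigned long pfn)
    {
            set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
    }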
index 4ee479f2f355b1fa2658b1c2aabbdfc376125651..15e7160751a85a4be938f860257d05595b03105d 100644 (file)
@@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
+extern void __init swiotlb_update_mem_attributes(void);
 
 /*
  * Enumeration for sync targets
index 052481fbe3633f64b420c5bbd6deea3be261e6a9..9789ab7fe85e14f7434be3b0298a9ebf920242e0 100644 (file)
@@ -488,6 +488,8 @@ void __init __weak thread_stack_cache_init(void)
 }
 #endif
 
+void __init __weak mem_encrypt_init(void) { }
+
 /*
  * Set up kernel memory allocators
  */
@@ -641,6 +643,14 @@ asmlinkage __visible void __init start_kernel(void)
         */
        locking_selftest();
 
+       /*
+        * This needs to be called before any devices perform DMA
+        * operations that might use the SWIOTLB bounce buffers. It will
+        * mark the bounce buffers as decrypted so that their usage will
+        * not cause "plain-text" data to be decrypted when accessed.
+        */
+       mem_encrypt_init();
+
 #ifdef CONFIG_BLK_DEV_INITRD
        if (initrd_start && !initrd_below_start_ok &&
            page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
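mem_encrypt_init() is the weak hook overridden by architectures with memory encryption; by this point in start_kernel() the set_memory_*() machinery works, so the already-allocated SWIOTLB buffers can be re-marked decrypted. A simplified sketch of the x86-style override (log text paraphrased):

    /* sketch of an arch override (x86-style) */
    void __init mem_encrypt_init(void)
    {
            if (!sme_me_mask)
                    return;

            /* flip the SWIOTLB bounce buffers to decrypted mappings */
            swiotlb_update_mem_attributes();

            pr_info("AMD Secure Memory Encryption (SME) active\n");
    }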
index 1ae7c41c33c19c54e4b08d33c0c59da78244efba..20fef1a38602d9d0ed6fdb5d359d5604fbafc3dd 100644 (file)
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
        struct page *pages;
 
-       pages = alloc_pages(gfp_mask, order);
+       pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
        if (pages) {
                unsigned int count, i;
 
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
+
+               arch_kexec_post_alloc_pages(page_address(pages), count,
+                                           gfp_mask);
+
+               if (gfp_mask & __GFP_ZERO)
+                       for (i = 0; i < count; i++)
+                               clear_highpage(pages + i);
        }
 
        return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
 
        order = page_private(page);
        count = 1 << order;
+
+       arch_kexec_pre_free_pages(page_address(page), count);
+
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
        __free_pages(page, order);
index 124bed776532d3d2e69f4079e3f2d23da193fd2e..9afdc434fb490a3384d847bc50647fa3dd3ab16a 100644 (file)
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
 }
 #endif
 
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+                                       unsigned long flags)
+{
+       return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+                          unsigned long flags)
 {
        unsigned long pfn = PHYS_PFN(offset);
 
        /* In the simple case just return the existing linear address */
-       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+           arch_memremap_can_ram_remap(offset, size, flags))
                return __va(offset);
+
        return NULL; /* fallback to arch_memremap_wb */
 }
 
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * memremap() - remap an iomem_resource as cacheable memory
  * @offset: iomem resource start address
  * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ *               MEMREMAP_ENC, MEMREMAP_DEC
  *
  * memremap() is "ioremap" for cases where it is known that the resource
  * being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                 * the requested range is potentially in System RAM.
                 */
                if (is_ram == REGION_INTERSECTS)
-                       addr = try_ram_remap(offset, size);
+                       addr = try_ram_remap(offset, size, flags);
                if (!addr)
                        addr = arch_memremap_wb(offset, size);
        }
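arch_memremap_can_ram_remap() lets an architecture veto the linear-map shortcut when the caller's requested encryption attribute differs from how System RAM is already mapped, forcing memremap() to fall back to arch_memremap_wb() and build a fresh mapping. A simplified sketch of such an override, keeping only the flag handling (the real x86 version also inspects boot and EFI data regions):

    /* simplified sketch of an arch override */
    bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
                                     unsigned long flags)
    {
            if (!sme_active())
                    return true;            /* linear map is fine as-is   */
            if (flags & MEMREMAP_ENC)
                    return true;            /* RAM is mapped encrypted    */
            if (flags & MEMREMAP_DEC)
                    return false;           /* need a decrypted mapping   */

            return true;
    }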
index a8d74a733a38b54a912f5e292f0a15a2cfff4a95..8c6c83ef57a43336e0a33a52f691e3323eb8f3f4 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
        return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+                                     phys_addr_t address)
+{
+       return __sme_clr(phys_to_dma(hwdev, address));
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
                                      volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
               bytes >> 20, vstart, vend - 1);
 }
 
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations.  This function allows the architecture to
+ * call SWIOTLB when the operations are possible.  It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+       void *vaddr;
+       unsigned long bytes;
+
+       if (no_iotlb_memory || late_alloc)
+               return;
+
+       vaddr = phys_to_virt(io_tlb_start);
+       bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+       swiotlb_set_mem_attributes(vaddr, bytes);
+       memset(vaddr, 0, bytes);
+
+       vaddr = phys_to_virt(io_tlb_overflow_buffer);
+       bytes = PAGE_ALIGN(io_tlb_overflow);
+       swiotlb_set_mem_attributes(vaddr, bytes);
+       memset(vaddr, 0, bytes);
+}
+
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
        void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
        io_tlb_start = virt_to_phys(tlb);
        io_tlb_end = io_tlb_start + bytes;
 
+       swiotlb_set_mem_attributes(tlb, bytes);
        memset(tlb, 0, bytes);
 
        /*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
        if (!v_overflow_buffer)
                goto cleanup2;
 
+       swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+       memset(v_overflow_buffer, 0, io_tlb_overflow);
        io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
 
        /*
@@ -469,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
        if (no_iotlb_memory)
                panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
+       if (sme_active())
+               pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
        mask = dma_get_seg_boundary(hwdev);
 
        tbl_dma_addr &= mask;
@@ -581,7 +622,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
                return SWIOTLB_MAP_ERROR;
        }
 
-       start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+       start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
        return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
                                      dir, attrs);
 }
@@ -702,7 +743,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                        goto err_warn;
 
                ret = phys_to_virt(paddr);
-               dev_addr = phys_to_dma(hwdev, paddr);
+               dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
 
                /* Confirm address can be DMA'd by device */
                if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +853,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
        map = map_single(dev, phys, size, dir, attrs);
        if (map == SWIOTLB_MAP_ERROR) {
                swiotlb_full(dev, size, dir, 1);
-               return phys_to_dma(dev, io_tlb_overflow_buffer);
+               return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
        }
 
-       dev_addr = phys_to_dma(dev, map);
+       dev_addr = swiotlb_phys_to_dma(dev, map);
 
        /* Ensure that the address returned is DMA'ble */
        if (dma_capable(dev, dev_addr, size))
@@ -824,7 +865,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
        attrs |= DMA_ATTR_SKIP_CPU_SYNC;
        swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
 
-       return phys_to_dma(dev, io_tlb_overflow_buffer);
+       return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -958,7 +999,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
                                sg_dma_len(sgl) = 0;
                                return 0;
                        }
-                       sg->dma_address = phys_to_dma(hwdev, map);
+                       sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
                } else
                        sg->dma_address = dev_addr;
                sg_dma_len(sg) = sg->length;
@@ -1026,7 +1067,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-       return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+       return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
 }
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
@@ -1039,6 +1080,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-       return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+       return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL(swiotlb_dma_supported);
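swiotlb_set_mem_attributes() is the weak hook that swiotlb_update_mem_attributes() and the late-init path rely on; an SME-capable architecture implements it by clearing the encryption attribute on the (page-aligned) bounce-buffer range. A hedged sketch of such an override, using the set_memory_decrypted() helper the x86 side of this series provides:

    /* sketch of an arch override (x86-style) */
    void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
    {
            /* callers pass a page-aligned size */
            set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
    }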
index 6d5717bd7197ba0428c32941df27d90da04264b5..b1dd4a948fc0b3afc375964d23ebcb9f69eaafa3 100644 (file)
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static int after_paging_init __initdata;
 
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+                                                   unsigned long size,
+                                                   pgprot_t prot)
+{
+       return prot;
+}
+
 void __init __weak early_ioremap_shutdown(void)
 {
 }
@@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
 void __init *
 early_memremap(resource_size_t phys_addr, unsigned long size)
 {
-       return (__force void *)__early_ioremap(phys_addr, size,
-                                              FIXMAP_PAGE_NORMAL);
+       pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+                                                    FIXMAP_PAGE_NORMAL);
+
+       return (__force void *)__early_ioremap(phys_addr, size, prot);
 }
 #ifdef FIXMAP_PAGE_RO
 void __init *
 early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 {
-       return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+       pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+                                                    FIXMAP_PAGE_RO);
+
+       return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+                   unsigned long prot_val)
+{
+       return (__force void *)__early_ioremap(phys_addr, size,
+                                              __pgprot(prot_val));
 }
 #endif
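early_memremap_pgprot_adjust() gives the architecture a chance to rewrite the protection that every early_memremap()/early_memremap_ro() call will use; with SME the decision is whether the mapping should carry the encryption bit. A heavily simplified sketch of an override (the real x86 version also special-cases boot data, the initrd and EFI regions):

    /* simplified sketch of an arch override */
    pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
                                                 unsigned long size,
                                                 pgprot_t prot)
    {
            if (!sme_active())
                    return prot;

            /* default policy in this sketch: map early RAM encrypted */
            return __pgprot(__sme_set(pgprot_val(prot)));
    }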
 
index 1cd3b3569af8a79285b75bfdb2485b7de7a69aa8..88366626c0b7c790d92514f32cb3cea701dd0ffb 100644 (file)
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                return 0;
        }
 
+       arch_unmap_kpfn(pfn);
+
        orig_head = hpage = compound_head(p);
        num_poisoned_pages_inc();