tracing-fix-possible-double-free-on-failure-of-allocating-trace-buffer.patch
tracing-fix-crash-when-it-fails-to-alloc-ring-buffer.patch
x86-cpufeatures-add-x86_bug_cpu_insecure.patch
+x86-mm-pti-disable-global-pages-if-page_table_isolation-y.patch
+x86-mm-pti-prepare-the-x86-entry-assembly-code-for-entry-exit-cr3-switching.patch
+x86-mm-pti-add-infrastructure-for-page-table-isolation.patch
+x86-pti-add-the-pti-cmdline-option-and-documentation.patch
+x86-mm-pti-add-mapping-helper-functions.patch
+x86-mm-pti-allow-nx-poison-to-be-set-in-p4d-pgd.patch
+x86-mm-pti-allocate-a-separate-user-pgd.patch
+x86-mm-pti-populate-user-pgd.patch
+x86-mm-pti-add-functions-to-clone-kernel-pmds.patch
+x86-mm-pti-force-entry-through-trampoline-when-pti-active.patch
+x86-mm-pti-share-cpu_entry_area-with-user-space-page-tables.patch
+x86-entry-align-entry-text-section-to-pmd-boundary.patch
+x86-mm-pti-share-entry-text-pmd.patch
+x86-mm-pti-map-espfix-into-user-space.patch
+x86-cpu_entry_area-add-debugstore-entries-to-cpu_entry_area.patch
+x86-events-intel-ds-map-debug-buffers-in-cpu_entry_area.patch
+x86-mm-64-make-a-full-pgd-entry-size-hole-in-the-memory-map.patch
+x86-pti-put-the-ldt-in-its-own-pgd-if-pti-is-on.patch
+x86-pti-map-the-vsyscall-page-if-needed.patch
+x86-mm-allow-flushing-for-future-asid-switches.patch
+x86-mm-abstract-switching-cr3.patch
+x86-mm-use-fix-pcid-to-optimize-user-kernel-switches.patch
+x86-mm-optimize-restore_cr3.patch
+x86-mm-use-invpcid-for-__native_flush_tlb_single.patch
+x86-mm-clarify-the-whole-asid-kernel-pcid-user-pcid-naming.patch
+x86-dumpstack-indicate-in-oops-whether-pti-is-configured-and-enabled.patch
+x86-mm-pti-add-kconfig.patch
+x86-mm-dump_pagetables-add-page-table-directory-to-the-debugfs-vfs-hierarchy.patch
+x86-mm-dump_pagetables-check-user-space-page-table-for-wx-pages.patch
+x86-mm-dump_pagetables-allow-dumping-current-pagetables.patch
+x86-ldt-make-the-ldt-mapping-ro.patch
--- /dev/null
+From 10043e02db7f8a4161f76434931051e7d797a5f6 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:49 +0100
+Subject: x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 10043e02db7f8a4161f76434931051e7d797a5f6 upstream.
+
+The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual
+addresses which must be visible in any execution context.
+
+So it is required to make these mappings visible to user space when kernel
+page table isolation is active.
+
+Provide enough room for the buffer mappings in the cpu_entry_area so the
+buffers are available in the user space visible page tables.
+
+At the point where the kernel side entry area is populated there is no
+buffer available yet, but the kernel PMD must be populated. To achieve this
+set the entries for these buffers to non present.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/events/intel/ds.c | 5 ++--
+ arch/x86/events/perf_event.h | 21 +------------------
+ arch/x86/include/asm/cpu_entry_area.h | 13 ++++++++++++
+ arch/x86/include/asm/intel_ds.h | 36 ++++++++++++++++++++++++++++++++++
+ arch/x86/mm/cpu_entry_area.c | 27 +++++++++++++++++++++++++
+ 5 files changed, 81 insertions(+), 21 deletions(-)
+
+--- a/arch/x86/events/intel/ds.c
++++ b/arch/x86/events/intel/ds.c
+@@ -8,11 +8,12 @@
+
+ #include "../perf_event.h"
+
++/* Waste a full page so it can be mapped into the cpu_entry_area */
++DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
++
+ /* The size of a BTS record in bytes: */
+ #define BTS_RECORD_SIZE 24
+
+-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+ #define PEBS_FIXUP_SIZE PAGE_SIZE
+
+ /*
+--- a/arch/x86/events/perf_event.h
++++ b/arch/x86/events/perf_event.h
+@@ -14,6 +14,8 @@
+
+ #include <linux/perf_event.h>
+
++#include <asm/intel_ds.h>
++
+ /* To enable MSR tracing please use the generic trace points. */
+
+ /*
+@@ -77,8 +79,6 @@ struct amd_nb {
+ struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+ };
+
+-/* The maximal number of PEBS events: */
+-#define MAX_PEBS_EVENTS 8
+ #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
+
+ /*
+@@ -95,23 +95,6 @@ struct amd_nb {
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
+
+-/*
+- * A debug store configuration.
+- *
+- * We only support architectures that use 64bit fields.
+- */
+-struct debug_store {
+- u64 bts_buffer_base;
+- u64 bts_index;
+- u64 bts_absolute_maximum;
+- u64 bts_interrupt_threshold;
+- u64 pebs_buffer_base;
+- u64 pebs_index;
+- u64 pebs_absolute_maximum;
+- u64 pebs_interrupt_threshold;
+- u64 pebs_event_reset[MAX_PEBS_EVENTS];
+-};
+-
+ #define PEBS_REGS \
+ (PERF_REG_X86_AX | \
+ PERF_REG_X86_BX | \
+--- a/arch/x86/include/asm/cpu_entry_area.h
++++ b/arch/x86/include/asm/cpu_entry_area.h
+@@ -5,6 +5,7 @@
+
+ #include <linux/percpu-defs.h>
+ #include <asm/processor.h>
++#include <asm/intel_ds.h>
+
+ /*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+@@ -40,6 +41,18 @@ struct cpu_entry_area {
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+ #endif
++#ifdef CONFIG_CPU_SUP_INTEL
++ /*
++ * Per CPU debug store for Intel performance monitoring. Wastes a
++ * full page at the moment.
++ */
++ struct debug_store cpu_debug_store;
++ /*
++ * The actual PEBS/BTS buffers must be mapped to user space
++ * Reserve enough fixmap PTEs.
++ */
++ struct debug_store_buffers cpu_debug_buffers;
++#endif
+ };
+
+ #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+--- /dev/null
++++ b/arch/x86/include/asm/intel_ds.h
+@@ -0,0 +1,36 @@
++#ifndef _ASM_INTEL_DS_H
++#define _ASM_INTEL_DS_H
++
++#include <linux/percpu-defs.h>
++
++#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
++#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
++
++/* The maximal number of PEBS events: */
++#define MAX_PEBS_EVENTS 8
++
++/*
++ * A debug store configuration.
++ *
++ * We only support architectures that use 64bit fields.
++ */
++struct debug_store {
++ u64 bts_buffer_base;
++ u64 bts_index;
++ u64 bts_absolute_maximum;
++ u64 bts_interrupt_threshold;
++ u64 pebs_buffer_base;
++ u64 pebs_index;
++ u64 pebs_absolute_maximum;
++ u64 pebs_interrupt_threshold;
++ u64 pebs_event_reset[MAX_PEBS_EVENTS];
++} __aligned(PAGE_SIZE);
++
++DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
++
++struct debug_store_buffers {
++ char bts_buffer[BTS_BUFFER_SIZE];
++ char pebs_buffer[PEBS_BUFFER_SIZE];
++};
++
++#endif
+--- a/arch/x86/mm/cpu_entry_area.c
++++ b/arch/x86/mm/cpu_entry_area.c
+@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, vo
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+ }
+
++static void percpu_setup_debug_store(int cpu)
++{
++#ifdef CONFIG_CPU_SUP_INTEL
++ int npages;
++ void *cea;
++
++ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
++ return;
++
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
++ npages = sizeof(struct debug_store) / PAGE_SIZE;
++ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
++ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
++ PAGE_KERNEL);
++
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
++ /*
++ * Force the population of PMDs for not yet allocated per cpu
++ * memory like debug store buffers.
++ */
++ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
++ for (; npages; npages--, cea += PAGE_SIZE)
++ cea_set_pte(cea, 0, PAGE_NONE);
++#endif
++}
++
+ /* Setup the fixmap mappings only once per-processor */
+ static void __init setup_cpu_entry_area(int cpu)
+ {
+@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+ #endif
++ percpu_setup_debug_store(cpu);
+ }
+
+ static __init void setup_cpu_entry_area_ptes(void)
--- /dev/null
+From 5f26d76c3fd67c48806415ef8b1116c97beff8ba Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 19 Dec 2017 22:33:46 +0100
+Subject: x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 5f26d76c3fd67c48806415ef8b1116c97beff8ba upstream.
+
+CONFIG_PAGE_TABLE_ISOLATION is a relatively new and intrusive feature that may
+still have some corner cases which could take some time to manifest and be
+fixed. It would be useful to have Oops messages indicate whether it was
+enabled for building the kernel, and whether it was disabled during boot.
+
+Example of fully enabled:
+
+ Oops: 0001 [#1] SMP PTI
+
+Example of enabled during build, but disabled during boot:
+
+ Oops: 0001 [#1] SMP NOPTI
+
+We can decide to remove this after the feature has been tested in the field
+long enough.
+
+[ tglx: Made it use boot_cpu_has() as requested by Borislav ]
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Eduardo Valentin <eduval@amazon.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Andy Lutomirsky <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: bpetkov@suse.de
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: jkosina@suse.cz
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/dumpstack.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_reg
+ unsigned long sp;
+ #endif
+ printk(KERN_DEFAULT
+- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
++ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+ IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+ debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
++ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
++ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
++ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
--- /dev/null
+From 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:46 +0100
+Subject: x86/entry: Align entry text section to PMD boundary
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2 upstream.
+
+The (irq)entry text must be visible in the user space page tables. To allow
+simple PMD based sharing, make the entry text PMD aligned.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/vmlinux.lds.S | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/arch/x86/kernel/vmlinux.lds.S
++++ b/arch/x86/kernel/vmlinux.lds.S
+@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
+ . = ALIGN(HPAGE_SIZE); \
+ __end_rodata_hpage_align = .;
+
++#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
++#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
++
+ #else
+
+ #define X64_ALIGN_RODATA_BEGIN
+ #define X64_ALIGN_RODATA_END
+
++#define ALIGN_ENTRY_TEXT_BEGIN
++#define ALIGN_ENTRY_TEXT_END
++
+ #endif
+
+ PHDRS {
+@@ -102,8 +108,10 @@ SECTIONS
+ CPUIDLE_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
++ ALIGN_ENTRY_TEXT_BEGIN
+ ENTRY_TEXT
+ IRQENTRY_TEXT
++ ALIGN_ENTRY_TEXT_END
+ SOFTIRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
--- /dev/null
+From c1961a4631daef4aeabee8e368b1b13e8f173c91 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Mon, 4 Dec 2017 15:07:50 +0100
+Subject: x86/events/intel/ds: Map debug buffers in cpu_entry_area
+
+From: Hugh Dickins <hughd@google.com>
+
+commit c1961a4631daef4aeabee8e368b1b13e8f173c91 upstream.
+
+The BTS and PEBS buffers both have their virtual addresses programmed into
+the hardware. This means that any access to them is performed via the page
+tables. The times that the hardware accesses these are entirely dependent
+on how the performance monitoring hardware events are set up. In other
+words, there is no way for the kernel to tell when the hardware might
+access these buffers.
+
+To avoid perf crashes, place 'debug_store' in the cpu_entry_area, and allocate
+pages for the BTS and PEBS buffers and map them into the cpu_entry_area.
+
+The PEBS fixup buffer does not need this treatment.
+
+[ tglx: Got rid of the kaiser_add_mapping() complication ]
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/events/intel/ds.c | 125 +++++++++++++++++++++++++++----------------
+ arch/x86/events/perf_event.h | 2
+ 2 files changed, 82 insertions(+), 45 deletions(-)
+
+--- a/arch/x86/events/intel/ds.c
++++ b/arch/x86/events/intel/ds.c
+@@ -3,6 +3,7 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++#include <asm/cpu_entry_area.h>
+ #include <asm/perf_event.h>
+ #include <asm/insn.h>
+
+@@ -280,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
+
+ static DEFINE_PER_CPU(void *, insn_buffer);
+
+-static int alloc_pebs_buffer(int cpu)
++static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
+ {
+- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
++ phys_addr_t pa;
++ size_t msz = 0;
++
++ pa = virt_to_phys(addr);
++ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
++ cea_set_pte(cea, pa, prot);
++}
++
++static void ds_clear_cea(void *cea, size_t size)
++{
++ size_t msz = 0;
++
++ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
++ cea_set_pte(cea, 0, PAGE_NONE);
++}
++
++static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
++{
++ unsigned int order = get_order(size);
+ int node = cpu_to_node(cpu);
+- int max;
+- void *buffer, *ibuffer;
++ struct page *page;
++
++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
++ return page ? page_address(page) : NULL;
++}
++
++static void dsfree_pages(const void *buffer, size_t size)
++{
++ if (buffer)
++ free_pages((unsigned long)buffer, get_order(size));
++}
++
++static int alloc_pebs_buffer(int cpu)
++{
++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
++ struct debug_store *ds = hwev->ds;
++ size_t bsiz = x86_pmu.pebs_buffer_size;
++ int max, node = cpu_to_node(cpu);
++ void *buffer, *ibuffer, *cea;
+
+ if (!x86_pmu.pebs)
+ return 0;
+
+- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
++ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+@@ -301,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!ibuffer) {
+- kfree(buffer);
++ dsfree_pages(buffer, bsiz);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = ibuffer;
+ }
+-
+- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+-
+- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
++ hwev->ds_pebs_vaddr = buffer;
++ /* Update the cpu entry area mapping */
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
++ ds->pebs_buffer_base = (unsigned long) cea;
++ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
+ ds->pebs_index = ds->pebs_buffer_base;
+- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+- max * x86_pmu.pebs_record_size;
+-
++ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
++ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
+ return 0;
+ }
+
+ static void release_pebs_buffer(int cpu)
+ {
+- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
++ struct debug_store *ds = hwev->ds;
++ void *cea;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+@@ -327,73 +365,70 @@ static void release_pebs_buffer(int cpu)
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
+- kfree((void *)(unsigned long)ds->pebs_buffer_base);
++ /* Clear the fixmap */
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
++ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
+ ds->pebs_buffer_base = 0;
++ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
++ hwev->ds_pebs_vaddr = NULL;
+ }
+
+ static int alloc_bts_buffer(int cpu)
+ {
+- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+- int node = cpu_to_node(cpu);
+- int max, thresh;
+- void *buffer;
++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
++ struct debug_store *ds = hwev->ds;
++ void *buffer, *cea;
++ int max;
+
+ if (!x86_pmu.bts)
+ return 0;
+
+- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
++ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+ return -ENOMEM;
+ }
+-
+- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+- thresh = max / 16;
+-
+- ds->bts_buffer_base = (u64)(unsigned long)buffer;
++ hwev->ds_bts_vaddr = buffer;
++ /* Update the fixmap */
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
++ ds->bts_buffer_base = (unsigned long) cea;
++ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
+ ds->bts_index = ds->bts_buffer_base;
+- ds->bts_absolute_maximum = ds->bts_buffer_base +
+- max * BTS_RECORD_SIZE;
+- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+- thresh * BTS_RECORD_SIZE;
+-
++ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
++ ds->bts_absolute_maximum = ds->bts_buffer_base + max;
++ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
+ return 0;
+ }
+
+ static void release_bts_buffer(int cpu)
+ {
+- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
++ struct debug_store *ds = hwev->ds;
++ void *cea;
+
+ if (!ds || !x86_pmu.bts)
+ return;
+
+- kfree((void *)(unsigned long)ds->bts_buffer_base);
++ /* Clear the fixmap */
++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
++ ds_clear_cea(cea, BTS_BUFFER_SIZE);
+ ds->bts_buffer_base = 0;
++ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
++ hwev->ds_bts_vaddr = NULL;
+ }
+
+ static int alloc_ds_buffer(int cpu)
+ {
+- int node = cpu_to_node(cpu);
+- struct debug_store *ds;
+-
+- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+- if (unlikely(!ds))
+- return -ENOMEM;
++ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+
++ memset(ds, 0, sizeof(*ds));
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+-
+ return 0;
+ }
+
+ static void release_ds_buffer(int cpu)
+ {
+- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+-
+- if (!ds)
+- return;
+-
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+- kfree(ds);
+ }
+
+ void release_ds_buffers(void)
+--- a/arch/x86/events/perf_event.h
++++ b/arch/x86/events/perf_event.h
+@@ -199,6 +199,8 @@ struct cpu_hw_events {
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
++ void *ds_pebs_vaddr;
++ void *ds_bts_vaddr;
+ u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
--- /dev/null
+From 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 15 Dec 2017 20:35:11 +0100
+Subject: x86/ldt: Make the LDT mapping RO
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 upstream.
+
+Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is
+enabled, it's a primary target for attacks if a user space interface fails
+to validate a write address correctly. That can never happen, right?
+
+The SDM states:
+
+ If the segment descriptors in the GDT or an LDT are placed in ROM, the
+ processor can enter an indefinite loop if software or the processor
+ attempts to update (write to) the ROM-based segment descriptors. To
+ prevent this problem, set the accessed bits for all segment descriptors
+ placed in a ROM. Also, remove operating-system or executive code that
+ attempts to modify segment descriptors located in ROM.
+
+So it's a valid approach to set the ACCESS bit when setting up the LDT entry
+and to map the table RO. Fixup the selftest so it can handle that new mode.
+
+Remove the manual ACCESS bit setter in set_tls_desc() as this is now
+pointless. Folded the patch from Peter Zijlstra.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/desc.h | 2 ++
+ arch/x86/kernel/ldt.c | 7 ++++++-
+ arch/x86/kernel/tls.c | 11 ++---------
+ tools/testing/selftests/x86/ldt_gdt.c | 3 +--
+ 4 files changed, 11 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
++ /* Set the ACCESS bit so it can be mapped RO */
++ desc->type |= 1;
+
+ desc->s = 1;
+ desc->dpl = 0x3;
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -158,7 +158,12 @@ map_ldt_struct(struct mm_struct *mm, str
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+- pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
++ /*
++ * Map it RO so the easy to find address is not a primary
++ * target via some kernel interface which misses a
++ * permission check.
++ */
++ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+--- a/arch/x86/kernel/tls.c
++++ b/arch/x86/kernel/tls.c
+@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_str
+ cpu = get_cpu();
+
+ while (n-- > 0) {
+- if (LDT_empty(info) || LDT_zero(info)) {
++ if (LDT_empty(info) || LDT_zero(info))
+ memset(desc, 0, sizeof(*desc));
+- } else {
++ else
+ fill_ldt(desc, info);
+-
+- /*
+- * Always set the accessed bit so that the CPU
+- * doesn't try to write to the (read-only) GDT.
+- */
+- desc->type |= 1;
+- }
+ ++info;
+ ++desc;
+ }
+--- a/tools/testing/selftests/x86/ldt_gdt.c
++++ b/tools/testing/selftests/x86/ldt_gdt.c
+@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t
+ * NB: Different Linux versions do different things with the
+ * accessed bit in set_thread_area().
+ */
+- if (ar != expected_ar &&
+- (ldt || ar != (expected_ar | AR_ACCESSED))) {
++ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
+ printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+ nerrs++;
--- /dev/null
+From 9f449772a3106bcdd4eb8fdeb281147b0e99fb30 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 12 Dec 2017 07:56:44 -0800
+Subject: x86/mm/64: Make a full PGD-entry size hole in the memory map
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 9f449772a3106bcdd4eb8fdeb281147b0e99fb30 upstream.
+
+Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting
+at 0xff90000000000000 to be a full PGD entry.
+
+A subsequent patch will use this hole for the pagetable isolation LDT
+alias.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Kirill A. Shutemov <kirill@shutemov.name>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/x86/x86_64/mm.txt | 4 ++--
+ arch/x86/include/asm/pgtable_64_types.h | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/Documentation/x86/x86_64/mm.txt
++++ b/Documentation/x86/x86_64/mm.txt
+@@ -29,8 +29,8 @@ Virtual memory map with 5 level page tab
+ hole caused by [56:63] sign extension
+ ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
++ff90000000000000 - ff9fffffffffffff (=52 bits) hole
++ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
+ ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+ ... unused hole ...
+--- a/arch/x86/include/asm/pgtable_64_types.h
++++ b/arch/x86/include/asm/pgtable_64_types.h
+@@ -79,8 +79,8 @@ typedef struct { pteval_t pte; } pte_t;
+ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+
+ #ifdef CONFIG_X86_5LEVEL
+-# define VMALLOC_SIZE_TB _AC(16384, UL)
+-# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
++# define VMALLOC_SIZE_TB _AC(12800, UL)
++# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
+ # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+ #else
+ # define VMALLOC_SIZE_TB _AC(32, UL)
--- /dev/null
+From 48e111982cda033fec832c6b0592c2acedd85d04 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:58 +0100
+Subject: x86/mm: Abstract switching CR3
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 48e111982cda033fec832c6b0592c2acedd85d04 upstream.
+
+In preparation for adding additional PCID flushing, abstract the
+loading of a new ASID into CR3.
+
+[ PeterZ: Split out from big combo patch ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/tlb.c | 22 ++++++++++++++++++++--
+ 1 file changed, 20 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -100,6 +100,24 @@ static void choose_new_asid(struct mm_st
+ *need_flush = true;
+ }
+
++static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
++{
++ unsigned long new_mm_cr3;
++
++ if (need_flush) {
++ new_mm_cr3 = build_cr3(pgdir, new_asid);
++ } else {
++ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
++ }
++
++ /*
++ * Caution: many callers of this function expect
++ * that load_cr3() is serializing and orders TLB
++ * fills with respect to the mm_cpumask writes.
++ */
++ write_cr3(new_mm_cr3);
++}
++
+ void leave_mm(int cpu)
+ {
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+@@ -230,7 +248,7 @@ void switch_mm_irqs_off(struct mm_struct
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+- write_cr3(build_cr3(next->pgd, new_asid));
++ load_new_mm_cr3(next->pgd, new_asid, true);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+@@ -243,7 +261,7 @@ void switch_mm_irqs_off(struct mm_struct
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+- write_cr3(build_cr3_noflush(next->pgd, new_asid));
++ load_new_mm_cr3(next->pgd, new_asid, false);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
--- /dev/null
+From 2ea907c4fe7b78e5840c1dc07800eae93248cad1 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:57 +0100
+Subject: x86/mm: Allow flushing for future ASID switches
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 2ea907c4fe7b78e5840c1dc07800eae93248cad1 upstream.
+
+If changing the page tables in such a way that an invalidation of all
+contexts (aka. PCIDs / ASIDs) is required, they can be actively invalidated
+by:
+
+ 1. INVPCID for each PCID (works for single pages too).
+
+ 2. Load CR3 with each PCID without the NOFLUSH bit set
+
+ 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address.
+
+But, none of these are really feasible since there are ~6 ASIDs (12 with
+PAGE_TABLE_ISOLATION) at the time that invalidation is required.
+Instead of actively invalidating them, invalidate the *current* context and
+also mark the cpu_tlbstate _quickly_ to indicate future invalidation to be
+required.
+
+At the next context-switch, look for this indicator
+('invalidate_other' being set) and invalidate all of the
+cpu_tlbstate.ctxs[] entries.
+
+This ensures that any future context switches will do a full flush
+of the TLB, picking up the previous changes.
+
+[ tglx: Folded more fixups from Peter ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++++++--------
+ arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++++++++++
+ 2 files changed, 64 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -135,6 +135,17 @@ struct tlb_state {
+ bool is_lazy;
+
+ /*
++ * If set we changed the page tables in such a way that we
++ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
++ * This tells us to go invalidate all the non-loaded ctxs[]
++ * on the next context switch.
++ *
++ * The current ctx was kept up-to-date as it ran and does not
++ * need to be invalidated.
++ */
++ bool invalidate_other;
++
++ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+@@ -212,6 +223,14 @@ static inline unsigned long cr4_read_sha
+ }
+
+ /*
++ * Mark all other ASIDs as invalid, preserves the current.
++ */
++static inline void invalidate_other_asid(void)
++{
++ this_cpu_write(cpu_tlbstate.invalidate_other, true);
++}
++
++/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+@@ -298,14 +317,6 @@ static inline void __flush_tlb_all(void)
+ */
+ __flush_tlb();
+ }
+-
+- /*
+- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+- * we'd end up flushing kernel translations for the current ASID but
+- * we might fail to flush kernel translations for other cached ASIDs.
+- *
+- * To avoid this issue, we force PCID off if PGE is off.
+- */
+ }
+
+ /*
+@@ -315,6 +326,16 @@ static inline void __flush_tlb_one(unsig
+ {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ /*
++ * __flush_tlb_single() will have cleared the TLB entry for this ASID,
++ * but since kernel space is replicated across all, we must also
++ * invalidate all others.
++ */
++ invalidate_other_asid();
+ }
+
+ #define TLB_FLUSH_ALL -1UL
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -28,6 +28,38 @@
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
+ */
+
++/*
++ * We get here when we do something requiring a TLB invalidation
++ * but could not go invalidate all of the contexts. We do the
++ * necessary invalidation by clearing out the 'ctx_id' which
++ * forces a TLB flush when the context is loaded.
++ */
++void clear_asid_other(void)
++{
++ u16 asid;
++
++ /*
++ * This is only expected to be set if we have disabled
++ * kernel _PAGE_GLOBAL pages.
++ */
++ if (!static_cpu_has(X86_FEATURE_PTI)) {
++ WARN_ON_ONCE(1);
++ return;
++ }
++
++ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
++ /* Do not need to flush the current asid */
++ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
++ continue;
++ /*
++ * Make sure the next time we go to switch to
++ * this asid, we do a flush:
++ */
++ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
++ }
++ this_cpu_write(cpu_tlbstate.invalidate_other, false);
++}
++
+ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+
+@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_st
+ return;
+ }
+
++ if (this_cpu_read(cpu_tlbstate.invalidate_other))
++ clear_asid_other();
++
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+ next->context.ctx_id)
--- /dev/null
+From 0a126abd576ebc6403f063dbe20cf7416c9d9393 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 5 Dec 2017 13:34:53 +0100
+Subject: x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 0a126abd576ebc6403f063dbe20cf7416c9d9393 upstream.
+
+Ideally we'd also use sparse to enforce this separation so it becomes much
+more difficult to mess up.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/tlbflush.h | 55 +++++++++++++++++++++++++++++++---------
+ 1 file changed, 43 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -13,16 +13,33 @@
+ #include <asm/pti.h>
+ #include <asm/processor-flags.h>
+
+-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+-{
+- /*
+- * Bump the generation count. This also serves as a full barrier
+- * that synchronizes with switch_mm(): callers are required to order
+- * their read of mm_cpumask after their writes to the paging
+- * structures.
+- */
+- return atomic64_inc_return(&mm->context.tlb_gen);
+-}
++/*
++ * The x86 feature is called PCID (Process Context IDentifier). It is similar
++ * to what is traditionally called ASID on the RISC processors.
++ *
++ * We don't use the traditional ASID implementation, where each process/mm gets
++ * its own ASID and flush/restart when we run out of ASID space.
++ *
++ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
++ * that came by on this CPU, allowing cheaper switch_mm between processes on
++ * this CPU.
++ *
++ * We end up with different spaces for different things. To avoid confusion we
++ * use different names for each of them:
++ *
++ * ASID - [0, TLB_NR_DYN_ASIDS-1]
++ * the canonical identifier for an mm
++ *
++ * kPCID - [1, TLB_NR_DYN_ASIDS]
++ * the value we write into the PCID part of CR3; corresponds to the
++ * ASID+1, because PCID 0 is special.
++ *
++ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
++ * for KPTI each mm has two address spaces and thus needs two
++ * PCID values, but we can still do with a single ASID denomination
++ * for each mm. Corresponds to kPCID + 2048.
++ *
++ */
+
+ /* There are 12 bits of space for ASIDS in CR3 */
+ #define CR3_HW_ASID_BITS 12
+@@ -41,7 +58,7 @@ static inline u64 inc_mm_tlb_gen(struct
+
+ /*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+- * for them being zero-based. Another -1 is because ASID 0 is reserved for
++ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+ #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+@@ -52,6 +69,9 @@ static inline u64 inc_mm_tlb_gen(struct
+ */
+ #define TLB_NR_DYN_ASIDS 6
+
++/*
++ * Given @asid, compute kPCID
++ */
+ static inline u16 kern_pcid(u16 asid)
+ {
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+@@ -86,7 +106,7 @@ static inline u16 kern_pcid(u16 asid)
+ }
+
+ /*
+- * The user PCID is just the kernel one, plus the "switch bit".
++ * Given @asid, compute uPCID
+ */
+ static inline u16 user_pcid(u16 asid)
+ {
+@@ -484,6 +504,17 @@ static inline void flush_tlb_page(struct
+ void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info);
+
++static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
++{
++ /*
++ * Bump the generation count. This also serves as a full barrier
++ * that synchronizes with switch_mm(): callers are required to order
++ * their read of mm_cpumask after their writes to the paging
++ * structures.
++ */
++ return atomic64_inc_return(&mm->context.tlb_gen);
++}
++
+ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
+ {
--- /dev/null
+From 75298aa179d56cd64f54e58a19fffc8ab922b4c0 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Mon, 4 Dec 2017 15:08:04 +0100
+Subject: x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
+
+From: Borislav Petkov <bp@suse.de>
+
+commit 75298aa179d56cd64f54e58a19fffc8ab922b4c0 upstream.
+
+The upcoming support for dumping the kernel and the user space page tables
+of the current process would create more random files in the top level
+debugfs directory.
+
+Add a page table directory and move the existing file to it.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/debug_pagetables.c | 15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/mm/debug_pagetables.c
++++ b/arch/x86/mm/debug_pagetables.c
+@@ -22,21 +22,26 @@ static const struct file_operations ptdu
+ .release = single_release,
+ };
+
+-static struct dentry *pe;
++static struct dentry *dir, *pe;
+
+ static int __init pt_dump_debug_init(void)
+ {
+- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
+- &ptdump_fops);
+- if (!pe)
++ dir = debugfs_create_dir("page_tables", NULL);
++ if (!dir)
+ return -ENOMEM;
+
++ pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
++ if (!pe)
++ goto err;
+ return 0;
++err:
++ debugfs_remove_recursive(dir);
++ return -ENOMEM;
+ }
+
+ static void __exit pt_dump_debug_exit(void)
+ {
+- debugfs_remove_recursive(pe);
++ debugfs_remove_recursive(dir);
+ }
+
+ module_init(pt_dump_debug_init);
--- /dev/null
+From a4b51ef6552c704764684cef7e753162dc87c5fa Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:08:06 +0100
+Subject: x86/mm/dump_pagetables: Allow dumping current pagetables
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit a4b51ef6552c704764684cef7e753162dc87c5fa upstream.
+
+Add two debugfs files which allow to dump the pagetable of the current
+task.
+
+current_kernel dumps the regular page table. This is the page table which
+is normally shared between kernel and user space. If kernel page table
+isolation is enabled this is the kernel space mapping.
+
+If kernel page table isolation is enabled the second file, current_user,
+dumps the user space page table.
+
+These files allow to verify the resulting page tables for page table
+isolation, but even in the normal case it's useful to be able to inspect
+user space page tables of current for debugging purposes.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable.h | 2 -
+ arch/x86/mm/debug_pagetables.c | 71 ++++++++++++++++++++++++++++++++++++++---
+ arch/x86/mm/dump_pagetables.c | 6 ++-
+ 3 files changed, 73 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -28,7 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]
+ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
+ void ptdump_walk_pgd_level_checkwx(void);
+
+ #ifdef CONFIG_DEBUG_WX
+--- a/arch/x86/mm/debug_pagetables.c
++++ b/arch/x86/mm/debug_pagetables.c
+@@ -5,7 +5,7 @@
+
+ static int ptdump_show(struct seq_file *m, void *v)
+ {
+- ptdump_walk_pgd_level_debugfs(m, NULL);
++ ptdump_walk_pgd_level_debugfs(m, NULL, false);
+ return 0;
+ }
+
+@@ -22,7 +22,57 @@ static const struct file_operations ptdu
+ .release = single_release,
+ };
+
+-static struct dentry *dir, *pe;
++static int ptdump_show_curknl(struct seq_file *m, void *v)
++{
++ if (current->mm->pgd) {
++ down_read(&current->mm->mmap_sem);
++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
++ up_read(&current->mm->mmap_sem);
++ }
++ return 0;
++}
++
++static int ptdump_open_curknl(struct inode *inode, struct file *filp)
++{
++ return single_open(filp, ptdump_show_curknl, NULL);
++}
++
++static const struct file_operations ptdump_curknl_fops = {
++ .owner = THIS_MODULE,
++ .open = ptdump_open_curknl,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release,
++};
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++static struct dentry *pe_curusr;
++
++static int ptdump_show_curusr(struct seq_file *m, void *v)
++{
++ if (current->mm->pgd) {
++ down_read(&current->mm->mmap_sem);
++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
++ up_read(&current->mm->mmap_sem);
++ }
++ return 0;
++}
++
++static int ptdump_open_curusr(struct inode *inode, struct file *filp)
++{
++ return single_open(filp, ptdump_show_curusr, NULL);
++}
++
++static const struct file_operations ptdump_curusr_fops = {
++ .owner = THIS_MODULE,
++ .open = ptdump_open_curusr,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release,
++};
++#endif
++
++static struct dentry *dir, *pe_knl, *pe_curknl;
+
+ static int __init pt_dump_debug_init(void)
+ {
+@@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(voi
+ if (!dir)
+ return -ENOMEM;
+
+- pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
+- if (!pe)
++ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
++ &ptdump_fops);
++ if (!pe_knl)
++ goto err;
++
++ pe_curknl = debugfs_create_file("current_kernel", 0400,
++ dir, NULL, &ptdump_curknl_fops);
++ if (!pe_curknl)
++ goto err;
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ pe_curusr = debugfs_create_file("current_user", 0400,
++ dir, NULL, &ptdump_curusr_fops);
++ if (!pe_curusr)
+ goto err;
++#endif
+ return 0;
+ err:
+ debugfs_remove_recursive(dir);
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -530,8 +530,12 @@ void ptdump_walk_pgd_level(struct seq_fi
+ ptdump_walk_pgd_level_core(m, pgd, false, true);
+ }
+
+-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+ {
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ if (user && static_cpu_has(X86_FEATURE_PTI))
++ pgd = kernel_to_user_pgdp(pgd);
++#endif
+ ptdump_walk_pgd_level_core(m, pgd, false, false);
+ }
+ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
--- /dev/null
+From b4bf4f924b1d7bade38fd51b2e401d20d0956e4d Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:08:05 +0100
+Subject: x86/mm/dump_pagetables: Check user space page table for WX pages
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit b4bf4f924b1d7bade38fd51b2e401d20d0956e4d upstream.
+
+ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages,
+but does not check the PAGE_TABLE_ISOLATION user space page table.
+
+Restructure the code so that dmesg output is selected by an explicit
+argument and not implicit via checking the pgd argument for !NULL.
+
+Add the check for the user space page table.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable.h | 1 +
+ arch/x86/mm/debug_pagetables.c | 2 +-
+ arch/x86/mm/dump_pagetables.c | 30 +++++++++++++++++++++++++-----
+ 3 files changed, 27 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]
+ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
+ void ptdump_walk_pgd_level_checkwx(void);
+
+ #ifdef CONFIG_DEBUG_WX
+--- a/arch/x86/mm/debug_pagetables.c
++++ b/arch/x86/mm/debug_pagetables.c
+@@ -5,7 +5,7 @@
+
+ static int ptdump_show(struct seq_file *m, void *v)
+ {
+- ptdump_walk_pgd_level(m, NULL);
++ ptdump_walk_pgd_level_debugfs(m, NULL);
+ return 0;
+ }
+
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -476,7 +476,7 @@ static inline bool is_hypervisor_range(i
+ }
+
+ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+- bool checkwx)
++ bool checkwx, bool dmesg)
+ {
+ #ifdef CONFIG_X86_64
+ pgd_t *start = (pgd_t *) &init_top_pgt;
+@@ -489,7 +489,7 @@ static void ptdump_walk_pgd_level_core(s
+
+ if (pgd) {
+ start = pgd;
+- st.to_dmesg = true;
++ st.to_dmesg = dmesg;
+ }
+
+ st.check_wx = checkwx;
+@@ -527,13 +527,33 @@ static void ptdump_walk_pgd_level_core(s
+
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+ {
+- ptdump_walk_pgd_level_core(m, pgd, false);
++ ptdump_walk_pgd_level_core(m, pgd, false, true);
++}
++
++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
++{
++ ptdump_walk_pgd_level_core(m, pgd, false, false);
++}
++EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
++
++static void ptdump_walk_user_pgd_level_checkwx(void)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ pgd_t *pgd = (pgd_t *) &init_top_pgt;
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ pr_info("x86/mm: Checking user space page tables\n");
++ pgd = kernel_to_user_pgdp(pgd);
++ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
++#endif
+ }
+-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
+
+ void ptdump_walk_pgd_level_checkwx(void)
+ {
+- ptdump_walk_pgd_level_core(NULL, NULL, true);
++ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
++ ptdump_walk_user_pgd_level_checkwx();
+ }
+
+ static int __init pt_dump_init(void)
--- /dev/null
+From 21e94459110252d41b45c0c8ba50fd72a664d50c Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Mon, 4 Dec 2017 15:08:00 +0100
+Subject: x86/mm: Optimize RESTORE_CR3
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 21e94459110252d41b45c0c8ba50fd72a664d50c upstream.
+
+Most NMI/paranoid exceptions will not in fact change pagetables and would
+thus not require TLB flushing, however RESTORE_CR3 uses flushing CR3
+writes.
+
+Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the
+kernel mappings and now that we track which user PCIDs need flushing we can
+avoid those too when possible.
+
+This does mean RESTORE_CR3 needs an additional scratch_reg, luckily both
+sites have plenty available.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/calling.h | 30 ++++++++++++++++++++++++++++--
+ arch/x86/entry/entry_64.S | 4 ++--
+ 2 files changed, 30 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -281,8 +281,34 @@ For 32-bit we have the following convent
+ .Ldone_\@:
+ .endm
+
+-.macro RESTORE_CR3 save_reg:req
++.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
++
++ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
++
++ /*
++ * KERNEL pages can always resume with NOFLUSH as we do
++ * explicit flushes.
++ */
++ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
++ jnc .Lnoflush_\@
++
++ /*
++ * Check if there's a pending flush for the user ASID we're
++ * about to set.
++ */
++ movq \save_reg, \scratch_reg
++ andq $(0x7FF), \scratch_reg
++ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
++ jnc .Lnoflush_\@
++
++ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
++ jmp .Lwrcr3_\@
++
++.Lnoflush_\@:
++ SET_NOFLUSH_BIT \save_reg
++
++.Lwrcr3_\@:
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+@@ -301,7 +327,7 @@ For 32-bit we have the following convent
+ .endm
+ .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ .endm
+-.macro RESTORE_CR3 save_reg:req
++.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ .endm
+
+ #endif
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1288,7 +1288,7 @@ ENTRY(paranoid_exit)
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz .Lparanoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
+- RESTORE_CR3 save_reg=%r14
++ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
+ SWAPGS_UNSAFE_STACK
+ jmp .Lparanoid_exit_restore
+ .Lparanoid_exit_no_swapgs:
+@@ -1730,7 +1730,7 @@ end_repeat_nmi:
+ movq $-1, %rsi
+ call do_nmi
+
+- RESTORE_CR3 save_reg=%r14
++ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
--- /dev/null
+From 03f4424f348e8be95eb1bbeba09461cd7b867828 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:42 +0100
+Subject: x86/mm/pti: Add functions to clone kernel PMDs
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 03f4424f348e8be95eb1bbeba09461cd7b867828 upstream.
+
+Provide infrastructure to:
+
+ - find a kernel PMD for a mapping which must be visible to user space for
+ the entry/exit code to work.
+
+ - walk an address range and share the kernel PMD with it.
+
+This reuses a small part of the original KAISER patches to populate the
+user space page table.
+
+[ tglx: Made it universally usable so it can be used for any kind of shared
+ mapping. Add a mechanism to clear specific bits in the user space
+ visible PMD entry. Folded Andy's simplifications ]
+
+Originally-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 127 insertions(+)
+
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -48,6 +48,11 @@
+ #undef pr_fmt
+ #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
++/* Backporting helper */
++#ifndef __GFP_NOTRACK
++#define __GFP_NOTRACK 0
++#endif
++
+ static void __init pti_print_if_insecure(const char *reason)
+ {
+ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+@@ -138,6 +143,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pg
+ }
+
+ /*
++ * Walk the user copy of the page tables (optionally) trying to allocate
++ * page table pages on the way down.
++ *
++ * Returns a pointer to a P4D on success, or NULL on failure.
++ */
++static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
++{
++ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++
++ if (address < PAGE_OFFSET) {
++ WARN_ONCE(1, "attempt to walk user address\n");
++ return NULL;
++ }
++
++ if (pgd_none(*pgd)) {
++ unsigned long new_p4d_page = __get_free_page(gfp);
++ if (!new_p4d_page)
++ return NULL;
++
++ if (pgd_none(*pgd)) {
++ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
++ new_p4d_page = 0;
++ }
++ if (new_p4d_page)
++ free_page(new_p4d_page);
++ }
++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
++
++ return p4d_offset(pgd, address);
++}
++
++/*
++ * Walk the user copy of the page tables (optionally) trying to allocate
++ * page table pages on the way down.
++ *
++ * Returns a pointer to a PMD on success, or NULL on failure.
++ */
++static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
++{
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++ p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
++ pud_t *pud;
++
++ BUILD_BUG_ON(p4d_large(*p4d) != 0);
++ if (p4d_none(*p4d)) {
++ unsigned long new_pud_page = __get_free_page(gfp);
++ if (!new_pud_page)
++ return NULL;
++
++ if (p4d_none(*p4d)) {
++ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
++ new_pud_page = 0;
++ }
++ if (new_pud_page)
++ free_page(new_pud_page);
++ }
++
++ pud = pud_offset(p4d, address);
++ /* The user page tables do not use large mappings: */
++ if (pud_large(*pud)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pud_none(*pud)) {
++ unsigned long new_pmd_page = __get_free_page(gfp);
++ if (!new_pmd_page)
++ return NULL;
++
++ if (pud_none(*pud)) {
++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++ new_pmd_page = 0;
++ }
++ if (new_pmd_page)
++ free_page(new_pmd_page);
++ }
++
++ return pmd_offset(pud, address);
++}
++
++static void __init
++pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
++{
++ unsigned long addr;
++
++ /*
++ * Clone the populated PMDs which cover start to end. These PMD areas
++ * can have holes.
++ */
++ for (addr = start; addr < end; addr += PMD_SIZE) {
++ pmd_t *pmd, *target_pmd;
++ pgd_t *pgd;
++ p4d_t *p4d;
++ pud_t *pud;
++
++ pgd = pgd_offset_k(addr);
++ if (WARN_ON(pgd_none(*pgd)))
++ return;
++ p4d = p4d_offset(pgd, addr);
++ if (WARN_ON(p4d_none(*p4d)))
++ return;
++ pud = pud_offset(p4d, addr);
++ if (pud_none(*pud))
++ continue;
++ pmd = pmd_offset(pud, addr);
++ if (pmd_none(*pmd))
++ continue;
++
++ target_pmd = pti_user_pagetable_walk_pmd(addr);
++ if (WARN_ON(!target_pmd))
++ return;
++
++ /*
++ * Copy the PMD. That is, the kernelmode and usermode
++ * tables will share the last-level page tables of this
++ * address range
++ */
++ *target_pmd = pmd_clear_flags(*pmd, clear);
++ }
++}
++
++/*
+ * Initialize kernel page table isolation
+ */
+ void __init pti_init(void)
--- /dev/null
+From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:36 +0100
+Subject: x86/mm/pti: Add infrastructure for page table isolation
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 upstream.
+
+Add the initial files for kernel page table isolation, with a minimal init
+function and the boot time detection for this misfeature.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/admin-guide/kernel-parameters.txt | 2
+ arch/x86/boot/compressed/pagetable.c | 3
+ arch/x86/entry/calling.h | 7 ++
+ arch/x86/include/asm/pti.h | 14 ++++
+ arch/x86/mm/Makefile | 7 +-
+ arch/x86/mm/init.c | 2
+ arch/x86/mm/pti.c | 84 ++++++++++++++++++++++++
+ include/linux/pti.h | 11 +++
+ init/main.c | 3
+ 9 files changed, 130 insertions(+), 3 deletions(-)
+
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -2685,6 +2685,8 @@
+ steal time is computed, but won't influence scheduler
+ behaviour
+
++ nopti [X86-64] Disable kernel page table isolation
++
+ nolapic [X86-32,APIC] Do not enable or use the local APIC.
+
+ nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
+--- a/arch/x86/boot/compressed/pagetable.c
++++ b/arch/x86/boot/compressed/pagetable.c
+@@ -23,6 +23,9 @@
+ */
+ #undef CONFIG_AMD_MEM_ENCRYPT
+
++/* No PAGE_TABLE_ISOLATION support needed either: */
++#undef CONFIG_PAGE_TABLE_ISOLATION
++
+ #include "misc.h"
+
+ /* These actually do the work of building the kernel identity maps. */
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -205,18 +205,23 @@ For 32-bit we have the following convent
+ .endm
+
+ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
++.Lend_\@:
+ .endm
+
+ .macro SWITCH_TO_USER_CR3 scratch_reg:req
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_USER_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
++.Lend_\@:
+ .endm
+
+ .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
++ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+@@ -233,11 +238,13 @@ For 32-bit we have the following convent
+ .endm
+
+ .macro RESTORE_CR3 save_reg:req
++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movq \save_reg, %cr3
++.Lend_\@:
+ .endm
+
+ #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+--- /dev/null
++++ b/arch/x86/include/asm/pti.h
+@@ -0,0 +1,14 @@
++// SPDX-License-Identifier: GPL-2.0
++#ifndef _ASM_X86_PTI_H
++#define _ASM_X86_PTI_H
++#ifndef __ASSEMBLY__
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern void pti_init(void);
++extern void pti_check_boottime_disable(void);
++#else
++static inline void pti_check_boottime_disable(void) { }
++#endif
++
++#endif /* __ASSEMBLY__ */
++#endif /* _ASM_X86_PTI_H */
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+ obj-$(CONFIG_ACPI_NUMA) += srat.o
+ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
+
+-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
++obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
++obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+
+ obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+ obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -20,6 +20,7 @@
+ #include <asm/kaslr.h>
+ #include <asm/hypervisor.h>
+ #include <asm/cpufeature.h>
++#include <asm/pti.h>
+
+ /*
+ * We need to define the tracepoints somewhere, and tlb.c
+@@ -630,6 +631,7 @@ void __init init_mem_mapping(void)
+ {
+ unsigned long end;
+
++ pti_check_boottime_disable();
+ probe_page_size_mask();
+ setup_pcid();
+
+--- /dev/null
++++ b/arch/x86/mm/pti.c
+@@ -0,0 +1,84 @@
++/*
++ * Copyright(c) 2017 Intel Corporation. All rights reserved.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * This code is based in part on work published here:
++ *
++ * https://github.com/IAIK/KAISER
++ *
++ * The original work was written and signed off for the Linux
++ * kernel by:
++ *
++ * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
++ * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
++ * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
++ * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
++ *
++ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
++ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
++ * Andy Lutomirsky <luto@amacapital.net>
++ */
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/bug.h>
++#include <linux/init.h>
++#include <linux/spinlock.h>
++#include <linux/mm.h>
++#include <linux/uaccess.h>
++
++#include <asm/cpufeature.h>
++#include <asm/hypervisor.h>
++#include <asm/cmdline.h>
++#include <asm/pti.h>
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/tlbflush.h>
++#include <asm/desc.h>
++
++#undef pr_fmt
++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
++
++static void __init pti_print_if_insecure(const char *reason)
++{
++ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
++ pr_info("%s\n", reason);
++}
++
++void __init pti_check_boottime_disable(void)
++{
++ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
++ pti_print_if_insecure("disabled on XEN PV.");
++ return;
++ }
++
++ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
++ pti_print_if_insecure("disabled on command line.");
++ return;
++ }
++
++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
++ return;
++
++ setup_force_cpu_cap(X86_FEATURE_PTI);
++}
++
++/*
++ * Initialize kernel page table isolation
++ */
++void __init pti_init(void)
++{
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ pr_info("enabled\n");
++}
+--- /dev/null
++++ b/include/linux/pti.h
+@@ -0,0 +1,11 @@
++// SPDX-License-Identifier: GPL-2.0
++#ifndef _INCLUDE_PTI_H
++#define _INCLUDE_PTI_H
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++#include <asm/pti.h>
++#else
++static inline void pti_init(void) { }
++#endif
++
++#endif
+--- a/init/main.c
++++ b/init/main.c
+@@ -75,6 +75,7 @@
+ #include <linux/slab.h>
+ #include <linux/perf_event.h>
+ #include <linux/ptrace.h>
++#include <linux/pti.h>
+ #include <linux/blkdev.h>
+ #include <linux/elevator.h>
+ #include <linux/sched_clock.h>
+@@ -506,6 +507,8 @@ static void __init mm_init(void)
+ ioremap_huge_init();
+ /* Should be run before the first non-init thread is created */
+ init_espfix_bsp();
++ /* Should be run after espfix64 is set up. */
++ pti_init();
+ }
+
+ asmlinkage __visible void __init start_kernel(void)
--- /dev/null
+From 385ce0ea4c078517fa51c261882c4e72fba53005 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:08:03 +0100
+Subject: x86/mm/pti: Add Kconfig
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 385ce0ea4c078517fa51c261882c4e72fba53005 upstream.
+
+Finally allow CONFIG_PAGE_TABLE_ISOLATION to be enabled.
+
+PARAVIRT generally requires that the kernel not manage its own page tables.
+It also means that the hypervisor and kernel must agree wholeheartedly
+about what format the page tables are in and what they contain.
+PAGE_TABLE_ISOLATION, unfortunately, changes the rules and they
+can not be used together.
+
+I've seen conflicting feedback from maintainers lately about whether they
+want the Kconfig magic to go first or last in a patch series. It's going
+last here because the partially-applied series leads to kernels that can
+not boot in a bunch of cases. I did a run through the entire series with
+CONFIG_PAGE_TABLE_ISOLATION=y to look for build errors, though.
+
+[ tglx: Removed SMP and !PARAVIRT dependencies as they no longer exist ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ security/Kconfig | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -54,6 +54,16 @@ config SECURITY_NETWORK
+ implement socket and networking access controls.
+ If you are unsure how to answer this question, answer N.
+
++config PAGE_TABLE_ISOLATION
++ bool "Remove the kernel mapping in user mode"
++ depends on X86_64 && !UML
++ help
++ This feature reduces the number of hardware side channels by
++ ensuring that the majority of kernel addresses are not mapped
++ into userspace.
++
++ See Documentation/x86/pagetable-isolation.txt for more details.
++
+ config SECURITY_INFINIBAND
+ bool "Infiniband Security Hooks"
+ depends on SECURITY && INFINIBAND
--- /dev/null
+From 61e9b3671007a5da8127955a1a3bda7e0d5f42e8 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:37 +0100
+Subject: x86/mm/pti: Add mapping helper functions
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 61e9b3671007a5da8127955a1a3bda7e0d5f42e8 upstream.
+
+Add the pagetable helper functions to manage the separate user space page
+tables.
+
+[ tglx: Split out from the big combo kaiser patch. Folded Andys
+ simplification and made it out of line as Boris suggested ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable.h | 6 ++
+ arch/x86/include/asm/pgtable_64.h | 92 ++++++++++++++++++++++++++++++++++++++
+ arch/x86/mm/pti.c | 41 ++++++++++++++++
+ 3 files changed, 138 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -909,7 +909,11 @@ static inline int pgd_none(pgd_t pgd)
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
+ */
+-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
++#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
++/*
++ * a shortcut to get a pgd_t in a given mm
++ */
++#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
+ /*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_
+ #endif
+ }
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
++ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
++ * the user one is in the last 4k. To switch between them, you
++ * just need to flip the 12th bit in their addresses.
++ */
++#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
++
++/*
++ * This generates better code than the inline assembly in
++ * __set_bit().
++ */
++static inline void *ptr_set_bit(void *ptr, int bit)
++{
++ unsigned long __ptr = (unsigned long)ptr;
++
++ __ptr |= BIT(bit);
++ return (void *)__ptr;
++}
++static inline void *ptr_clear_bit(void *ptr, int bit)
++{
++ unsigned long __ptr = (unsigned long)ptr;
++
++ __ptr &= ~BIT(bit);
++ return (void *)__ptr;
++}
++
++static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
++{
++ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
++}
++
++static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
++{
++ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
++}
++
++static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
++{
++ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
++}
++
++static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
++{
++ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
++}
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
++/*
++ * Page table pages are page-aligned. The lower half of the top
++ * level is used for userspace and the top half for the kernel.
++ *
++ * Returns true for parts of the PGD that map userspace and
++ * false for the parts that map the kernel.
++ */
++static inline bool pgdp_maps_userspace(void *__ptr)
++{
++ unsigned long ptr = (unsigned long)__ptr;
++
++ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
++}
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
++
++/*
++ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
++ * Populates the user and returns the resulting PGD that must be set in
++ * the kernel copy of the page tables.
++ */
++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return pgd;
++ return __pti_set_user_pgd(pgdp, pgd);
++}
++#else
++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ return pgd;
++}
++#endif
++
+ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+ {
++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
++ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
++#else
+ *p4dp = p4d;
++#endif
+ }
+
+ static inline void native_p4d_clear(p4d_t *p4d)
+@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_
+
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ *pgdp = pti_set_user_pgd(pgdp, pgd);
++#else
+ *pgdp = pgd;
++#endif
+ }
+
+ static inline void native_pgd_clear(pgd_t *pgd)
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -96,6 +96,47 @@ enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+ }
+
++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++ /*
++ * Changes to the high (kernel) portion of the kernelmode page
++ * tables are not automatically propagated to the usermode tables.
++ *
++ * Users should keep in mind that, unlike the kernelmode tables,
++ * there is no vmalloc_fault equivalent for the usermode tables.
++ * Top-level entries added to init_mm's usermode pgd after boot
++ * will not be automatically propagated to other mms.
++ */
++ if (!pgdp_maps_userspace(pgdp))
++ return pgd;
++
++ /*
++ * The user page tables get the full PGD, accessible from
++ * userspace:
++ */
++ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
++
++ /*
++ * If this is normal user memory, make it NX in the kernel
++ * pagetables so that, if we somehow screw up and return to
++ * usermode with the kernel CR3 loaded, we'll get a page fault
++ * instead of allowing user code to execute with the wrong CR3.
++ *
++ * As exceptions, we don't set NX if:
++ * - _PAGE_USER is not set. This could be an executable
++ * EFI runtime mapping or something similar, and the kernel
++ * may execute from it
++ * - we don't have NX support
++ * - we're clearing the PGD (i.e. the new pgd is not present).
++ */
++ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
++ (__supported_pte_mask & _PAGE_NX))
++ pgd.pgd |= _PAGE_NX;
++
++ /* return the copy of the PGD we want the kernel to use: */
++ return pgd;
++}
++
+ /*
+ * Initialize kernel page table isolation
+ */
--- /dev/null
+From d9e9a6418065bb376e5de8d93ce346939b9a37a6 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:39 +0100
+Subject: x86/mm/pti: Allocate a separate user PGD
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit d9e9a6418065bb376e5de8d93ce346939b9a37a6 upstream.
+
+Kernel page table isolation requires two PGDs. One for the kernel,
+which contains the full kernel mapping plus the user space mapping and one
+for user space which contains the user space mappings and the minimal set
+of kernel mappings which are required by the architecture to be able to
+transition from and to user space.
+
+Add the necessary preliminaries.
+
+[ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgalloc.h | 11 +++++++++++
+ arch/x86/kernel/head_64.S | 30 +++++++++++++++++++++++++++---
+ arch/x86/mm/pgtable.c | 5 +++--
+ arch/x86/platform/efi/efi_64.c | 5 ++++-
+ 4 files changed, 45 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/include/asm/pgalloc.h
++++ b/arch/x86/include/asm/pgalloc.h
+@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(
+ */
+ extern gfp_t __userpte_alloc_gfp;
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
++ * in a pointer to swap between the two 4k halves.
++ */
++#define PGD_ALLOCATION_ORDER 1
++#else
++#define PGD_ALLOCATION_ORDER 0
++#endif
++
+ /*
+ * Allocate and free page tables.
+ */
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
+ .balign PAGE_SIZE; \
+ GLOBAL(name)
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Each PGD needs to be 8k long and 8k aligned. We do not
++ * ever go out to userspace with these, so we do not
++ * strictly *need* the second page, but this allows us to
++ * have a single set_pgd() implementation that does not
++ * need to worry about whether it has 4k or 8k to work
++ * with.
++ *
++ * This ensures PGDs are 8k long:
++ */
++#define PTI_USER_PGD_FILL 512
++/* This ensures they are 8k-aligned: */
++#define NEXT_PGD_PAGE(name) \
++ .balign 2 * PAGE_SIZE; \
++GLOBAL(name)
++#else
++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
++#define PTI_USER_PGD_FILL 0
++#endif
++
+ /* Automate the creation of 1 to 1 mapping pmd entries */
+ #define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+@@ -350,13 +371,14 @@ GLOBAL(name)
+ .endr
+
+ __INITDATA
+-NEXT_PAGE(early_top_pgt)
++NEXT_PGD_PAGE(early_top_pgt)
+ .fill 511,8,0
+ #ifdef CONFIG_X86_5LEVEL
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ #else
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ #endif
++ .fill PTI_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
+ .data
+
+ #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
+-NEXT_PAGE(init_top_pgt)
++NEXT_PGD_PAGE(init_top_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + PGD_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
++ .fill PTI_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ #else
+-NEXT_PAGE(init_top_pgt)
++NEXT_PGD_PAGE(init_top_pgt)
+ .fill 512,8,0
++ .fill PTI_USER_PGD_FILL,8,0
+ #endif
+
+ #ifdef CONFIG_X86_5LEVEL
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
+ kmem_cache_free(pgd_cache, pgd);
+ }
+ #else
++
+ static inline pgd_t *_pgd_alloc(void)
+ {
+- return (pgd_t *)__get_free_page(PGALLOC_GFP);
++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+ }
+
+ static inline void _pgd_free(pgd_t *pgd)
+ {
+- free_page((unsigned long)pgd);
++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ }
+ #endif /* CONFIG_X86_PAE */
+
+--- a/arch/x86/platform/efi/efi_64.c
++++ b/arch/x86/platform/efi/efi_64.c
+@@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
+ * because we want to avoid inserting EFI region mappings (EFI_VA_END
+ * to EFI_VA_START) into the standard kernel page tables. Everything
+ * else can be shared, see efi_sync_low_kernel_mappings().
++ *
++ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
++ * allocation.
+ */
+ int __init efi_alloc_page_tables(void)
+ {
+@@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
+ return 0;
+
+ gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
++ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
+ if (!efi_pgd)
+ return -ENOMEM;
+
--- /dev/null
+From 1c4de1ff4fe50453b968579ee86fac3da80dd783 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:38 +0100
+Subject: x86/mm/pti: Allow NX poison to be set in p4d/pgd
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 1c4de1ff4fe50453b968579ee86fac3da80dd783 upstream.
+
+With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is
+poisoned with the NX bit so if the entry code exits with the kernel page
+tables selected in CR3, userspace crashes.
+
+But doing so trips the p4d/pgd_bad() checks. Make sure it does not do
+that.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable.h | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -846,7 +846,12 @@ static inline pud_t *pud_offset(p4d_t *p
+
+ static inline int p4d_bad(p4d_t p4d)
+ {
+- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
++ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
++
++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
++ ignore_flags |= _PAGE_NX;
++
++ return (p4d_flags(p4d) & ~ignore_flags) != 0;
+ }
+ #endif /* CONFIG_PGTABLE_LEVELS > 3 */
+
+@@ -880,7 +885,12 @@ static inline p4d_t *p4d_offset(pgd_t *p
+
+ static inline int pgd_bad(pgd_t pgd)
+ {
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ unsigned long ignore_flags = _PAGE_USER;
++
++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
++ ignore_flags |= _PAGE_NX;
++
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+
+ static inline int pgd_none(pgd_t pgd)
--- /dev/null
+From c313ec66317d421fb5768d78c56abed2dc862264 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:34 +0100
+Subject: x86/mm/pti: Disable global pages if PAGE_TABLE_ISOLATION=y
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit c313ec66317d421fb5768d78c56abed2dc862264 upstream.
+
+Global pages stay in the TLB across context switches. Since all contexts
+share the same kernel mapping, these mappings are marked as global pages
+so kernel entries in the TLB are not flushed out on a context switch.
+
+But, even having these entries in the TLB opens up something that an
+attacker can use, such as the double-page-fault attack:
+
+ http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
+
+That means that even when PAGE_TABLE_ISOLATION switches page tables
+on return to user space the global pages would stay in the TLB cache.
+
+Disable global pages so that kernel TLB entries can be flushed before
+returning to user space. This way, all accesses to kernel addresses from
+userspace result in a TLB miss independent of the existence of a kernel
+mapping.
+
+Suppress global pages via the __supported_pte_mask. The user space
+mappings set PAGE_GLOBAL for the minimal kernel mappings which are
+required for entry/exit. These mappings are set up manually so the
+filtering does not take place.
+
+[ The __supported_pte_mask simplification was written by Thomas Gleixner. ]
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/init.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -161,6 +161,12 @@ struct map_range {
+
+ static int page_size_mask;
+
++static void enable_global_pages(void)
++{
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ __supported_pte_mask |= _PAGE_GLOBAL;
++}
++
+ static void __init probe_page_size_mask(void)
+ {
+ /*
+@@ -179,11 +185,11 @@ static void __init probe_page_size_mask(
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+ /* Enable PGE if available */
++ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
+- __supported_pte_mask |= _PAGE_GLOBAL;
+- } else
+- __supported_pte_mask &= ~_PAGE_GLOBAL;
++ enable_global_pages();
++ }
+
+ /* Enable 1 GB linear kernel mappings if available: */
+ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
--- /dev/null
+From 8d4b067895791ab9fdb1aadfc505f64d71239dd2 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:43 +0100
+Subject: x86/mm/pti: Force entry through trampoline when PTI active
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 8d4b067895791ab9fdb1aadfc505f64d71239dd2 upstream.
+
+Force the entry through the trampoline only when PTI is active. Otherwise
+go through the normal entry code.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/common.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1339,7 +1339,10 @@ void syscall_init(void)
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
++ if (static_cpu_has(X86_FEATURE_PTI))
++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
++ else
++ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+ #ifdef CONFIG_IA32_EMULATION
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
--- /dev/null
+From 4b6bbe95b87966ba08999574db65c93c5e925a36 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Fri, 15 Dec 2017 22:08:18 +0100
+Subject: x86/mm/pti: Map ESPFIX into user space
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 4b6bbe95b87966ba08999574db65c93c5e925a36 upstream.
+
+Map the ESPFIX pages into user space when PTI is enabled.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/pti.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -288,6 +288,16 @@ static void __init pti_clone_user_shared
+ }
+
+ /*
++ * Clone the ESPFIX P4D into the user space visinble page table
++ */
++static void __init pti_setup_espfix64(void)
++{
++#ifdef CONFIG_X86_ESPFIX64
++ pti_clone_p4d(ESPFIX_BASE_ADDR);
++#endif
++}
++
++/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+ static void __init pti_clone_entry_text(void)
+@@ -308,4 +318,5 @@ void __init pti_init(void)
+
+ pti_clone_user_shared();
+ pti_clone_entry_text();
++ pti_setup_espfix64();
+ }
--- /dev/null
+From fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:40 +0100
+Subject: x86/mm/pti: Populate user PGD
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2 upstream.
+
+In clone_pgd_range() copy the init user PGDs which cover the kernel half of
+the address space, so a process has all the required kernel mappings
+visible.
+
+[ tglx: Split out from the big kaiser dump and folded Andy's simplification ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable.h | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -1125,7 +1125,14 @@ static inline int pud_write(pud_t pud)
+ */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+- memcpy(dst, src, count * sizeof(pgd_t));
++ memcpy(dst, src, count * sizeof(pgd_t));
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++ /* Clone the user space pgd as well */
++ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
++ count * sizeof(pgd_t));
++#endif
+ }
+
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE)
--- /dev/null
+From 8a09317b895f073977346779df52f67c1056d81d Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:35 +0100
+Subject: x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 8a09317b895f073977346779df52f67c1056d81d upstream.
+
+PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it
+enters the kernel and switch back when it exits. This essentially needs to
+be done before leaving assembly code.
+
+This is extra challenging because the switching context is tricky: the
+registers that can be clobbered can vary. It is also hard to store things
+on the stack because there is an established ABI (ptregs) or the stack is
+entirely unsafe to use.
+
+Establish a set of macros that allow changing to the user and kernel CR3
+values.
+
+Interactions with SWAPGS:
+
+ Previous versions of the PAGE_TABLE_ISOLATION code relied on having
+ per-CPU scratch space to save/restore a register that can be used for the
+ CR3 MOV. The %GS register is used to index into our per-CPU space, so
+ SWAPGS *had* to be done before the CR3 switch. That scratch space is gone
+ now, but the semantic that SWAPGS must be done before the CR3 MOV is
+ retained. This is good to keep because it is not that hard to do and it
+ allows to do things like add per-CPU debugging information.
+
+What this does in the NMI code is worth pointing out. NMIs can interrupt
+*any* context and they can also be nested with NMIs interrupting other
+NMIs. The comments below ".Lnmi_from_kernel" explain the format of the
+stack during this situation. Changing the format of this stack is hard.
+Instead of storing the old CR3 value on the stack, this depends on the
+*regular* register save/restore mechanism and then uses %r14 to keep CR3
+during the NMI. It is callee-saved and will not be clobbered by the C NMI
+handlers that get called.
+
+[ PeterZ: ESPFIX optimization ]
+
+Based-on-code-from: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/calling.h | 66 +++++++++++++++++++++++++++++++++++++++
+ arch/x86/entry/entry_64.S | 45 +++++++++++++++++++++++---
+ arch/x86/entry/entry_64_compat.S | 24 +++++++++++++-
+ 3 files changed, 128 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -1,6 +1,8 @@
+ /* SPDX-License-Identifier: GPL-2.0 */
+ #include <linux/jump_label.h>
+ #include <asm/unwind_hints.h>
++#include <asm/cpufeatures.h>
++#include <asm/page_types.h>
+
+ /*
+
+@@ -187,6 +189,70 @@ For 32-bit we have the following convent
+ #endif
+ .endm
+
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++
++/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */
++#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
++
++.macro ADJUST_KERNEL_CR3 reg:req
++ /* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
++ andq $(~PTI_SWITCH_MASK), \reg
++.endm
++
++.macro ADJUST_USER_CR3 reg:req
++ /* Move CR3 up a page to the user page tables: */
++ orq $(PTI_SWITCH_MASK), \reg
++.endm
++
++.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
++ mov %cr3, \scratch_reg
++ ADJUST_KERNEL_CR3 \scratch_reg
++ mov \scratch_reg, %cr3
++.endm
++
++.macro SWITCH_TO_USER_CR3 scratch_reg:req
++ mov %cr3, \scratch_reg
++ ADJUST_USER_CR3 \scratch_reg
++ mov \scratch_reg, %cr3
++.endm
++
++.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
++ movq %cr3, \scratch_reg
++ movq \scratch_reg, \save_reg
++ /*
++ * Is the switch bit zero? This means the address is
++ * up in real PAGE_TABLE_ISOLATION patches in a moment.
++ */
++ testq $(PTI_SWITCH_MASK), \scratch_reg
++ jz .Ldone_\@
++
++ ADJUST_KERNEL_CR3 \scratch_reg
++ movq \scratch_reg, %cr3
++
++.Ldone_\@:
++.endm
++
++.macro RESTORE_CR3 save_reg:req
++ /*
++ * The CR3 write could be avoided when not changing its value,
++ * but would require a CR3 read *and* a scratch register.
++ */
++ movq \save_reg, %cr3
++.endm
++
++#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
++
++.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
++.endm
++.macro SWITCH_TO_USER_CR3 scratch_reg:req
++.endm
++.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
++.endm
++.macro RESTORE_CR3 save_reg:req
++.endm
++
++#endif
++
+ #endif /* CONFIG_X86_64 */
+
+ /*
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -164,6 +164,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
+ /* Stash the user RSP. */
+ movq %rsp, RSP_SCRATCH
+
++ /* Note: using %rsp as a scratch reg. */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
++
+ /* Load the top of the task stack into RSP */
+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+@@ -203,6 +206,10 @@ ENTRY(entry_SYSCALL_64)
+ */
+
+ swapgs
++ /*
++ * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
++ * is not required to switch CR3.
++ */
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+@@ -399,6 +406,7 @@ syscall_return_via_sysret:
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
++ SWITCH_TO_USER_CR3 scratch_reg=%rdi
+
+ popq %rdi
+ popq %rsp
+@@ -736,6 +744,8 @@ GLOBAL(swapgs_restore_regs_and_return_to
+ * We can do future final exit work right here.
+ */
+
++ SWITCH_TO_USER_CR3 scratch_reg=%rdi
++
+ /* Restore RDI. */
+ popq %rdi
+ SWAPGS
+@@ -818,7 +828,9 @@ native_irq_return_ldt:
+ */
+
+ pushq %rdi /* Stash user RDI */
+- SWAPGS
++ SWAPGS /* to kernel GS */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
++
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
+@@ -834,7 +846,6 @@ native_irq_return_ldt:
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+- popq %rdi /* Restore user RDI */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+@@ -845,7 +856,11 @@ native_irq_return_ldt:
+ * still points to an RO alias of the ESPFIX stack.
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
+- SWAPGS
++
++ SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */
++ SWAPGS /* to user GS */
++ popq %rdi /* Restore user RDI */
++
+ movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
+
+@@ -945,6 +960,8 @@ ENTRY(switch_to_thread_stack)
+ UNWIND_HINT_FUNC
+
+ pushq %rdi
++ /* Need to switch before accessing the thread stack. */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+@@ -1244,7 +1261,11 @@ ENTRY(paranoid_entry)
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+-1: ret
++
++1:
++ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
++
++ ret
+ END(paranoid_entry)
+
+ /*
+@@ -1266,6 +1287,7 @@ ENTRY(paranoid_exit)
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz .Lparanoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
++ RESTORE_CR3 save_reg=%r14
+ SWAPGS_UNSAFE_STACK
+ jmp .Lparanoid_exit_restore
+ .Lparanoid_exit_no_swapgs:
+@@ -1293,6 +1315,8 @@ ENTRY(error_entry)
+ * from user mode due to an IRET fault.
+ */
+ SWAPGS
++ /* We have user CR3. Change to kernel CR3. */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
+ .Lerror_entry_from_usermode_after_swapgs:
+ /* Put us onto the real thread stack. */
+@@ -1339,6 +1363,7 @@ ENTRY(error_entry)
+ * .Lgs_change's error handler with kernel gsbase.
+ */
+ SWAPGS
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ jmp .Lerror_entry_done
+
+ .Lbstep_iret:
+@@ -1348,10 +1373,11 @@ ENTRY(error_entry)
+
+ .Lerror_bad_iret:
+ /*
+- * We came from an IRET to user mode, so we have user gsbase.
+- * Switch to kernel gsbase:
++ * We came from an IRET to user mode, so we have user
++ * gsbase and CR3. Switch to kernel gsbase and CR3:
+ */
+ SWAPGS
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+@@ -1383,6 +1409,10 @@ END(error_exit)
+ /*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
++ *
++ * Registers:
++ * %r14: Used to save/restore the CR3 of the interrupted context
++ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ */
+ ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
+@@ -1446,6 +1476,7 @@ ENTRY(nmi)
+
+ swapgs
+ cld
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
+@@ -1698,6 +1729,8 @@ end_repeat_nmi:
+ movq $-1, %rsi
+ call do_nmi
+
++ RESTORE_CR3 save_reg=%r14
++
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
+ nmi_swapgs:
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -49,6 +49,10 @@
+ ENTRY(entry_SYSENTER_compat)
+ /* Interrupts are off on entry. */
+ SWAPGS
++
++ /* We are about to clobber %rsp anyway, clobbering here is OK */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
++
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /*
+@@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwfram
+ pushq $0 /* pt_regs->r15 = 0 */
+
+ /*
++ * We just saved %rdi so it is safe to clobber. It is not
++ * preserved during the C calls inside TRACE_IRQS_OFF anyway.
++ */
++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
++
++ /*
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
+ */
+@@ -256,10 +266,22 @@ sysret32_from_system_call:
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
++ movq RSP-ORIG_RAX(%rsp), %rsp
++
++ /*
++ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
++ * on the process stack which is not mapped to userspace and
++ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
++ * switch until after after the last reference to the process
++ * stack.
++ *
++ * %r8 is zeroed before the sysret, thus safe to clobber.
++ */
++ SWITCH_TO_USER_CR3 scratch_reg=%r8
++
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+- movq RSP-ORIG_RAX(%rsp), %rsp
+ swapgs
+ sysretl
+ END(entry_SYSCALL_compat)
--- /dev/null
+From f7cfbee91559ca7e3e961a00ffac921208a115ad Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:45 +0100
+Subject: x86/mm/pti: Share cpu_entry_area with user space page tables
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit f7cfbee91559ca7e3e961a00ffac921208a115ad upstream.
+
+Share the cpu entry area so the user space and kernel space page tables
+have the same P4D page.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++
+ 1 file changed, 25 insertions(+)
+
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -265,6 +265,29 @@ pti_clone_pmds(unsigned long start, unsi
+ }
+
+ /*
++ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
++ * next-level entry on 5-level systems.
++ */
++static void __init pti_clone_p4d(unsigned long addr)
++{
++ p4d_t *kernel_p4d, *user_p4d;
++ pgd_t *kernel_pgd;
++
++ user_p4d = pti_user_pagetable_walk_p4d(addr);
++ kernel_pgd = pgd_offset_k(addr);
++ kernel_p4d = p4d_offset(kernel_pgd, addr);
++ *user_p4d = *kernel_p4d;
++}
++
++/*
++ * Clone the CPU_ENTRY_AREA into the user space visible page table.
++ */
++static void __init pti_clone_user_shared(void)
++{
++ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
++}
++
++/*
+ * Initialize kernel page table isolation
+ */
+ void __init pti_init(void)
+@@ -273,4 +296,6 @@ void __init pti_init(void)
+ return;
+
+ pr_info("enabled\n");
++
++ pti_clone_user_shared();
+ }
--- /dev/null
+From 6dc72c3cbca0580642808d677181cad4c6433893 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:47 +0100
+Subject: x86/mm/pti: Share entry text PMD
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 6dc72c3cbca0580642808d677181cad4c6433893 upstream.
+
+Share the entry text PMD of the kernel mapping with the user space
+mapping. If large pages are enabled this is a single PMD entry and at the
+point where it is copied into the user page table the RW bit has not been
+cleared yet. Clear it right away so the user space visible map becomes RX.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/pti.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -288,6 +288,15 @@ static void __init pti_clone_user_shared
+ }
+
+ /*
++ * Clone the populated PMDs of the entry and irqentry text and force it RO.
++ */
++static void __init pti_clone_entry_text(void)
++{
++ pti_clone_pmds((unsigned long) __entry_text_start,
++ (unsigned long) __irqentry_text_end, _PAGE_RW);
++}
++
++/*
+ * Initialize kernel page table isolation
+ */
+ void __init pti_init(void)
+@@ -298,4 +307,5 @@ void __init pti_init(void)
+ pr_info("enabled\n");
+
+ pti_clone_user_shared();
++ pti_clone_entry_text();
+ }
--- /dev/null
+From 6fd166aae78c0ab738d49bda653cbd9e3b1491cf Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Mon, 4 Dec 2017 15:07:59 +0100
+Subject: x86/mm: Use/Fix PCID to optimize user/kernel switches
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 6fd166aae78c0ab738d49bda653cbd9e3b1491cf upstream.
+
+We can use PCID to retain TLB entries across CR3 switches, including those
+that are now part of the user/kernel switch. This increases performance of
+kernel entry/exit at the cost of more expensive/complicated TLB flushing.
+
+Now that we have two address spaces, one for kernel and one for user space,
+we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID
+(just like we use the PFN LSB for the PGD). Since we do TLB invalidation
+from kernel space, the existing code will only invalidate the kernel PCID.
+We augment that by marking the corresponding user PCID invalid and, upon
+switching back to userspace, using a flushing CR3 write for the switch.
+
+In order to access the user_pcid_flush_mask we use PER_CPU storage, which
+means the previously established SWAPGS vs CR3 ordering is now mandatory.
+
+This memory access requires an additional register. Most sites have a
+functioning stack, so we can spill one (RAX); sites without a functional
+stack must otherwise provide the second scratch register.
+
+Note: PCID is generally available on Intel Sandybridge and later CPUs.
+Note: Up until this point TLB flushing was broken in this series.
+
+Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/calling.h | 72 ++++++++++++++++++----
+ arch/x86/entry/entry_64.S | 9 +-
+ arch/x86/entry/entry_64_compat.S | 4 -
+ arch/x86/include/asm/processor-flags.h | 5 +
+ arch/x86/include/asm/tlbflush.h | 91 ++++++++++++++++++++++++----
+ arch/x86/include/uapi/asm/processor-flags.h | 7 +-
+ arch/x86/kernel/asm-offsets.c | 4 +
+ arch/x86/mm/init.c | 2
+ arch/x86/mm/tlb.c | 1
+ 9 files changed, 162 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -3,6 +3,9 @@
+ #include <asm/unwind_hints.h>
+ #include <asm/cpufeatures.h>
+ #include <asm/page_types.h>
++#include <asm/percpu.h>
++#include <asm/asm-offsets.h>
++#include <asm/processor-flags.h>
+
+ /*
+
+@@ -191,17 +194,21 @@ For 32-bit we have the following convent
+
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+-/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */
+-#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
++/*
++ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
++ * halves:
++ */
++#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
++#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+-.macro ADJUST_KERNEL_CR3 reg:req
+- /* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+- andq $(~PTI_SWITCH_MASK), \reg
++.macro SET_NOFLUSH_BIT reg:req
++ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+ .endm
+
+-.macro ADJUST_USER_CR3 reg:req
+- /* Move CR3 up a page to the user page tables: */
+- orq $(PTI_SWITCH_MASK), \reg
++.macro ADJUST_KERNEL_CR3 reg:req
++ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
++ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
++ andq $(~PTI_SWITCH_MASK), \reg
+ .endm
+
+ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+@@ -212,21 +219,58 @@ For 32-bit we have the following convent
+ .Lend_\@:
+ .endm
+
+-.macro SWITCH_TO_USER_CR3 scratch_reg:req
++#define THIS_CPU_user_pcid_flush_mask \
++ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
++
++.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+- ADJUST_USER_CR3 \scratch_reg
++
++ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
++
++ /*
++ * Test if the ASID needs a flush.
++ */
++ movq \scratch_reg, \scratch_reg2
++ andq $(0x7FF), \scratch_reg /* mask ASID */
++ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
++ jnc .Lnoflush_\@
++
++ /* Flush needed, clear the bit */
++ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
++ movq \scratch_reg2, \scratch_reg
++ jmp .Lwrcr3_\@
++
++.Lnoflush_\@:
++ movq \scratch_reg2, \scratch_reg
++ SET_NOFLUSH_BIT \scratch_reg
++
++.Lwrcr3_\@:
++ /* Flip the PGD and ASID to the user version */
++ orq $(PTI_SWITCH_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+ .Lend_\@:
+ .endm
+
++.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
++ pushq %rax
++ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
++ popq %rax
++.endm
++
+ .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+- * Is the switch bit zero? This means the address is
+- * up in real PAGE_TABLE_ISOLATION patches in a moment.
++ * Is the "switch mask" all zero? That means that both of
++ * these are zero:
++ *
++ * 1. The user/kernel PCID bit, and
++ * 2. The user/kernel "bit" that points CR3 to the
++ * bottom half of the 8k PGD
++ *
++ * That indicates a kernel CR3 value, not a user CR3.
+ */
+ testq $(PTI_SWITCH_MASK), \scratch_reg
+ jz .Ldone_\@
+@@ -251,7 +295,9 @@ For 32-bit we have the following convent
+
+ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ .endm
+-.macro SWITCH_TO_USER_CR3 scratch_reg:req
++.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
++.endm
++.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ .endm
+ .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ .endm
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -23,7 +23,6 @@
+ #include <asm/segment.h>
+ #include <asm/cache.h>
+ #include <asm/errno.h>
+-#include "calling.h"
+ #include <asm/asm-offsets.h>
+ #include <asm/msr.h>
+ #include <asm/unistd.h>
+@@ -40,6 +39,8 @@
+ #include <asm/frame.h>
+ #include <linux/err.h>
+
++#include "calling.h"
++
+ .code64
+ .section .entry.text, "ax"
+
+@@ -406,7 +407,7 @@ syscall_return_via_sysret:
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+- SWITCH_TO_USER_CR3 scratch_reg=%rdi
++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ popq %rdi
+ popq %rsp
+@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to
+ * We can do future final exit work right here.
+ */
+
+- SWITCH_TO_USER_CR3 scratch_reg=%rdi
++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ /* Restore RDI. */
+ popq %rdi
+@@ -857,7 +858,7 @@ native_irq_return_ldt:
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
+
+- SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */
++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ SWAPGS /* to user GS */
+ popq %rdi /* Restore user RDI */
+
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -275,9 +275,9 @@ sysret32_from_system_call:
+ * switch until after after the last reference to the process
+ * stack.
+ *
+- * %r8 is zeroed before the sysret, thus safe to clobber.
++ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+- SWITCH_TO_USER_CR3 scratch_reg=%r8
++ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
+ xorq %r8, %r8
+ xorq %r9, %r9
+--- a/arch/x86/include/asm/processor-flags.h
++++ b/arch/x86/include/asm/processor-flags.h
+@@ -38,6 +38,11 @@
+ #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
+ #define CR3_PCID_MASK 0xFFFull
+ #define CR3_NOFLUSH BIT_ULL(63)
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++# define X86_CR3_PTI_SWITCH_BIT 11
++#endif
++
+ #else
+ /*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -10,6 +10,8 @@
+ #include <asm/special_insns.h>
+ #include <asm/smp.h>
+ #include <asm/invpcid.h>
++#include <asm/pti.h>
++#include <asm/processor-flags.h>
+
+ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+ {
+@@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct
+
+ /* There are 12 bits of space for ASIDS in CR3 */
+ #define CR3_HW_ASID_BITS 12
++
+ /*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+-#define PTI_CONSUMED_ASID_BITS 0
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++# define PTI_CONSUMED_PCID_BITS 1
++#else
++# define PTI_CONSUMED_PCID_BITS 0
++#endif
++
++#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
+ /*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because ASID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
++#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
++
++/*
++ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
++ * lines.
++ */
++#define TLB_NR_DYN_ASIDS 6
+
+ static inline u16 kern_pcid(u16 asid)
+ {
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
++ * Make sure that the dynamic ASID space does not confict with the
++ * bit we are using to switch between user and kernel ASIDs.
++ */
++ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
++
++ /*
++ * The ASID being passed in here should have respected the
++ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
++ */
++ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
++#endif
++ /*
++ * The dynamically-assigned ASIDs that get passed in are small
++ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
++ * so do not bother to clear it.
++ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+@@ -95,12 +127,6 @@ static inline bool tlb_defer_switch_to_i
+ return !static_cpu_has(X86_FEATURE_PCID);
+ }
+
+-/*
+- * 6 because 6 should be plenty and struct tlb_state will fit in
+- * two cache lines.
+- */
+-#define TLB_NR_DYN_ASIDS 6
+-
+ struct tlb_context {
+ u64 ctx_id;
+ u64 tlb_gen;
+@@ -146,6 +172,13 @@ struct tlb_state {
+ bool invalidate_other;
+
+ /*
++ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
++ * the corresponding user PCID needs a flush next time we
++ * switch to it; see SWITCH_TO_USER_CR3.
++ */
++ unsigned short user_pcid_flush_mask;
++
++ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+@@ -250,14 +283,41 @@ static inline void cr4_set_bits_and_upda
+ extern void initialize_tlbstate_and_flush(void);
+
+ /*
++ * Given an ASID, flush the corresponding user ASID. We can delay this
++ * until the next time we switch to it.
++ *
++ * See SWITCH_TO_USER_CR3.
++ */
++static inline void invalidate_user_asid(u16 asid)
++{
++ /* There is no user ASID if address space separation is off */
++ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
++ return;
++
++ /*
++ * We only have a single ASID if PCID is off and the CR3
++ * write will have flushed it.
++ */
++ if (!cpu_feature_enabled(X86_FEATURE_PCID))
++ return;
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ __set_bit(kern_pcid(asid),
++ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
++}
++
++/*
+ * flush the entire current user mapping
+ */
+ static inline void __native_flush_tlb(void)
+ {
++ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ /*
+- * If current->mm == NULL then we borrow a mm which may change during a
+- * task switch and therefore we must not be preempted while we write CR3
+- * back:
++ * If current->mm == NULL then we borrow a mm which may change
++ * during a task switch and therefore we must not be preempted
++ * while we write CR3 back:
+ */
+ preempt_disable();
+ native_write_cr3(__native_read_cr3());
+@@ -301,7 +361,14 @@ static inline void __native_flush_tlb_gl
+ */
+ static inline void __native_flush_tlb_single(unsigned long addr)
+ {
++ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
++
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ invalidate_user_asid(loaded_mm_asid);
+ }
+
+ /*
+--- a/arch/x86/include/uapi/asm/processor-flags.h
++++ b/arch/x86/include/uapi/asm/processor-flags.h
+@@ -78,7 +78,12 @@
+ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
+ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
+ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
+-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
++
++#define X86_CR3_PCID_BITS 12
++#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
++
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+
+ /*
+ * Intel CPU features in CR4
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -17,6 +17,7 @@
+ #include <asm/sigframe.h>
+ #include <asm/bootparam.h>
+ #include <asm/suspend.h>
++#include <asm/tlbflush.h>
+
+ #ifdef CONFIG_XEN
+ #include <xen/interface/xen.h>
+@@ -94,6 +95,9 @@ void common(void) {
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
++ /* TLB state for the entry code */
++ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
++
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -855,7 +855,7 @@ void __init zone_sizes_init(void)
+ free_area_init_nodes(max_zone_pfns);
+ }
+
+-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+ .loaded_mm = &init_mm,
+ .next_asid = 1,
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
++ invalidate_user_asid(new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
--- /dev/null
+From 6cff64b86aaaa07f89f50498055a20e45754b0c1 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:08:01 +0100
+Subject: x86/mm: Use INVPCID for __native_flush_tlb_single()
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 6cff64b86aaaa07f89f50498055a20e45754b0c1 upstream.
+
+This uses INVPCID to shoot down individual lines of the user mapping
+instead of marking the entire user map as invalid. This
+could/might/possibly be faster.
+
+This for sure needs tlb_single_page_flush_ceiling to be redetermined;
+esp. since INVPCID is _slow_.
+
+A detailed performance analysis is available here:
+
+ https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com
+
+[ Peterz: Split out from big combo patch ]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeatures.h | 1
+ arch/x86/include/asm/tlbflush.h | 23 ++++++++++++-
+ arch/x86/mm/init.c | 64 +++++++++++++++++++++----------------
+ 3 files changed, 60 insertions(+), 28 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -197,6 +197,7 @@
+ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
+
+ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -85,6 +85,18 @@ static inline u16 kern_pcid(u16 asid)
+ return asid + 1;
+ }
+
++/*
++ * The user PCID is just the kernel one, plus the "switch bit".
++ */
++static inline u16 user_pcid(u16 asid)
++{
++ u16 ret = kern_pcid(asid);
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
++#endif
++ return ret;
++}
++
+ struct pgd_t;
+ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ {
+@@ -335,6 +347,8 @@ static inline void __native_flush_tlb_gl
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
++ *
++ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+@@ -368,7 +382,14 @@ static inline void __native_flush_tlb_si
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+- invalidate_user_asid(loaded_mm_asid);
++ /*
++ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
++ * Just use invalidate_user_asid() in case we are called early.
++ */
++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
++ invalidate_user_asid(loaded_mm_asid);
++ else
++ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+ }
+
+ /*
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -203,34 +203,44 @@ static void __init probe_page_size_mask(
+
+ static void setup_pcid(void)
+ {
+-#ifdef CONFIG_X86_64
+- if (boot_cpu_has(X86_FEATURE_PCID)) {
+- if (boot_cpu_has(X86_FEATURE_PGE)) {
+- /*
+- * This can't be cr4_set_bits_and_update_boot() --
+- * the trampoline code can't handle CR4.PCIDE and
+- * it wouldn't do any good anyway. Despite the name,
+- * cr4_set_bits_and_update_boot() doesn't actually
+- * cause the bits in question to remain set all the
+- * way through the secondary boot asm.
+- *
+- * Instead, we brute-force it and set CR4.PCIDE
+- * manually in start_secondary().
+- */
+- cr4_set_bits(X86_CR4_PCIDE);
+- } else {
+- /*
+- * flush_tlb_all(), as currently implemented, won't
+- * work if PCID is on but PGE is not. Since that
+- * combination doesn't exist on real hardware, there's
+- * no reason to try to fully support it, but it's
+- * polite to avoid corrupting data if we're on
+- * an improperly configured VM.
+- */
+- setup_clear_cpu_cap(X86_FEATURE_PCID);
+- }
++ if (!IS_ENABLED(CONFIG_X86_64))
++ return;
++
++ if (!boot_cpu_has(X86_FEATURE_PCID))
++ return;
++
++ if (boot_cpu_has(X86_FEATURE_PGE)) {
++ /*
++ * This can't be cr4_set_bits_and_update_boot() -- the
++ * trampoline code can't handle CR4.PCIDE and it wouldn't
++ * do any good anyway. Despite the name,
++ * cr4_set_bits_and_update_boot() doesn't actually cause
++ * the bits in question to remain set all the way through
++ * the secondary boot asm.
++ *
++ * Instead, we brute-force it and set CR4.PCIDE manually in
++ * start_secondary().
++ */
++ cr4_set_bits(X86_CR4_PCIDE);
++
++ /*
++ * INVPCID's single-context modes (2/3) only work if we set
++ * X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable
++ * on systems that have X86_CR4_PCIDE clear, or that have
++ * no INVPCID support at all.
++ */
++ if (boot_cpu_has(X86_FEATURE_INVPCID))
++ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
++ } else {
++ /*
++ * flush_tlb_all(), as currently implemented, won't work if
++ * PCID is on but PGE is not. Since that combination
++ * doesn't exist on real hardware, there's no reason to try
++ * to fully support it, but it's polite to avoid corrupting
++ * data if we're on an improperly configured VM.
++ */
++ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ }
+-#endif
+ }
+
+ #ifdef CONFIG_X86_32
--- /dev/null
+From 41f4c20b57a4890ea7f56ff8717cc83fefb8d537 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Tue, 12 Dec 2017 14:39:52 +0100
+Subject: x86/pti: Add the pti= cmdline option and documentation
+
+From: Borislav Petkov <bp@suse.de>
+
+commit 41f4c20b57a4890ea7f56ff8717cc83fefb8d537 upstream.
+
+Keep the "nopti" option for traditional reasons.
+
+[ tglx: Don't allow force on when running on XEN PV and made 'on'
+ printout conditional ]
+
+Requested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Andy Lutomirsky <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/admin-guide/kernel-parameters.txt | 6 +++++
+ arch/x86/mm/pti.c | 26 +++++++++++++++++++++++-
+ 2 files changed, 31 insertions(+), 1 deletion(-)
+
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -3255,6 +3255,12 @@
+ pt. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
++ pti= [X86_64]
++ Control user/kernel address space isolation:
++ on - enable
++ off - disable
++ auto - default setting
++
+ pty.legacy_count=
+ [KNL] Number of legacy pty's. Overwrites compiled-in
+ default number.
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -54,21 +54,45 @@ static void __init pti_print_if_insecure
+ pr_info("%s\n", reason);
+ }
+
++static void __init pti_print_if_secure(const char *reason)
++{
++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
++ pr_info("%s\n", reason);
++}
++
+ void __init pti_check_boottime_disable(void)
+ {
++ char arg[5];
++ int ret;
++
+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_print_if_insecure("disabled on XEN PV.");
+ return;
+ }
+
++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
++ if (ret > 0) {
++ if (ret == 3 && !strncmp(arg, "off", 3)) {
++ pti_print_if_insecure("disabled on command line.");
++ return;
++ }
++ if (ret == 2 && !strncmp(arg, "on", 2)) {
++ pti_print_if_secure("force enabled on command line.");
++ goto enable;
++ }
++ if (ret == 4 && !strncmp(arg, "auto", 4))
++ goto autosel;
++ }
++
+ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+
++autosel:
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ return;
+-
++enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+ }
+
--- /dev/null
+From 85900ea51577e31b186e523c8f4e068c79ecc7d3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 12 Dec 2017 07:56:42 -0800
+Subject: x86/pti: Map the vsyscall page if needed
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 85900ea51577e31b186e523c8f4e068c79ecc7d3 upstream.
+
+Make VSYSCALLs work fully in PTI mode by mapping them properly to the user
+space visible page tables.
+
+[ tglx: Hide unused functions (Patch by Arnd Bergmann) ]
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/vsyscall/vsyscall_64.c | 6 +--
+ arch/x86/include/asm/vsyscall.h | 1
+ arch/x86/mm/pti.c | 65 ++++++++++++++++++++++++++++++++++
+ 3 files changed, 69 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/entry/vsyscall/vsyscall_64.c
++++ b/arch/x86/entry/vsyscall/vsyscall_64.c
+@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long add
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+-static void __init set_vsyscall_pgtable_user_bits(void)
++void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+ {
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+- pgd = pgd_offset_k(VSYSCALL_ADDR);
++ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+ #if CONFIG_PGTABLE_LEVELS >= 5
+@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
+ vsyscall_mode == NATIVE
+ ? PAGE_KERNEL_VSYSCALL
+ : PAGE_KERNEL_VVAR);
+- set_vsyscall_pgtable_user_bits();
++ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+ }
+
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+--- a/arch/x86/include/asm/vsyscall.h
++++ b/arch/x86/include/asm/vsyscall.h
+@@ -7,6 +7,7 @@
+
+ #ifdef CONFIG_X86_VSYSCALL_EMULATION
+ extern void map_vsyscall(void);
++extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
+
+ /*
+ * Called on instruction fetch fault in vsyscall page.
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -38,6 +38,7 @@
+
+ #include <asm/cpufeature.h>
+ #include <asm/hypervisor.h>
++#include <asm/vsyscall.h>
+ #include <asm/cmdline.h>
+ #include <asm/pti.h>
+ #include <asm/pgtable.h>
+@@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pm
+ return pmd_offset(pud, address);
+ }
+
++#ifdef CONFIG_X86_VSYSCALL_EMULATION
++/*
++ * Walk the shadow copy of the page tables (optionally) trying to allocate
++ * page table pages on the way down. Does not support large pages.
++ *
++ * Note: this is only used when mapping *new* kernel data into the
++ * user/shadow page tables. It is never used for userspace data.
++ *
++ * Returns a pointer to a PTE on success, or NULL on failure.
++ */
++static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
++{
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++ pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
++ pte_t *pte;
++
++ /* We can't do anything sensible if we hit a large mapping. */
++ if (pmd_large(*pmd)) {
++ WARN_ON(1);
++ return NULL;
++ }
++
++ if (pmd_none(*pmd)) {
++ unsigned long new_pte_page = __get_free_page(gfp);
++ if (!new_pte_page)
++ return NULL;
++
++ if (pmd_none(*pmd)) {
++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++ new_pte_page = 0;
++ }
++ if (new_pte_page)
++ free_page(new_pte_page);
++ }
++
++ pte = pte_offset_kernel(pmd, address);
++ if (pte_flags(*pte) & _PAGE_USER) {
++ WARN_ONCE(1, "attempt to walk to user pte\n");
++ return NULL;
++ }
++ return pte;
++}
++
++static void __init pti_setup_vsyscall(void)
++{
++ pte_t *pte, *target_pte;
++ unsigned int level;
++
++ pte = lookup_address(VSYSCALL_ADDR, &level);
++ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
++ return;
++
++ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
++ if (WARN_ON(!target_pte))
++ return;
++
++ *target_pte = *pte;
++ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
++}
++#else
++static void __init pti_setup_vsyscall(void) { }
++#endif
++
+ static void __init
+ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+ {
+@@ -319,4 +383,5 @@ void __init pti_init(void)
+ pti_clone_user_shared();
+ pti_clone_entry_text();
+ pti_setup_espfix64();
++ pti_setup_vsyscall();
+ }
--- /dev/null
+From f55f0501cbf65ec41cca5058513031b711730b1d Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 12 Dec 2017 07:56:45 -0800
+Subject: x86/pti: Put the LDT in its own PGD if PTI is on
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit f55f0501cbf65ec41cca5058513031b711730b1d upstream.
+
+With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
+The LDT is per process, i.e. per mm.
+
+An earlier approach mapped the LDT on context switch into a fixmap area,
+but that's a big overhead and exhausted the fixmap space when NR_CPUS got
+big.
+
+Take advantage of the fact that there is an address space hole which
+provides a completely unused pgd. Use this pgd to manage per-mm LDT
+mappings.
+
+This has a down side: the LDT isn't (currently) randomized, and an attack
+that can write the LDT is instant root due to call gates (thanks, AMD, for
+leaving call gates in AMD64 but designing them wrong so they're only useful
+for exploits). This can be mitigated by making the LDT read-only or
+randomizing the mapping, either of which is straightforward on top of this
+patch.
+
+This will significantly slow down LDT users, but that shouldn't matter for
+important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
+old libc implementations.
+
+[ tglx: Cleaned it up. ]
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Kirill A. Shutemov <kirill@shutemov.name>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/x86/x86_64/mm.txt | 3
+ arch/x86/include/asm/mmu_context.h | 59 ++++++++++++-
+ arch/x86/include/asm/pgtable_64_types.h | 4
+ arch/x86/include/asm/processor.h | 23 +++--
+ arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++-
+ arch/x86/mm/dump_pagetables.c | 9 ++
+ 6 files changed, 220 insertions(+), 17 deletions(-)
+
+--- a/Documentation/x86/x86_64/mm.txt
++++ b/Documentation/x86/x86_64/mm.txt
+@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40
+ ... unused hole ...
+ ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
+ ... unused hole ...
++fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+ fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
+ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+ ... unused hole ...
+@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tab
+ hole caused by [56:63] sign extension
+ ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
++ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
+ ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -50,10 +50,33 @@ struct ldt_struct {
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+- struct desc_struct *entries;
+- unsigned int nr_entries;
++ struct desc_struct *entries;
++ unsigned int nr_entries;
++
++ /*
++ * If PTI is in use, then the entries array is not mapped while we're
++ * in user mode. The whole array will be aliased at the address
++ * given by ldt_slot_va(slot). We use two slots so that we can allocate
++ * and map, and enable a new LDT without invalidating the mapping
++ * of an older, still-in-use LDT.
++ *
++ * slot will be -1 if this LDT doesn't have an alias mapping.
++ */
++ int slot;
+ };
+
++/* This is a multiple of PAGE_SIZE. */
++#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
++
++static inline void *ldt_slot_va(int slot)
++{
++#ifdef CONFIG_X86_64
++ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
++#else
++ BUG();
++#endif
++}
++
+ /*
+ * Used for LDT copy/destruction.
+ */
+@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(
+ }
+ int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
+ void destroy_context_ldt(struct mm_struct *mm);
++void ldt_arch_exit_mmap(struct mm_struct *mm);
+ #else /* CONFIG_MODIFY_LDT_SYSCALL */
+ static inline void init_new_context_ldt(struct mm_struct *mm) { }
+ static inline int ldt_dup_context(struct mm_struct *oldmm,
+@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct
+ {
+ return 0;
+ }
+-static inline void destroy_context_ldt(struct mm_struct *mm) {}
++static inline void destroy_context_ldt(struct mm_struct *mm) { }
++static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
+ #endif
+
+ static inline void load_mm_ldt(struct mm_struct *mm)
+@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm
+ * that we can see.
+ */
+
+- if (unlikely(ldt))
+- set_ldt(ldt->entries, ldt->nr_entries);
+- else
++ if (unlikely(ldt)) {
++ if (static_cpu_has(X86_FEATURE_PTI)) {
++ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
++ /*
++ * Whoops -- either the new LDT isn't mapped
++ * (if slot == -1) or is mapped into a bogus
++ * slot (if slot > 1).
++ */
++ clear_LDT();
++ return;
++ }
++
++ /*
++ * If page table isolation is enabled, ldt->entries
++ * will not be mapped in the userspace pagetables.
++ * Tell the CPU to access the LDT through the alias
++ * at ldt_slot_va(ldt->slot).
++ */
++ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
++ } else {
++ set_ldt(ldt->entries, ldt->nr_entries);
++ }
++ } else {
+ clear_LDT();
++ }
+ #else
+ clear_LDT();
+ #endif
+@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct m
+ static inline void arch_exit_mmap(struct mm_struct *mm)
+ {
+ paravirt_arch_exit_mmap(mm);
++ ldt_arch_exit_mmap(mm);
+ }
+
+ #ifdef CONFIG_X86_64
+--- a/arch/x86/include/asm/pgtable_64_types.h
++++ b/arch/x86/include/asm/pgtable_64_types.h
+@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
+ # define VMALLOC_SIZE_TB _AC(12800, UL)
+ # define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
+ # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
++# define LDT_PGD_ENTRY _AC(-112, UL)
++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+ #else
+ # define VMALLOC_SIZE_TB _AC(32, UL)
+ # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
+ # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
++# define LDT_PGD_ENTRY _AC(-4, UL)
++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+ #endif
+
+ #ifdef CONFIG_RANDOMIZE_MEMORY
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(co
+
+ #else
+ /*
+- * User space process size. 47bits minus one guard page. The guard
+- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+- * the highest possible canonical userspace address, then that
+- * syscall will enter the kernel with a non-canonical return
+- * address, and SYSRET will explode dangerously. We avoid this
+- * particular problem by preventing anything from being mapped
+- * at the maximum canonical address.
++ * User space process size. This is the first address outside the user range.
++ * There are a few constraints that determine this:
++ *
++ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
++ * address, then that syscall will enter the kernel with a
++ * non-canonical return address, and SYSRET will explode dangerously.
++ * We avoid this particular problem by preventing anything executable
++ * from being mapped at the maximum canonical address.
++ *
++ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
++ * CPUs malfunction if they execute code from the highest canonical page.
++ * They'll speculate right off the end of the canonical space, and
++ * bad things happen. This is worked around in the same way as the
++ * Intel problem.
++ *
++ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+ #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -24,6 +24,7 @@
+ #include <linux/uaccess.h>
+
+ #include <asm/ldt.h>
++#include <asm/tlb.h>
+ #include <asm/desc.h>
+ #include <asm/mmu_context.h>
+ #include <asm/syscalls.h>
+@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
+ static void flush_ldt(void *__mm)
+ {
+ struct mm_struct *mm = __mm;
+- mm_context_t *pc;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
+ return;
+
+- pc = &mm->context;
+- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
++ load_mm_ldt(mm);
+
+ refresh_ldt_segments();
+ }
+@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_stru
+ return NULL;
+ }
+
++ /* The new LDT isn't aliased for PTI yet. */
++ new_ldt->slot = -1;
++
+ new_ldt->nr_entries = num_entries;
+ return new_ldt;
+ }
+
++/*
++ * If PTI is enabled, this maps the LDT into the kernelmode and
++ * usermode tables for the given mm.
++ *
++ * There is no corresponding unmap function. Even if the LDT is freed, we
++ * leave the PTEs around until the slot is reused or the mm is destroyed.
++ * This is harmless: the LDT is always in ordinary memory, and no one will
++ * access the freed slot.
++ *
++ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
++ * it useful, and the flush would slow down modify_ldt().
++ */
++static int
++map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ bool is_vmalloc, had_top_level_entry;
++ unsigned long va;
++ spinlock_t *ptl;
++ pgd_t *pgd;
++ int i;
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return 0;
++
++ /*
++ * Any given ldt_struct should have map_ldt_struct() called at most
++ * once.
++ */
++ WARN_ON(ldt->slot != -1);
++
++ /*
++ * Did we already have the top level entry allocated? We can't
++ * use pgd_none() for this because it doesn't do anything on
++ * 4-level page table kernels.
++ */
++ pgd = pgd_offset(mm, LDT_BASE_ADDR);
++ had_top_level_entry = (pgd->pgd != 0);
++
++ is_vmalloc = is_vmalloc_addr(ldt->entries);
++
++ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
++ unsigned long offset = i << PAGE_SHIFT;
++ const void *src = (char *)ldt->entries + offset;
++ unsigned long pfn;
++ pte_t pte, *ptep;
++
++ va = (unsigned long)ldt_slot_va(slot) + offset;
++ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
++ page_to_pfn(virt_to_page(src));
++ /*
++ * Treat the PTI LDT range as a *userspace* range.
++ * get_locked_pte() will allocate all needed pagetables
++ * and account for them in this mm.
++ */
++ ptep = get_locked_pte(mm, va, &ptl);
++ if (!ptep)
++ return -ENOMEM;
++ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
++ set_pte_at(mm, va, ptep, pte);
++ pte_unmap_unlock(ptep, ptl);
++ }
++
++ if (mm->context.ldt) {
++ /*
++ * We already had an LDT. The top-level entry should already
++ * have been allocated and synchronized with the usermode
++ * tables.
++ */
++ WARN_ON(!had_top_level_entry);
++ if (static_cpu_has(X86_FEATURE_PTI))
++ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
++ } else {
++ /*
++ * This is the first time we're mapping an LDT for this process.
++ * Sync the pgd to the usermode tables.
++ */
++ WARN_ON(had_top_level_entry);
++ if (static_cpu_has(X86_FEATURE_PTI)) {
++ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
++ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
++ }
++ }
++
++ va = (unsigned long)ldt_slot_va(slot);
++ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
++
++ ldt->slot = slot;
++#endif
++ return 0;
++}
++
++static void free_ldt_pgtables(struct mm_struct *mm)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++ struct mmu_gather tlb;
++ unsigned long start = LDT_BASE_ADDR;
++ unsigned long end = start + (1UL << PGDIR_SHIFT);
++
++ if (!static_cpu_has(X86_FEATURE_PTI))
++ return;
++
++ tlb_gather_mmu(&tlb, mm, start, end);
++ free_pgd_range(&tlb, start, end, start, end);
++ tlb_finish_mmu(&tlb, start, end);
++#endif
++}
++
+ /* After calling this, the LDT is immutable. */
+ static void finalize_ldt_struct(struct ldt_struct *ldt)
+ {
+@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *ol
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
++ retval = map_ldt_struct(mm, new_ldt, 0);
++ if (retval) {
++ free_ldt_pgtables(mm);
++ free_ldt_struct(new_ldt);
++ goto out_unlock;
++ }
+ mm->context.ldt = new_ldt;
+
+ out_unlock:
+@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struc
+ mm->context.ldt = NULL;
+ }
+
++void ldt_arch_exit_mmap(struct mm_struct *mm)
++{
++ free_ldt_pgtables(mm);
++}
++
+ static int read_ldt(void __user *ptr, unsigned long bytecount)
+ {
+ struct mm_struct *mm = current->mm;
+@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, u
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
++ /*
++ * If we are using PTI, map the new LDT into the userspace pagetables.
++ * If there is already an LDT, use the other slot so that other CPUs
++ * will continue to use the old LDT until install_ldt() switches
++ * them over to the new LDT.
++ */
++ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
++ if (error) {
++ free_ldt_struct(old_ldt);
++ goto out_unlock;
++ }
++
+ install_ldt(mm, new_ldt);
+ free_ldt_struct(old_ldt);
+ error = 0;
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -52,12 +52,18 @@ enum address_markers_idx {
+ USER_SPACE_NR = 0,
+ KERNEL_SPACE_NR,
+ LOW_KERNEL_NR,
++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
++ LDT_NR,
++#endif
+ VMALLOC_START_NR,
+ VMEMMAP_START_NR,
+ #ifdef CONFIG_KASAN
+ KASAN_SHADOW_START_NR,
+ KASAN_SHADOW_END_NR,
+ #endif
++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
++ LDT_NR,
++#endif
+ CPU_ENTRY_AREA_NR,
+ #ifdef CONFIG_X86_ESPFIX64
+ ESPFIX_START_NR,
+@@ -82,6 +88,9 @@ static struct addr_marker address_marker
+ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+ #endif
++#ifdef CONFIG_MODIFY_LDT_SYSCALL
++ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
++#endif
+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
+ #ifdef CONFIG_X86_ESPFIX64
+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },