--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andy Lutomirski <luto@kernel.org>
+Date: Tue, 29 Dec 2015 20:12:20 -0800
+Subject: mm: Add vm_insert_pfn_prot()
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 upstream
+
+The x86 vvar vma contains pages with differing cacheability
+flags. x86 currently implements this by manually inserting all
+the ptes using (io_)remap_pfn_range when the vma is set up.
+
+x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up
+the mappings as needed. The correct API to use to insert a pfn
+in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the
+vma's cache mode, and the HPET page in particular needs to be
+uncached despite the fact that the rest of the VMA is cached.
+
+Add vm_insert_pfn_prot() to support varying cacheability within
+the same non-COW VMA in a more sane manner.
+
+x86 could alternatively use multiple VMAs, but that's messy,
+would break CRIU, and would create unnecessary VMAs that would
+waste memory.
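+
+As a rough usage sketch (not part of this patch; "hpet_pfn" is a
+hypothetical placeholder for the pfn that needs a different cache mode),
+a 4.4-era .fault handler could use the new helper like this:
+
+	static int vvar_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+	{
+		unsigned long addr = (unsigned long)vmf->virtual_address;
+		int ret;
+
+		/* Map this one page uncached, unlike the rest of the VMA */
+		ret = vm_insert_pfn_prot(vma, addr, hpet_pfn,
+					 pgprot_noncached(vma->vm_page_prot));
+
+		return (ret == 0 || ret == -EBUSY) ? VM_FAULT_NOPAGE : VM_FAULT_SIGBUS;
+	}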
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Acked-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 2 ++
+ mm/memory.c | 25 +++++++++++++++++++++++--
+ 2 files changed, 25 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2083,6 +2083,8 @@ int remap_pfn_range(struct vm_area_struc
+ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn);
++int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
++ unsigned long pfn, pgprot_t pgprot);
+ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn);
+ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1605,8 +1605,29 @@ out:
+ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+ {
++ return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
++}
++EXPORT_SYMBOL(vm_insert_pfn);
++
++/**
++ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
++ * @vma: user vma to map to
++ * @addr: target user address of this page
++ * @pfn: source kernel pfn
++ * @pgprot: pgprot flags for the inserted page
++ *
++ * This is exactly like vm_insert_pfn, except that it allows drivers to
++ * to override pgprot on a per-page basis.
++ *
++ * This only makes sense for IO mappings, and it makes no sense for
++ * cow mappings. In general, using multiple vmas is preferable;
++ * vm_insert_pfn_prot should only be used if using multiple VMAs is
++ * impractical.
++ */
++int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
++ unsigned long pfn, pgprot_t pgprot)
++{
+ int ret;
+- pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * Technically, architectures with pte_special can avoid all these
+ * restrictions (same for remap_pfn_range). However we would like
+@@ -1628,7 +1649,7 @@ int vm_insert_pfn(struct vm_area_struct
+
+ return ret;
+ }
+-EXPORT_SYMBOL(vm_insert_pfn);
++EXPORT_SYMBOL(vm_insert_pfn_prot);
+
+ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 7 Oct 2016 17:00:18 -0700
+Subject: mm: fix cache mode tracking in vm_insert_mixed()
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 87744ab3832b83ba71b931f86f9cfdb000d07da5 upstream
+
+vm_insert_mixed(), unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(),
+fails to check the pgprot_t it uses for the mapping against the one
+recorded in the memtype tracking tree. Add the missing call to
+track_pfn_insert() to preclude cases where incompatible aliased mappings
+are established for a given physical address range.
+
+[groeck: Backport to v4.4.y]
+
+Link: http://lkml.kernel.org/r/147328717909.35069.14256589123570653697.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Cc: David Airlie <airlied@linux.ie>
+Cc: Matthew Wilcox <mawilcox@microsoft.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1654,10 +1654,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
+ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+ {
++ pgprot_t pgprot = vma->vm_page_prot;
++
+ BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
++ if (track_pfn_insert(vma, &pgprot, pfn))
++ return -EINVAL;
+
+ /*
+ * If we don't have pte special, then we have to use the pfn_valid()
+@@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struc
+ struct page *page;
+
+ page = pfn_to_page(pfn);
+- return insert_page(vma, addr, page, vma->vm_page_prot);
++ return insert_page(vma, addr, page, pgprot);
+ }
+- return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
++ return insert_pfn(vma, addr, pfn, pgprot);
+ }
+ EXPORT_SYMBOL(vm_insert_mixed);
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Fri, 8 Sep 2017 16:10:46 -0700
+Subject: mm: x86: move _PAGE_SWP_SOFT_DIRTY from bit 7 to bit 1
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit eee4818baac0f2b37848fdf90e4b16430dc536ac upstream
+
+_PAGE_PSE is used to distinguish between a truly non-present
+(_PAGE_PRESENT=0) PMD, and a PMD which is undergoing a THP split and
+should be treated as present.
+
+But _PAGE_SWP_SOFT_DIRTY currently uses the _PAGE_PSE bit, which would
+cause confusion between one of those PMDs undergoing a THP split, and a
+soft-dirty PMD. Dropping the _PAGE_PSE check in pmd_present() does not
+work well, because it can hurt the TLB handling optimization in THP split.
+
+Thus, we need to move the bit.
+
+In the current kernel, bits 1-4 are not used in non-present format since
+commit 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work
+around erratum"). So let's move _PAGE_SWP_SOFT_DIRTY to bit 1. Bit 7
+is used as reserved (always clear), so please don't use it for any
+other purpose.
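+
+(Illustrative consequence, not spelled out in the patch: after this change
+pte_swp_mksoft_dirty() sets bit 1 (_PAGE_RW) in a non-present entry instead
+of bit 7 (_PAGE_PSE), so a soft-dirty swap/migration entry can no longer be
+mistaken by pmd_present() for a PMD undergoing a THP split.)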
+
+[dwmw2: Pulled in to 4.9 backport to support L1TF changes]
+
+Link: http://lkml.kernel.org/r/20170717193955.20207-3-zi.yan@sent.com
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Ingo Molnar <mingo@elte.hu>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64.h | 12 +++++++++---
+ arch/x86/include/asm/pgtable_types.h | 10 +++++-----
+ 2 files changed, 14 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -166,15 +166,21 @@ static inline int pgd_large(pgd_t pgd) {
+ /*
+ * Encode and de-code a swap entry
+ *
+- * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
+- * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
+- * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
++ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
++ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
++ * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+ * there. We also need to avoid using A and D because of an
+ * erratum where they can be incorrectly set by hardware on
+ * non-present PTEs.
++ *
++ * SD (1) in swp entry is used to store soft dirty bit, which helps us
++ * remember soft dirty over page migration
++ *
++ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
++ * but also L and G.
+ */
+ #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+ #define SWP_TYPE_BITS 5
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -70,15 +70,15 @@
+ /*
+ * Tracking soft dirty bit when a page goes to a swap is tricky.
+ * We need a bit which can be stored in pte _and_ not conflict
+- * with swap entry format. On x86 bits 6 and 7 are *not* involved
+- * into swap entry computation, but bit 6 is used for nonlinear
+- * file mapping, so we borrow bit 7 for soft dirty tracking.
++ * with swap entry format. On x86 bits 1-4 are *not* involved
++ * into swap entry computation, but bit 7 is used for thp migration,
++ * so we borrow bit 1 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as swap dirty page
+- * mark if and only if the PTE has present bit clear!
++ * mark if and only if the PTE/PMD has present bit clear!
+ */
+ #ifdef CONFIG_MEM_SOFT_DIRTY
+-#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
++#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
+ #else
+ #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
+ #endif
x86-speculation-protect-against-userspace-userspace-spectrersb.patch
kprobes-x86-fix-p-uses-in-error-messages.patch
x86-irqflags-provide-a-declaration-for-native_save_fl.patch
+x86-speculation-l1tf-increase-32bit-pae-__physical_page_shift.patch
+x86-mm-move-swap-offset-type-up-in-pte-to-work-around-erratum.patch
+x86-mm-fix-swap-entry-comment-and-macro.patch
+mm-x86-move-_page_swp_soft_dirty-from-bit-7-to-bit-1.patch
+x86-speculation-l1tf-change-order-of-offset-type-in-swap-entry.patch
+x86-speculation-l1tf-protect-swap-entries-against-l1tf.patch
+x86-speculation-l1tf-protect-prot_none-ptes-against-speculation.patch
+x86-speculation-l1tf-make-sure-the-first-page-is-always-reserved.patch
+x86-speculation-l1tf-add-sysfs-reporting-for-l1tf.patch
+mm-add-vm_insert_pfn_prot.patch
+mm-fix-cache-mode-tracking-in-vm_insert_mixed.patch
+x86-speculation-l1tf-disallow-non-privileged-high-mmio-prot_none-mappings.patch
+x86-speculation-l1tf-limit-swap-file-size-to-max_pa-2.patch
+x86-bugs-move-the-l1tf-function-and-define-pr_fmt-properly.patch
+x86-speculation-l1tf-extend-64bit-swap-file-size-limit.patch
+x86-cpufeatures-add-detection-of-l1d-cache-flush-support.patch
+x86-speculation-l1tf-protect-pae-swap-entries-against-l1tf.patch
+x86-speculation-l1tf-fix-up-pte-pfn-conversion-for-pae.patch
+x86-speculation-l1tf-invert-all-not-present-mappings.patch
+x86-speculation-l1tf-make-pmd-pud_mknotpresent-invert.patch
+x86-mm-pat-make-set_memory_np-l1tf-safe.patch
+x86-mm-kmmio-make-the-tracer-robust-against-l1tf.patch
+x86-speculation-l1tf-fix-up-cpu-feature-flags.patch
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Wed, 20 Jun 2018 16:42:57 -0400
+Subject: x86/bugs: Move the l1tf function and define pr_fmt properly
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 56563f53d3066afa9e63d6c997bf67e76a8b05c0 upstream
+
+The pr_warn in l1tf_select_mitigation would have used the prior pr_fmt
+which was defined as "Spectre V2 : ".
+
+Move the function to be past SSBD and also define the pr_fmt.
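+
+Illustrative effect (not taken from a real boot log): with the old prefix
+the warning would have read
+
+	Spectre V2 : System has more than MAX_PA/2 memory. L1TF mitigation not effective.
+
+whereas with pr_fmt defined as "L1TF: " it reads
+
+	L1TF: System has more than MAX_PA/2 memory. L1TF mitigation not effective.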
+
+Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf")
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 55 +++++++++++++++++++++++----------------------
+ 1 file changed, 29 insertions(+), 26 deletions(-)
+
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -207,32 +207,6 @@ static void x86_amd_ssb_disable(void)
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+ }
+
+-static void __init l1tf_select_mitigation(void)
+-{
+- u64 half_pa;
+-
+- if (!boot_cpu_has_bug(X86_BUG_L1TF))
+- return;
+-
+-#if CONFIG_PGTABLE_LEVELS == 2
+- pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+- return;
+-#endif
+-
+- /*
+- * This is extremely unlikely to happen because almost all
+- * systems have far more MAX_PA/2 than RAM can be fit into
+- * DIMM slots.
+- */
+- half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+- if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
+- pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+- return;
+- }
+-
+- setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+-}
+-
+ #ifdef RETPOLINE
+ static bool spectre_v2_bad_module;
+
+@@ -658,6 +632,35 @@ void x86_spec_ctrl_setup_ap(void)
+ x86_amd_ssb_disable();
+ }
+
++#undef pr_fmt
++#define pr_fmt(fmt) "L1TF: " fmt
++static void __init l1tf_select_mitigation(void)
++{
++ u64 half_pa;
++
++ if (!boot_cpu_has_bug(X86_BUG_L1TF))
++ return;
++
++#if CONFIG_PGTABLE_LEVELS == 2
++ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
++ return;
++#endif
++
++ /*
++ * This is extremely unlikely to happen because almost all
++ * systems have far more MAX_PA/2 than RAM can be fit into
++ * DIMM slots.
++ */
++ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
++ if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
++ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
++ return;
++ }
++
++ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
++}
++#undef pr_fmt
++
+ #ifdef CONFIG_SYSFS
+
+ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Wed, 20 Jun 2018 16:42:58 -0400
+Subject: x86/cpufeatures: Add detection of L1D cache flush support.
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 11e34e64e4103955fc4568750914c75d65ea87ee upstream
+
+336996-Speculative-Execution-Side-Channel-Mitigations.pdf defines a new MSR
+(IA32_FLUSH_CMD) which is detected by the CPUID.7.EDX[28]=1 bit being set.
+
+This new MSR "gives software a way to invalidate structures with finer
+granularity than other architectural methods like WBINVD."
+
+A copy of this document is available at
+ https://bugzilla.kernel.org/show_bug.cgi?id=199511
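+
+A hedged sketch of how the new feature bit is meant to be consumed
+(MSR_IA32_FLUSH_CMD and L1D_FLUSH are defined by the related L1TF/KVM
+patches, not by this one):
+
+	if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
+		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);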
+
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/cpufeatures.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -310,6 +310,7 @@
+ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
+ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
++#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
+ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 10 Aug 2016 10:23:25 -0700
+Subject: x86/mm: Fix swap entry comment and macro
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 upstream
+
+A recent patch changed the format of a swap PTE.
+
+The comment explaining the format of the swap PTE is wrong about
+the bits used for the swap type field. Amusingly, the ASCII art
+and the patch description are correct, but the comment itself
+is wrong.
+
+As I was looking at this, I also noticed that the
+SWP_OFFSET_FIRST_BIT has an off-by-one error. This does not
+really hurt anything. It just wasted a bit of space in the PTE,
+giving us 2^59 bytes of addressable space in our swapfiles
+instead of 2^60. But it doesn't match the comments, and it
+wastes a bit of space, so fix it.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave@sr71.net>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum")
+Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
+- * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
++ * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+@@ -179,7 +179,7 @@ static inline int pgd_large(pgd_t pgd) {
+ #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+ #define SWP_TYPE_BITS 5
+ /* Place the offset above the type: */
+-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
++#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
+
+ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Tue, 7 Aug 2018 15:09:38 -0700
+Subject: x86/mm/kmmio: Make the tracer robust against L1TF
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 1063711b57393c1999248cccb57bebfaf16739e7 upstream
+
+The mmio tracer sets io mapping PTEs and PMDs to non-present when enabled
+without inverting the address bits, which makes the PTE entry vulnerable
+to L1TF.
+
+Make it use the right low level macros to actually invert the address bits
+to protect against L1TF.
+
+In principle this could be avoided because MMIO tracing is not likely to be
+enabled on production machines, but the fix is straightforward and for
+consistency's sake it's better to get rid of the open-coded PTE manipulation.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/kmmio.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/mm/kmmio.c
++++ b/arch/x86/mm/kmmio.c
+@@ -125,24 +125,29 @@ static struct kmmio_fault_page *get_kmmi
+
+ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+ {
++ pmd_t new_pmd;
+ pmdval_t v = pmd_val(*pmd);
+ if (clear) {
+- *old = v & _PAGE_PRESENT;
+- v &= ~_PAGE_PRESENT;
+- } else /* presume this has been called with clear==true previously */
+- v |= *old;
+- set_pmd(pmd, __pmd(v));
++ *old = v;
++ new_pmd = pmd_mknotpresent(*pmd);
++ } else {
++ /* Presume this has been called with clear==true previously */
++ new_pmd = __pmd(*old);
++ }
++ set_pmd(pmd, new_pmd);
+ }
+
+ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
+ {
+ pteval_t v = pte_val(*pte);
+ if (clear) {
+- *old = v & _PAGE_PRESENT;
+- v &= ~_PAGE_PRESENT;
+- } else /* presume this has been called with clear==true previously */
+- v |= *old;
+- set_pte_atomic(pte, __pte(v));
++ *old = v;
++ /* Nothing should care about address */
++ pte_clear(&init_mm, 0, pte);
++ } else {
++ /* Presume this has been called with clear==true previously */
++ set_pte_atomic(pte, __pte(*old));
++ }
+ }
+
+ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Thu, 7 Jul 2016 17:19:11 -0700
+Subject: x86/mm: Move swap offset/type up in PTE to work around erratum
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 00839ee3b299303c6a5e26a0a2485427a3afcbbf upstream
+
+This erratum can result in Accessed/Dirty getting set by the hardware
+when we do not expect them to be (on !Present PTEs).
+
+Instead of trying to fix them up after this happens, we just
+allow the bits to get set and try to ignore them. We do this by
+shifting the layout of the bits we use for swap offset/type in
+our 64-bit PTEs.
+
+It looks like this:
+
+ bitnrs: | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0|
+ names: | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P|
+ before: | OFFSET (9-63) |0|X|X| TYPE(1-5) |0|
+ after: | OFFSET (14-63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0|
+
+Note that D was already a don't care (X) even before. We just
+move TYPE up and turn its old spot (which could be hit by the
+A bit) into all don't cares.
+
+We take 5 bits away from the offset, but that still leaves us
+with 50 bits which lets us index into a 62-bit swapfile (4 EiB).
+I think that's probably fine for the moment. We could
+theoretically reclaim 5 of the bits (1, 2, 3, 4, 7) but it
+doesn't gain us anything.
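+
+(Checking that arithmetic as a side note: the offset occupies bits 14-63,
+i.e. 50 bits, and it counts 4 KiB pages, so the addressable swap space is
+2^50 * 2^12 bytes = 2^62 bytes = 4 EiB.)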
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave@sr71.net>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Luis R. Rodriguez <mcgrof@suse.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Toshi Kani <toshi.kani@hp.com>
+Cc: dave.hansen@intel.com
+Cc: linux-mm@kvack.org
+Cc: mhocko@suse.com
+Link: http://lkml.kernel.org/r/20160708001911.9A3FD2B6@viggo.jf.intel.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64.h | 26 ++++++++++++++++++++------
+ 1 file changed, 20 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -163,18 +163,32 @@ static inline int pgd_large(pgd_t pgd) {
+ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+ #define pte_unmap(pte) ((void)(pte))/* NOP */
+
+-/* Encode and de-code a swap entry */
++/*
++ * Encode and de-code a swap entry
++ *
++ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
++ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
++ * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
++ *
++ * G (8) is aliased and used as a PROT_NONE indicator for
++ * !present ptes. We need to start storing swap entries above
++ * there. We also need to avoid using A and D because of an
++ * erratum where they can be incorrectly set by hardware on
++ * non-present PTEs.
++ */
++#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+ #define SWP_TYPE_BITS 5
+-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
++/* Place the offset above the type: */
++#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
+
+ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
++#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+-#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
++#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
+ #define __swp_entry(type, offset) ((swp_entry_t) { \
+- ((type) << (_PAGE_BIT_PRESENT + 1)) \
+- | ((offset) << SWP_OFFSET_SHIFT) })
++ ((type) << (SWP_TYPE_FIRST_BIT)) \
++ | ((offset) << SWP_OFFSET_FIRST_BIT) })
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+ #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Tue, 7 Aug 2018 15:09:39 -0700
+Subject: x86/mm/pat: Make set_memory_np() L1TF safe
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 958f79b9ee55dfaf00c8106ed1c22a2919e0028b upstream
+
+set_memory_np() is used to mark kernel mappings not present, but it has
+its own open-coded mechanism which does not have the L1TF protection of
+inverting the address bits.
+
+Replace the open coded PTE manipulation with the L1TF protecting low level
+PTE routines.
+
+Passes the CPA self test.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[ dwmw2: Pull in pud_mkhuge() from commit a00cc7d9dd, and pfn_pud() ]
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+[groeck: port to 4.4]
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable.h | 27 +++++++++++++++++++++++++++
+ arch/x86/mm/pageattr.c | 8 ++++----
+ 2 files changed, 31 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -378,12 +378,39 @@ static inline pmd_t pfn_pmd(unsigned lon
+ return __pmd(pfn | massage_pgprot(pgprot));
+ }
+
++static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
++{
++ phys_addr_t pfn = page_nr << PAGE_SHIFT;
++ pfn ^= protnone_mask(pgprot_val(pgprot));
++ pfn &= PHYSICAL_PUD_PAGE_MASK;
++ return __pud(pfn | massage_pgprot(pgprot));
++}
++
+ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+ {
+ return pfn_pmd(pmd_pfn(pmd),
+ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+ }
+
++static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
++{
++ pudval_t v = native_pud_val(pud);
++
++ return __pud(v | set);
++}
++
++static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
++{
++ pudval_t v = native_pud_val(pud);
++
++ return __pud(v & ~clear);
++}
++
++static inline pud_t pud_mkhuge(pud_t pud)
++{
++ return pud_set_flags(pud, _PAGE_PSE);
++}
++
+ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
+ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+--- a/arch/x86/mm/pageattr.c
++++ b/arch/x86/mm/pageattr.c
+@@ -1006,8 +1006,8 @@ static int populate_pmd(struct cpa_data
+
+ pmd = pmd_offset(pud, start);
+
+- set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+- massage_pgprot(pmd_pgprot)));
++ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
++ canon_pgprot(pmd_pgprot))));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE;
+@@ -1079,8 +1079,8 @@ static int populate_pud(struct cpa_data
+ * Map everything starting from the Gb boundary, possibly with 1G pages
+ */
+ while (end - start >= PUD_SIZE) {
+- set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+- massage_pgprot(pud_pgprot)));
++ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
++ canon_pgprot(pud_pgprot))));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE;
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:26 -0700
+Subject: x86/speculation/l1tf: Add sysfs reporting for l1tf
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 17dbca119312b4e8173d4e25ff64262119fcef38 upstream
+
+L1TF core kernel workarounds are cheap and normally always enabled. However,
+they should still be reported in sysfs if the system is vulnerable or
+mitigated. Add the necessary CPU feature/bug bits.
+
+- Extend the existing checks for Meltdowns to determine if the system is
+ vulnerable. All CPUs which are not vulnerable to Meltdown are also not
+ vulnerable to L1TF
+
+- Check for 32bit non PAE and emit a warning as there is no practical way
+ for mitigation due to the limited physical address bits
+
+- If the system has more than MAX_PA/2 physical memory the invert page
+ workarounds don't protect the system against the L1TF attack anymore,
+ because an inverted physical address will also point to valid
+ memory. Print a warning in this case and report that the system is
+ vulnerable.
+
+Add a function which returns the PFN limit for the L1TF mitigation, which
+will be used in follow up patches for sanity and range checks.
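+
+As a worked example of the new helper (assuming a typical configuration,
+not stated in the patch): with boot_cpu_data.x86_phys_bits == 46 and
+PAGE_SHIFT == 12, l1tf_pfn_limit() returns BIT(46 - 1 - 12) - 1 = 2^33 - 1,
+i.e. the last pfn below MAX_PA/2 (2^45 bytes = 32 TiB).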
+
+[ tglx: Renamed the CPU feature bit to L1TF_PTEINV ]
+[ dwmw2: Backport to 4.9 (cpufeatures.h, E820) ]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/cpufeatures.h | 3 +-
+ arch/x86/include/asm/processor.h | 5 ++++
+ arch/x86/kernel/cpu/bugs.c | 40 +++++++++++++++++++++++++++++++++++++
+ arch/x86/kernel/cpu/common.c | 20 ++++++++++++++++++
+ drivers/base/cpu.c | 8 +++++++
+ include/linux/cpu.h | 2 +
+ 6 files changed, 77 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -214,7 +214,7 @@
+ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
+ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
+ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+-
++#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
+
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+@@ -331,5 +331,6 @@
+ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
+ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
++#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
+
+ #endif /* _ASM_X86_CPUFEATURES_H */
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -172,6 +172,11 @@ extern const struct seq_operations cpuin
+
+ extern void cpu_detect(struct cpuinfo_x86 *c);
+
++static inline unsigned long l1tf_pfn_limit(void)
++{
++ return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
++}
++
+ extern void early_cpu_init(void);
+ extern void identify_boot_cpu(void);
+ extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -26,9 +26,11 @@
+ #include <asm/pgtable.h>
+ #include <asm/cacheflush.h>
+ #include <asm/intel-family.h>
++#include <asm/e820.h>
+
+ static void __init spectre_v2_select_mitigation(void);
+ static void __init ssb_select_mitigation(void);
++static void __init l1tf_select_mitigation(void);
+
+ /*
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+@@ -80,6 +82,8 @@ void __init check_bugs(void)
+ */
+ ssb_select_mitigation();
+
++ l1tf_select_mitigation();
++
+ #ifdef CONFIG_X86_32
+ /*
+ * Check whether we are able to run this kernel safely on SMP.
+@@ -203,6 +207,32 @@ static void x86_amd_ssb_disable(void)
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+ }
+
++static void __init l1tf_select_mitigation(void)
++{
++ u64 half_pa;
++
++ if (!boot_cpu_has_bug(X86_BUG_L1TF))
++ return;
++
++#if CONFIG_PGTABLE_LEVELS == 2
++ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
++ return;
++#endif
++
++ /*
++ * This is extremely unlikely to happen because almost all
++ * systems have far more MAX_PA/2 than RAM can be fit into
++ * DIMM slots.
++ */
++ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
++ if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
++ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
++ return;
++ }
++
++ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
++}
++
+ #ifdef RETPOLINE
+ static bool spectre_v2_bad_module;
+
+@@ -655,6 +685,11 @@ static ssize_t cpu_show_common(struct de
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
++ case X86_BUG_L1TF:
++ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
++ return sprintf(buf, "Mitigation: Page Table Inversion\n");
++ break;
++
+ default:
+ break;
+ }
+@@ -681,4 +716,9 @@ ssize_t cpu_show_spec_store_bypass(struc
+ {
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+ }
++
++ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
++}
+ #endif
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -880,6 +880,21 @@ static const __initconst struct x86_cpu_
+ {}
+ };
+
++static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
++ /* in addition to cpu_no_speculation */
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
++ {}
++};
++
+ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+ {
+ u64 ia32_cap = 0;
+@@ -905,6 +920,11 @@ static void __init cpu_set_bug_bits(stru
+ return;
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
++
++ if (x86_match_cpu(cpu_no_l1tf))
++ return;
++
++ setup_force_cpu_bug(X86_BUG_L1TF);
+ }
+
+ /*
+--- a/drivers/base/cpu.c
++++ b/drivers/base/cpu.c
+@@ -524,16 +524,24 @@ ssize_t __weak cpu_show_spec_store_bypas
+ return sprintf(buf, "Not affected\n");
+ }
+
++ssize_t __weak cpu_show_l1tf(struct device *dev,
++ struct device_attribute *attr, char *buf)
++{
++ return sprintf(buf, "Not affected\n");
++}
++
+ static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+ static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+ static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
++static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
+
+ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+ &dev_attr_meltdown.attr,
+ &dev_attr_spectre_v1.attr,
+ &dev_attr_spectre_v2.attr,
+ &dev_attr_spec_store_bypass.attr,
++ &dev_attr_l1tf.attr,
+ NULL
+ };
+
+--- a/include/linux/cpu.h
++++ b/include/linux/cpu.h
+@@ -48,6 +48,8 @@ extern ssize_t cpu_show_spectre_v2(struc
+ struct device_attribute *attr, char *buf);
+ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
+ struct device_attribute *attr, char *buf);
++extern ssize_t cpu_show_l1tf(struct device *dev,
++ struct device_attribute *attr, char *buf);
+
+ extern __printf(4, 5)
+ struct device *cpu_device_create(struct device *parent, void *drvdata,
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 13 Jun 2018 15:48:22 -0700
+Subject: x86/speculation/l1tf: Change order of offset/type in swap entry
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit bcd11afa7adad8d720e7ba5ef58bdcd9775cf45f upstream
+
+If pages are swapped out, the swap entry is stored in the corresponding
+PTE, which has the Present bit cleared. CPUs vulnerable to L1TF speculate
+on PTE entries even when the Present bit is cleared and would treat the
+swap entry as a physical address (PFN). To mitigate that, the upper bits
+of the PTE must be set so the PTE points to non-existent memory.
+
+The swap entry stores the type and the offset of a swapped out page in the
+PTE. The type is stored in bits 9-13 and the offset in bits 14-63. The
+hardware ignores the bits beyond the physical address space limit, so to
+make the mitigation effective it's required to start 'offset' at the lowest
+possible bit so that even large swap offsets do not reach into the physical
+address space limit bits.
+
+Move offset to bit 9-58 and type to bit 59-63 which are the bits that
+hardware generally doesn't care about.
+
+That, in turn, means that if you are on a desktop chip with only 40 bits of
+physical addressing, now that the offset starts at bit 9, there need to be
+30 bits of offset actually *in use* until bit 39 ends up being set, which
+means when inverted it will again point into existing memory.
+
+So that's 4 terabyte of swap space (because the offset is counted in pages,
+so 30 bits of offset is 42 bits of actual coverage). With bigger physical
+addressing, that obviously grows further, until the limit of the offset is
+hit (at 50 bits of offset - 62 bits of actual swap file coverage).
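+
+(Spelling out that arithmetic as a side note: 30 bits of offset, counted in
+4 KiB pages, covers 2^30 * 2^12 bytes = 2^42 bytes = 4 TiB of swap; the
+maximum of 50 offset bits covers 2^50 * 2^12 = 2^62 bytes.)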
+
+This is a preparatory change for the actual swap entry inversion to protect
+against L1TF.
+
+[ AK: Updated description and minor tweaks. Split into two parts ]
+[ tglx: Massaged changelog ]
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Andi Kleen <ak@linux.intel.com>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64.h | 31 ++++++++++++++++++++-----------
+ 1 file changed, 20 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+- * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
++ * | TYPE (59-63) | OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+@@ -182,19 +182,28 @@ static inline int pgd_large(pgd_t pgd) {
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
+ */
+-#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+-#define SWP_TYPE_BITS 5
+-/* Place the offset above the type: */
+-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
++#define SWP_TYPE_BITS 5
++
++#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
++
++/* We always extract/encode the offset by shifting it all the way up, and then down again */
++#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
+
+ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+-#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
+- & ((1U << SWP_TYPE_BITS) - 1))
+-#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
+-#define __swp_entry(type, offset) ((swp_entry_t) { \
+- ((type) << (SWP_TYPE_FIRST_BIT)) \
+- | ((offset) << SWP_OFFSET_FIRST_BIT) })
++/* Extract the high bits for type */
++#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
++
++/* Shift up (to get rid of type), then down to get value */
++#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
++
++/*
++ * Shift the offset up "too far" by TYPE bits, then down again
++ */
++#define __swp_entry(type, offset) ((swp_entry_t) { \
++ ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
++ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
++
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+ #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:27 -0700
+Subject: x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 42e4089c7890725fcd329999252dc489b72f2921 upstream
+
+For L1TF, PROT_NONE mappings are protected by inverting the PFN in the page
+table entry. This sets the high bits in the CPU's address space, thus
+making sure an unmapped entry does not point to valid cached memory.
+
+Some server system BIOSes put the MMIO mappings high up in the physical
+address space. If such a high mapping was mapped to unprivileged users,
+they could attack low memory by setting such a mapping to PROT_NONE. This
+could happen through a special device driver which is not access
+protected. Normal /dev/mem is of course access protected.
+
+To avoid this, forbid PROT_NONE mappings or mprotect for high MMIO mappings.
+
+Valid page mappings are allowed because the system is then unsafe anyway.
+
+It's not expected that users commonly use PROT_NONE on MMIO. But to
+minimize any impact this is only enforced if the mapping actually refers to
+a high MMIO address (defined as the MAX_PA-1 bit being set), and also skip
+the check for root.
+
+For mmaps this is straightforward and can be handled in vm_insert_pfn() and
+in remap_pfn_range().
+
+For mprotect it's a bit trickier. At the point where the actual PTEs are
+accessed, a lot of state has been changed and it would be difficult to undo
+on an error. Since this is an uncommon case, use a separate early page
+table walk pass for MMIO PROT_NONE mappings that checks for this condition
+early. For non-MMIO and non-PROT_NONE mappings there are no changes.
+
+[dwmw2: Backport to 4.9]
+[groeck: Backport to 4.4]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable.h | 8 ++++++
+ arch/x86/mm/mmap.c | 21 +++++++++++++++++
+ include/asm-generic/pgtable.h | 12 ++++++++++
+ mm/memory.c | 29 ++++++++++++++++++------
+ mm/mprotect.c | 49 +++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 112 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -942,6 +942,14 @@ static inline pte_t pte_swp_clear_soft_d
+ }
+ #endif
+
++#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
++extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
++
++static inline bool arch_has_pfn_modify_check(void)
++{
++ return boot_cpu_has_bug(X86_BUG_L1TF);
++}
++
+ #include <asm-generic/pgtable.h>
+ #endif /* __ASSEMBLY__ */
+
+--- a/arch/x86/mm/mmap.c
++++ b/arch/x86/mm/mmap.c
+@@ -121,3 +121,24 @@ const char *arch_vma_name(struct vm_area
+ return "[mpx]";
+ return NULL;
+ }
++
++/*
++ * Only allow root to set high MMIO mappings to PROT_NONE.
++ * This prevents an unpriv. user to set them to PROT_NONE and invert
++ * them, then pointing to valid memory for L1TF speculation.
++ *
++ * Note: for locked down kernels may want to disable the root override.
++ */
++bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
++{
++ if (!boot_cpu_has_bug(X86_BUG_L1TF))
++ return true;
++ if (!__pte_needs_invert(pgprot_val(prot)))
++ return true;
++ /* If it's real memory always allow */
++ if (pfn_valid(pfn))
++ return true;
++ if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
++ return false;
++ return true;
++}
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -805,4 +805,16 @@ static inline int pmd_free_pte_page(pmd_
+ #define io_remap_pfn_range remap_pfn_range
+ #endif
+
++#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
++static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
++{
++ return true;
++}
++
++static inline bool arch_has_pfn_modify_check(void)
++{
++ return false;
++}
++#endif
++
+ #endif /* _ASM_GENERIC_PGTABLE_H */
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1645,6 +1645,9 @@ int vm_insert_pfn_prot(struct vm_area_st
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return -EINVAL;
+
++ if (!pfn_modify_allowed(pfn, pgprot))
++ return -EACCES;
++
+ ret = insert_pfn(vma, addr, pfn, pgprot);
+
+ return ret;
+@@ -1663,6 +1666,9 @@ int vm_insert_mixed(struct vm_area_struc
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return -EINVAL;
+
++ if (!pfn_modify_allowed(pfn, pgprot))
++ return -EACCES;
++
+ /*
+ * If we don't have pte special, then we have to use the pfn_valid()
+ * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+@@ -1691,6 +1697,7 @@ static int remap_pte_range(struct mm_str
+ {
+ pte_t *pte;
+ spinlock_t *ptl;
++ int err = 0;
+
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+@@ -1698,12 +1705,16 @@ static int remap_pte_range(struct mm_str
+ arch_enter_lazy_mmu_mode();
+ do {
+ BUG_ON(!pte_none(*pte));
++ if (!pfn_modify_allowed(pfn, prot)) {
++ err = -EACCES;
++ break;
++ }
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+- return 0;
++ return err;
+ }
+
+ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+@@ -1712,6 +1723,7 @@ static inline int remap_pmd_range(struct
+ {
+ pmd_t *pmd;
+ unsigned long next;
++ int err;
+
+ pfn -= addr >> PAGE_SHIFT;
+ pmd = pmd_alloc(mm, pud, addr);
+@@ -1720,9 +1732,10 @@ static inline int remap_pmd_range(struct
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ do {
+ next = pmd_addr_end(addr, end);
+- if (remap_pte_range(mm, pmd, addr, next,
+- pfn + (addr >> PAGE_SHIFT), prot))
+- return -ENOMEM;
++ err = remap_pte_range(mm, pmd, addr, next,
++ pfn + (addr >> PAGE_SHIFT), prot);
++ if (err)
++ return err;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+ }
+@@ -1733,6 +1746,7 @@ static inline int remap_pud_range(struct
+ {
+ pud_t *pud;
+ unsigned long next;
++ int err;
+
+ pfn -= addr >> PAGE_SHIFT;
+ pud = pud_alloc(mm, pgd, addr);
+@@ -1740,9 +1754,10 @@ static inline int remap_pud_range(struct
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+- if (remap_pmd_range(mm, pud, addr, next,
+- pfn + (addr >> PAGE_SHIFT), prot))
+- return -ENOMEM;
++ err = remap_pmd_range(mm, pud, addr, next,
++ pfn + (addr >> PAGE_SHIFT), prot);
++ if (err)
++ return err;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+ }
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -255,6 +255,42 @@ unsigned long change_protection(struct v
+ return pages;
+ }
+
++static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
++ unsigned long next, struct mm_walk *walk)
++{
++ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
++ 0 : -EACCES;
++}
++
++static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
++ unsigned long addr, unsigned long next,
++ struct mm_walk *walk)
++{
++ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
++ 0 : -EACCES;
++}
++
++static int prot_none_test(unsigned long addr, unsigned long next,
++ struct mm_walk *walk)
++{
++ return 0;
++}
++
++static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
++ unsigned long end, unsigned long newflags)
++{
++ pgprot_t new_pgprot = vm_get_page_prot(newflags);
++ struct mm_walk prot_none_walk = {
++ .pte_entry = prot_none_pte_entry,
++ .hugetlb_entry = prot_none_hugetlb_entry,
++ .test_walk = prot_none_test,
++ .mm = current->mm,
++ .private = &new_pgprot,
++ };
++
++ return walk_page_range(start, end, &prot_none_walk);
++}
++
+ int
+ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags)
+@@ -273,6 +309,19 @@ mprotect_fixup(struct vm_area_struct *vm
+ }
+
+ /*
++ * Do PROT_NONE PFN permission checks here when we can still
++ * bail out without undoing a lot of state. This is a rather
++ * uncommon case, so doesn't need to be very optimized.
++ */
++ if (arch_has_pfn_modify_check() &&
++ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
++ (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
++ error = prot_none_walk(vma, start, end, newflags);
++ if (error)
++ return error;
++ }
++
++ /*
+ * If we make a private mapping writable we increase our commit;
+ * but (without finer accounting) cannot reduce our commit if we
+ * make it unwritable again. hugetlb mapping were accounted for
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Thu, 21 Jun 2018 12:36:29 +0200
+Subject: x86/speculation/l1tf: Extend 64bit swap file size limit
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 1a7ed1ba4bba6c075d5ad61bb75e3fbc870840d6 upstream
+
+The previous patch has limited swap file size so that large offsets cannot
+clear bits above MAX_PA/2 in the pte and interfere with L1TF mitigation.
+
+It assumed that offsets are encoded starting with bit 12, same as pfn. But
+on x86_64, offsets are encoded starting with bit 9.
+
+Thus the limit can be raised by 3 bits. That means 16TB with 42bit MAX_PA
+and 256TB with 46bit MAX_PA.
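+
+(Worked out as a side note: with a 42bit MAX_PA the pfn-based limit is
+MAX_PA/2 = 2 TiB; since x86_64 swap offsets start 3 bits lower than the
+pfn (bit 9 instead of bit 12), the usable limit becomes 2 TiB << 3 = 16 TiB,
+and likewise 32 TiB << 3 = 256 TiB for a 46bit MAX_PA.)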
+
+Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/init.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -778,7 +778,15 @@ unsigned long max_swapfile_size(void)
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+- pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages);
++ unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
++ /*
++ * We encode swap offsets also with 3 bits below those for pfn
++ * which makes the usable limit higher.
++ */
++#ifdef CONFIG_X86_64
++ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
++#endif
++ pages = min_t(unsigned long, l1tf_limit, pages);
+ }
+ return pages;
+ }
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Guenter Roeck <linux@roeck-us.net>
+Date: Mon, 13 Aug 2018 10:15:16 -0700
+Subject: x86/speculation/l1tf: Fix up CPU feature flags
+
+From: Guenter Roeck <linux@roeck-us.net>
+
+In linux-4.4.y, the definition of X86_FEATURE_RETPOLINE and
+X86_FEATURE_RETPOLINE_AMD is different from the upstream
+definition. Result is an overlap with the newly introduced
+X86_FEATURE_L1TF_PTEINV. Update RETPOLINE definitions to match
+upstream definitions to improve alignment with upstream code.
+
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/cpufeatures.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -193,12 +193,12 @@
+ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+
++#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
++#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
++
+ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
+
+-#define X86_FEATURE_RETPOLINE ( 7*32+29) /* "" Generic Retpoline mitigation for Spectre variant 2 */
+-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* "" AMD Retpoline mitigation for Spectre variant 2 */
+-
+ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
+
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Michal Hocko <mhocko@suse.cz>
+Date: Wed, 27 Jun 2018 17:46:50 +0200
+Subject: x86/speculation/l1tf: Fix up pte->pfn conversion for PAE
+
+From: Michal Hocko <mhocko@suse.cz>
+
+commit e14d7dfb41f5807a0c1c26a13f2b8ef16af24935 upstream
+
+Jan has noticed that pte_pfn() and friends, resp. pfn_pte(), are incorrect
+for CONFIG_PAE because phys_addr_t is wider than unsigned long, and so the
+pte_val() resp. the left shift would get truncated. Fix this up by using
+proper types.
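+
+A minimal sketch of the truncation being fixed (illustrative values,
+assuming 32bit PAE where unsigned long is 32 bits and PAGE_SHIFT is 12):
+
+	unsigned long page_nr = 0x100000;	/* pfn of the 4 GiB boundary */
+	phys_addr_t bad = page_nr << PAGE_SHIFT;	/* shift done in 32 bits: 0 */
+	phys_addr_t good = (phys_addr_t)page_nr << PAGE_SHIFT;	/* 0x100000000 */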
+
+[dwmw2: Backport to 4.9]
+
+Fixes: 6b28baca9b1f ("x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation")
+Reported-by: Jan Beulich <JBeulich@suse.com>
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable.h | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -154,21 +154,21 @@ static inline u64 protnone_mask(u64 val)
+
+ static inline unsigned long pte_pfn(pte_t pte)
+ {
+- unsigned long pfn = pte_val(pte);
++ phys_addr_t pfn = pte_val(pte);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
+ }
+
+ static inline unsigned long pmd_pfn(pmd_t pmd)
+ {
+- unsigned long pfn = pmd_val(pmd);
++ phys_addr_t pfn = pmd_val(pmd);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+ }
+
+ static inline unsigned long pud_pfn(pud_t pud)
+ {
+- unsigned long pfn = pud_val(pud);
++ phys_addr_t pfn = pud_val(pud);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+ }
+@@ -369,7 +369,7 @@ static inline pgprotval_t massage_pgprot
+
+ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+ {
+- phys_addr_t pfn = page_nr << PAGE_SHIFT;
++ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PTE_PFN_MASK;
+ return __pte(pfn | massage_pgprot(pgprot));
+@@ -377,7 +377,7 @@ static inline pte_t pfn_pte(unsigned lon
+
+ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+ {
+- phys_addr_t pfn = page_nr << PAGE_SHIFT;
++ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PMD_PAGE_MASK;
+ return __pmd(pfn | massage_pgprot(pgprot));
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:21 -0700
+Subject: x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 50896e180c6aa3a9c61a26ced99e15d602666a4c upstream
+
+L1 Terminal Fault (L1TF) is a speculation related vulnerability. The CPU
+speculates on PTE entries which do not have the PRESENT bit set, if the
+content of the resulting physical address is available in the L1D cache.
+
+The OS side mitigation makes sure that a !PRESENT PTE entry points to a
+physical address outside the actually existing and cacheable memory
+space. This is achieved by inverting the upper bits of the PTE. Due to the
+address space limitations this only works for 64bit and 32bit PAE kernels,
+but not for 32bit non PAE.
+
+This mitigation applies to both host and guest kernels, but in case of a
+64bit host (hypervisor) and a 32bit PAE guest, inverting the upper bits of
+the PAE address space (44bit) is not enough if the host has more than 43
+bits of populated memory address space, because the speculation treats the
+PTE content as a physical host address bypassing EPT.
+
+The host (hypervisor) protects itself against the guest by flushing L1D as
+needed, but pages inside the guest are not protected against attacks from
+other processes inside the same guest.
+
+For the guest the inverted PTE mask has to match the host to provide the
+full protection for all pages the host could possibly map into the
+guest. The host's populated address space is not known to the guest, so the
+mask must cover the possible maximal host address space, i.e. 52 bit.
+
+On 32bit PAE the maximum PTE mask is currently set to 44 bit because that
+is the limit imposed by 32bit unsigned long PFNs in the VMs. This limits
+the mask to be below what the host could possibly use for physical pages.
+
+The L1TF PROT_NONE protection code uses the PTE masks to determine which
+bits to invert to make sure the higher bits are set for unmapped entries to
+prevent L1TF speculation attacks against EPT inside guests.
+
+In order to invert all bits that could be used by the host, increase
+__PHYSICAL_PAGE_SHIFT to 52 to match 64bit.
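+
+An illustrative sketch of what the wider shift buys (plain C; PAGE_SHIFT of
+12 assumed, and the helper below is not a kernel function):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define PAGE_SHIFT 12
+
+  /* pfn bits covered for a given __PHYSICAL_MASK_SHIFT */
+  static uint64_t pfn_mask(int mask_shift)
+  {
+          return ((1ULL << mask_shift) - 1) & ~((1ULL << PAGE_SHIFT) - 1);
+  }
+
+  int main(void)
+  {
+          /* bits an inverted !PRESENT PTE can set with the old/new shift */
+          printf("shift 44: %#llx\n", (unsigned long long)pfn_mask(44));
+          printf("shift 52: %#llx\n", (unsigned long long)pfn_mask(52));
+          return 0;
+  }
+
+With 44 the inversion tops out at bit 43, so hosts with more populated
+physical address bits would still be reachable; with 52 the inverted entry
+covers the maximal host address space.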
+
+The real limit for a 32bit PAE kernel is still 44 bits because all Linux
+PTEs are created from unsigned long PFNs, so they cannot be higher than 44
+bits on a 32bit kernel. So these extra PFN bits should never be set. The
+only users of this macro are using it to look at PTEs, so it's safe.
+
+[ tglx: Massaged changelog ]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/page_32_types.h | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/page_32_types.h
++++ b/arch/x86/include/asm/page_32_types.h
+@@ -27,8 +27,13 @@
+ #define N_EXCEPTION_STACKS 1
+
+ #ifdef CONFIG_X86_PAE
+-/* 44=32+12, the limit we can fit into an unsigned long pfn */
+-#define __PHYSICAL_MASK_SHIFT 44
++/*
++ * This is beyond the 44 bit limit imposed by the 32bit long pfns,
++ * but we need the full mask to make sure inverted PROT_NONE
++ * entries have all the host bits set in a guest.
++ * The real limit is still 44 bits.
++ */
++#define __PHYSICAL_MASK_SHIFT 52
+ #define __VIRTUAL_MASK_SHIFT 32
+
+ #else /* !CONFIG_X86_PAE */
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Tue, 7 Aug 2018 15:09:36 -0700
+Subject: x86/speculation/l1tf: Invert all not present mappings
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit f22cc87f6c1f771b57c407555cfefd811cdd9507 upstream
+
+For kernel mappings PAGE_PROTNONE is not necessarily set for a non present
+mapping, but the inversion logic explicitly checks for !PRESENT and
+PROT_NONE.
+
+Remove the PROT_NONE check and make the inversion unconditional for all not
+present mappings.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable-invert.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/pgtable-invert.h
++++ b/arch/x86/include/asm/pgtable-invert.h
+@@ -6,7 +6,7 @@
+
+ static inline bool __pte_needs_invert(u64 val)
+ {
+- return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE;
++ return !(val & _PAGE_PRESENT);
+ }
+
+ /* Get a mask to xor with the page table entry to get the correct pfn. */
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:28 -0700
+Subject: x86/speculation/l1tf: Limit swap file size to MAX_PA/2
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 377eeaa8e11fe815b1d07c81c4a0e2843a8c15eb upstream
+
+For the L1TF workaround it's necessary to limit the swap file size to below
+MAX_PA/2, so that the inverted higher bits of the swap offset never point
+to valid memory.
+
+Add a mechanism for the architecture to override the swap file size check
+in swapfile.c and add an x86-specific max swapfile check function that
+enforces that limit.
+
+The check is only enabled if the CPU is vulnerable to L1TF.
+
+In VMs with 42bit MAX_PA the typical limit is 2TB now; on a native system
+with 46bit PA it is 32TB. The limit is only per individual swap file, so
+it's always possible to exceed these limits with multiple swap files or
+partitions.
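+
+A back-of-the-envelope check of those numbers (user-space C; PAGE_SHIFT of
+12 assumed, and l1tf_pfn_limit() + 1 from the hunk below is assumed to
+equal MAX_PA/2 expressed in pages):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define PAGE_SHIFT 12
+
+  static unsigned long long max_swap_bytes(int max_pa_bits)
+  {
+          /* MAX_PA/2 expressed in pages, then converted back to bytes */
+          unsigned long long pages = 1ULL << (max_pa_bits - 1 - PAGE_SHIFT);
+          return pages << PAGE_SHIFT;
+  }
+
+  int main(void)
+  {
+          printf("42bit MAX_PA: %llu TB\n", max_swap_bytes(42) >> 40); /* 2  */
+          printf("46bit MAX_PA: %llu TB\n", max_swap_bytes(46) >> 40); /* 32 */
+          return 0;
+  }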
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/init.c | 15 +++++++++++++++
+ include/linux/swapfile.h | 2 ++
+ mm/swapfile.c | 46 ++++++++++++++++++++++++++++++----------------
+ 3 files changed, 47 insertions(+), 16 deletions(-)
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -4,6 +4,8 @@
+ #include <linux/swap.h>
+ #include <linux/memblock.h>
+ #include <linux/bootmem.h> /* for max_low_pfn */
++#include <linux/swapfile.h>
++#include <linux/swapops.h>
+
+ #include <asm/cacheflush.h>
+ #include <asm/e820.h>
+@@ -767,3 +769,16 @@ void update_cache_mode_entry(unsigned en
+ __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
+ __pte2cachemode_tbl[entry] = cache;
+ }
++
++unsigned long max_swapfile_size(void)
++{
++ unsigned long pages;
++
++ pages = generic_max_swapfile_size();
++
++ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
++ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
++ pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages);
++ }
++ return pages;
++}
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -9,5 +9,7 @@ extern spinlock_t swap_lock;
+ extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
++extern unsigned long generic_max_swapfile_size(void);
++extern unsigned long max_swapfile_size(void);
+
+ #endif /* _LINUX_SWAPFILE_H */
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -2206,6 +2206,35 @@ static int claim_swapfile(struct swap_in
+ return 0;
+ }
+
++
++/*
++ * Find out how many pages are allowed for a single swap device. There
++ * are two limiting factors:
++ * 1) the number of bits for the swap offset in the swp_entry_t type, and
++ * 2) the number of bits in the swap pte, as defined by the different
++ * architectures.
++ *
++ * In order to find the largest possible bit mask, a swap entry with
++ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
++ * decoded to a swp_entry_t again, and finally the swap offset is
++ * extracted.
++ *
++ * This will mask all the bits from the initial ~0UL mask that can't
++ * be encoded in either the swp_entry_t or the architecture definition
++ * of a swap pte.
++ */
++unsigned long generic_max_swapfile_size(void)
++{
++ return swp_offset(pte_to_swp_entry(
++ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
++}
++
++/* Can be overridden by an architecture for additional checks. */
++__weak unsigned long max_swapfile_size(void)
++{
++ return generic_max_swapfile_size();
++}
++
+ static unsigned long read_swap_header(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ struct inode *inode)
+@@ -2241,22 +2270,7 @@ static unsigned long read_swap_header(st
+ p->cluster_next = 1;
+ p->cluster_nr = 0;
+
+- /*
+- * Find out how many pages are allowed for a single swap
+- * device. There are two limiting factors: 1) the number
+- * of bits for the swap offset in the swp_entry_t type, and
+- * 2) the number of bits in the swap pte as defined by the
+- * different architectures. In order to find the
+- * largest possible bit mask, a swap entry with swap type 0
+- * and swap offset ~0UL is created, encoded to a swap pte,
+- * decoded to a swp_entry_t again, and finally the swap
+- * offset is extracted. This will mask all the bits from
+- * the initial ~0UL mask that can't be encoded in either
+- * the swp_entry_t or the architecture definition of a
+- * swap pte.
+- */
+- maxpages = swp_offset(pte_to_swp_entry(
+- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
++ maxpages = max_swapfile_size();
+ last_page = swap_header->info.last_page;
+ if (!last_page) {
+ pr_warn("Empty swap-file\n");
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Tue, 7 Aug 2018 15:09:37 -0700
+Subject: x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 0768f91530ff46683e0b372df14fd79fe8d156e5 upstream
+
+Some cases in THP like:
+ - MADV_FREE
+ - mprotect
+ - split
+
+mark the PMD temporarily non present to prevent races. The window for
+an L1TF attack in these contexts is very small, but it wants to be fixed
+for correctness' sake.
+
+Use the proper low level functions for pmd/pud_mknotpresent() to address
+this.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable.h | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -315,11 +315,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pm
+ return pmd_set_flags(pmd, _PAGE_RW);
+ }
+
+-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+-{
+- return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
+-}
+-
+ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+ static inline int pte_soft_dirty(pte_t pte)
+ {
+@@ -383,6 +378,12 @@ static inline pmd_t pfn_pmd(unsigned lon
+ return __pmd(pfn | massage_pgprot(pgprot));
+ }
+
++static inline pmd_t pmd_mknotpresent(pmd_t pmd)
++{
++ return pfn_pmd(pmd_pfn(pmd),
++ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
++}
++
+ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
+ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:25 -0700
+Subject: x86/speculation/l1tf: Make sure the first page is always reserved
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 10a70416e1f067f6c4efda6ffd8ea96002ac4223 upstream
+
+The L1TF workaround doesn't make any attempt to mitigate speculative accesses
+to the first physical page for zeroed PTEs. Normally it only contains some
+data from the early real mode BIOS.
+
+It's not entirely clear that the first page is reserved in all
+configurations, so add an extra reservation call to make sure it is really
+reserved. In most configurations (e.g. with the standard reservations)
+it's likely a nop.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/setup.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -851,6 +851,12 @@ void __init setup_arch(char **cmdline_p)
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
++ /*
++ * Make sure page 0 is always reserved because on systems with
++ * L1TF its contents can be leaked to user processes.
++ */
++ memblock_reserve(0, PAGE_SIZE);
++
+ early_reserve_initrd();
+
+ /*
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Fri, 22 Jun 2018 17:39:33 +0200
+Subject: x86/speculation/l1tf: Protect PAE swap entries against L1TF
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 0d0f6249058834ffe1ceaad0bb31464af66f6e7a upstream
+
+The PAE 3-level paging code currently doesn't mitigate L1TF by flipping the
+offset bits, and uses the high PTE word, thus bits 32-36 for type, 37-63 for
+offset. The lower word is zeroed, thus systems with less than 4GB memory are
+safe. With 4GB to 128GB the swap type selects the memory locations vulnerable
+to L1TF; with even more memory, the swap offset also influences the address.
+This might be a problem with 32bit PAE guests running on large 64bit hosts.
+
+By continuing to keep the whole swap entry in either high or low 32bit word of
+PTE we would limit the swap size too much. Thus this patch uses the whole PAE
+PTE with the same layout as the 64bit version does. The macros just become a
+bit tricky since they assume the arch-dependent swp_entry_t to be 32bit.
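+
+A user-space sketch of the resulting encode/decode chain (SWP_TYPE_BITS is 5
+as in the hunk below; SWP_OFFSET_SHIFT of 14 assumes _PAGE_BIT_PROTNONE is
+bit 8, i.e. SWP_OFFSET_FIRST_BIT 9):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define SWP_TYPE_BITS      5
+  #define SWP_OFFSET_SHIFT   14
+
+  int main(void)
+  {
+          unsigned int type = 1, offset = 100;
+
+          /* 32bit arch-dependent intermediate, as __swp_entry() builds it */
+          uint32_t swp = type | (offset << SWP_TYPE_BITS);
+
+          /* full 64bit pteval, as __swp_pteval_entry() builds it */
+          uint64_t pte = (~(uint64_t)(swp >> SWP_TYPE_BITS)
+                          << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS)
+                         | ((uint64_t)(swp & 0x1f) << (64 - SWP_TYPE_BITS));
+
+          printf("pte         = %#llx\n", (unsigned long long)pte);
+          printf("type back   = %u\n",
+                 (unsigned int)(pte >> (64 - SWP_TYPE_BITS)));
+          printf("offset back = %llu\n",
+                 (unsigned long long)(~pte << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT));
+          return 0;
+  }
+
+The intermediate value still fits the 32bit swp_entry_t, while the inverted
+offset sets the high PTE bits and keeps the entry away from real memory.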
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable-3level.h | 35 ++++++++++++++++++++++++++++++++--
+ arch/x86/mm/init.c | 2 -
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable-3level.h
++++ b/arch/x86/include/asm/pgtable-3level.h
+@@ -177,12 +177,43 @@ static inline pmd_t native_pmdp_get_and_
+ #endif
+
+ /* Encode and de-code a swap entry */
++#define SWP_TYPE_BITS 5
++
++#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
++
++/* We always extract/encode the offset by shifting it all the way up, and then down again */
++#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
++
+ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+ #define __swp_type(x) (((x).val) & 0x1f)
+ #define __swp_offset(x) ((x).val >> 5)
+ #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+-#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+-#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
++
++/*
++ * Normally, __swp_entry() converts from arch-independent swp_entry_t to
++ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
++ * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
++ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
++ * __swp_entry_to_pte() through the following helper macro based on 64bit
++ * __swp_entry().
++ */
++#define __swp_pteval_entry(type, offset) ((pteval_t) { \
++ (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
++ | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
++
++#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
++ __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
++/*
++ * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
++ * swp_entry_t, but also has to convert it from 64bit to the 32bit
++ * intermediate representation, using the following macros based on 64bit
++ * __swp_type() and __swp_offset().
++ */
++#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
++#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
++
++#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
++ __pteval_swp_offset(pte)))
+
+ #include <asm/pgtable-invert.h>
+
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -783,7 +783,7 @@ unsigned long max_swapfile_size(void)
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+-#ifdef CONFIG_X86_64
++#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+ #endif
+ pages = min_t(unsigned long, l1tf_limit, pages);
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:24 -0700
+Subject: x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 6b28baca9b1f0d4a42b865da7a05b1c81424bd5c upstream
+
+When PTEs are set to PROT_NONE the kernel just clears the Present bit and
+preserves the PFN, which creates attack surface for L1TF speculation
+attacks.
+
+This is important inside guests, because L1TF speculation bypasses physical
+page remapping. While the host has its own mitigations preventing leaking
+data from other VMs into the guest, this would still risk leaking the wrong
+page inside the current guest.
+
+This uses the same technique as Linus' swap entry patch: while an entry
+is in PROTNONE state, invert the complete PFN part of it. This ensures
+that the highest bit will point to non-existing memory.
+
+The invert is done by pte/pmd_modify and pfn/pmd/pud_pte for PROTNONE and
+pte/pmd/pud_pfn undo it.
+
+This assumes that no code path touches the PFN part of a PTE directly
+without using these primitives.
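+
+A condensed user-space sketch of that round trip; the flag and mask values
+below are the usual 64bit x86 ones and are assumed here rather than taken
+from this patch:
+
+  #include <stdbool.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define _PAGE_PRESENT   0x001ULL
+  #define _PAGE_PROTNONE  0x100ULL                /* bit 8 (Global) reused */
+  #define PTE_PFN_MASK    0x000ffffffffff000ULL   /* 52bit MAX_PA */
+
+  static bool pte_needs_invert(uint64_t val)
+  {
+          return (val & (_PAGE_PRESENT | _PAGE_PROTNONE)) == _PAGE_PROTNONE;
+  }
+
+  int main(void)
+  {
+          uint64_t pte = (0x1234ULL << 12) | _PAGE_PRESENT;
+
+          /* PROT_NONE: clear Present, set PROTNONE, invert the pfn bits */
+          uint64_t none = ((pte & ~PTE_PFN_MASK) & ~_PAGE_PRESENT)
+                          | _PAGE_PROTNONE
+                          | (~(pte & PTE_PFN_MASK) & PTE_PFN_MASK);
+
+          /* pte_pfn() xors the inversion away before extracting the pfn */
+          uint64_t mask = pte_needs_invert(none) ? ~0ULL : 0;
+          uint64_t pfn = ((none ^ mask) & PTE_PFN_MASK) >> 12;
+
+          printf("stored pfn bits %#llx, recovered pfn %#llx\n",
+                 (unsigned long long)(none & PTE_PFN_MASK),
+                 (unsigned long long)pfn);         /* recovered pfn 0x1234 */
+          return 0;
+  }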
+
+This doesn't handle the case that MMIO is on the top of the CPU physical
+memory. If such an MMIO region was exposed by an unprivileged driver for
+mmap it would be possible to attack some real memory. However this
+situation is all rather unlikely.
+
+For 32bit non PAE the inversion is not done because there are really not
+enough bits to protect anything.
+
+Q: Why does the guest need to be protected when the HyperVisor already has
+ L1TF mitigations?
+
+A: Here's an example:
+
+ Physical pages 1 2 get mapped into a guest as
+ GPA 1 -> PA 2
+ GPA 2 -> PA 1
+ through EPT.
+
+ The L1TF speculation ignores the EPT remapping.
+
+ Now the guest kernel maps GPA 1 to process A and GPA 2 to process B, and
+ they belong to different users and should be isolated.
+
+ A sets the GPA 1 PA 2 PTE to PROT_NONE to bypass the EPT remapping and
+ gets read access to the underlying physical page. Which in this case
+ points to PA 2, so it can read process B's data, if it happened to be in
+ L1, so isolation inside the guest is broken.
+
+ There's nothing the hypervisor can do about this. This mitigation has to
+ be done in the guest itself.
+
+[ tglx: Massaged changelog ]
+[ dwmw2: backported to 4.9 ]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable-2level.h | 17 +++++++++++++++
+ arch/x86/include/asm/pgtable-3level.h | 2 +
+ arch/x86/include/asm/pgtable-invert.h | 32 ++++++++++++++++++++++++++++
+ arch/x86/include/asm/pgtable.h | 38 ++++++++++++++++++++++++----------
+ arch/x86/include/asm/pgtable_64.h | 2 +
+ 5 files changed, 80 insertions(+), 11 deletions(-)
+ create mode 100644 arch/x86/include/asm/pgtable-invert.h
+
+--- a/arch/x86/include/asm/pgtable-2level.h
++++ b/arch/x86/include/asm/pgtable-2level.h
+@@ -77,4 +77,21 @@ static inline unsigned long pte_bitop(un
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
+ #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
++/* No inverted PFNs on 2 level page tables */
++
++static inline u64 protnone_mask(u64 val)
++{
++ return 0;
++}
++
++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
++{
++ return val;
++}
++
++static inline bool __pte_needs_invert(u64 val)
++{
++ return false;
++}
++
+ #endif /* _ASM_X86_PGTABLE_2LEVEL_H */
+--- a/arch/x86/include/asm/pgtable-3level.h
++++ b/arch/x86/include/asm/pgtable-3level.h
+@@ -184,4 +184,6 @@ static inline pmd_t native_pmdp_get_and_
+ #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+ #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
++#include <asm/pgtable-invert.h>
++
+ #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
+--- /dev/null
++++ b/arch/x86/include/asm/pgtable-invert.h
+@@ -0,0 +1,32 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _ASM_PGTABLE_INVERT_H
++#define _ASM_PGTABLE_INVERT_H 1
++
++#ifndef __ASSEMBLY__
++
++static inline bool __pte_needs_invert(u64 val)
++{
++ return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE;
++}
++
++/* Get a mask to xor with the page table entry to get the correct pfn. */
++static inline u64 protnone_mask(u64 val)
++{
++ return __pte_needs_invert(val) ? ~0ull : 0;
++}
++
++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
++{
++ /*
++ * When a PTE transitions from NONE to !NONE or vice-versa
++ * invert the PFN part to stop speculation.
++ * pte_pfn undoes this when needed.
++ */
++ if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
++ val = (val & ~mask) | (~val & mask);
++ return val;
++}
++
++#endif /* __ASSEMBLY__ */
++
++#endif
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -148,19 +148,29 @@ static inline int pte_special(pte_t pte)
+ return pte_flags(pte) & _PAGE_SPECIAL;
+ }
+
++/* Entries that were set to PROT_NONE are inverted */
++
++static inline u64 protnone_mask(u64 val);
++
+ static inline unsigned long pte_pfn(pte_t pte)
+ {
+- return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
++ unsigned long pfn = pte_val(pte);
++ pfn ^= protnone_mask(pfn);
++ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
+ }
+
+ static inline unsigned long pmd_pfn(pmd_t pmd)
+ {
+- return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
++ unsigned long pfn = pmd_val(pmd);
++ pfn ^= protnone_mask(pfn);
++ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+ }
+
+ static inline unsigned long pud_pfn(pud_t pud)
+ {
+- return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
++ unsigned long pfn = pud_val(pud);
++ pfn ^= protnone_mask(pfn);
++ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+ }
+
+ #define pte_page(pte) pfn_to_page(pte_pfn(pte))
+@@ -359,19 +369,25 @@ static inline pgprotval_t massage_pgprot
+
+ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+ {
+- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+- massage_pgprot(pgprot));
++ phys_addr_t pfn = page_nr << PAGE_SHIFT;
++ pfn ^= protnone_mask(pgprot_val(pgprot));
++ pfn &= PTE_PFN_MASK;
++ return __pte(pfn | massage_pgprot(pgprot));
+ }
+
+ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+ {
+- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+- massage_pgprot(pgprot));
++ phys_addr_t pfn = page_nr << PAGE_SHIFT;
++ pfn ^= protnone_mask(pgprot_val(pgprot));
++ pfn &= PHYSICAL_PMD_PAGE_MASK;
++ return __pmd(pfn | massage_pgprot(pgprot));
+ }
+
++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
++
+ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+ {
+- pteval_t val = pte_val(pte);
++ pteval_t val = pte_val(pte), oldval = val;
+
+ /*
+ * Chop off the NX bit (if present), and add the NX portion of
+@@ -379,17 +395,17 @@ static inline pte_t pte_modify(pte_t pte
+ */
+ val &= _PAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
+-
++ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
+ return __pte(val);
+ }
+
+ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+ {
+- pmdval_t val = pmd_val(pmd);
++ pmdval_t val = pmd_val(pmd), oldval = val;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+-
++ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
+ return __pmd(val);
+ }
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -235,6 +235,8 @@ extern void cleanup_highmap(void);
+ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+ extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
++#include <asm/pgtable-invert.h>
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif /* _ASM_X86_PGTABLE_64_H */
--- /dev/null
+From foo@baz Tue Aug 14 17:08:55 CEST 2018
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 13 Jun 2018 15:48:23 -0700
+Subject: x86/speculation/l1tf: Protect swap entries against L1TF
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2f22b4cd45b67b3496f4aa4c7180a1271c6452f6 upstream
+
+With L1 terminal fault the CPU speculates into unmapped PTEs, and the
+resulting side effects allow reading the memory the PTE is pointing to,
+if its contents are still in the L1 cache.
+
+For swapped out pages Linux uses unmapped PTEs and stores a swap entry into
+them.
+
+To protect against L1TF it must be ensured that the swap entry is not
+pointing to valid memory, which requires setting higher bits (between bit
+36 and bit 45) that are inside the CPUs physical address space, but outside
+any real memory.
+
+To do this invert the offset to make sure the higher bits are always set,
+as long as the swap file is not too big.
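+
+A small sketch of the effect, mirroring the inverted __swp_entry() and
+__swp_offset() in the hunk below (SWP_OFFSET_SHIFT of 14 is assumed, i.e.
+the offset living in bits 9-58):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define SWP_TYPE_BITS      5
+  #define SWP_OFFSET_SHIFT   14
+
+  static uint64_t swp_entry(unsigned int type, uint64_t offset)
+  {
+          return (~offset << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) |
+                 ((uint64_t)type << (64 - SWP_TYPE_BITS));
+  }
+
+  int main(void)
+  {
+          uint64_t val = swp_entry(1, 100);  /* slot 100 on swap device 1 */
+
+          /* bits 9-58 hold ~100, so the high physical bits are all set */
+          printf("swp pte val = %#llx\n", (unsigned long long)val);
+          /* decoding inverts again and recovers the offset */
+          printf("offset back = %llu\n",
+                 (unsigned long long)(~val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT));
+          return 0;
+  }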
+
+Note there is no workaround for 32bit !PAE, or on systems which have more
+than MAX_PA/2 worth of memory. The latter case is very unlikely to happen on
+real systems.
+
+[AK: updated description and minor tweaks. Split out from the original
+ patch ]
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Andi Kleen <ak@linux.intel.com>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/pgtable_64.h | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+- * | TYPE (59-63) | OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
++ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+@@ -181,6 +181,9 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
++ *
++ * The offset is inverted by a binary not operation to make the high
++ * physical bits set.
+ */
+ #define SWP_TYPE_BITS 5
+
+@@ -195,13 +198,15 @@ static inline int pgd_large(pgd_t pgd) {
+ #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+ /* Shift up (to get rid of type), then down to get value */
+-#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
++#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+ /*
+ * Shift the offset up "too far" by TYPE bits, then down again
++ * The offset is inverted by a binary not operation to make the high
++ * physical bits set.
+ */
+ #define __swp_entry(type, offset) ((swp_entry_t) { \
+- ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
++ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })