--- /dev/null
+From 4f89fa286f6729312e227e7c2d764e8e7b9d340e Mon Sep 17 00:00:00 2001
+From: James Morse <james.morse@arm.com>
+Date: Mon, 6 Nov 2017 18:44:24 +0000
+Subject: ACPI / APEI: Replace ioremap_page_range() with fixmap
+
+From: James Morse <james.morse@arm.com>
+
+commit 4f89fa286f6729312e227e7c2d764e8e7b9d340e upstream.
+
+Replace ghes_io{re,un}map_pfn_{nmi,irq}()'s use of ioremap_page_range()
+with __set_fixmap() as ioremap_page_range() may sleep to allocate a new
+level of page-table, even if it is passed an existing final-address to
+use in the mapping.
+
+The GHES driver can only be enabled for architectures that select
+HAVE_ACPI_APEI: Add fixmap entries to both x86 and arm64.
+
+clear_fixmap() does the TLB invalidation in __set_fixmap() for arm64
+and __set_pte_vaddr() for x86. In each case it's the same as the
+respective arch_apei_flush_tlb_one().
+
+Reported-by: Fengguang Wu <fengguang.wu@intel.com>
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: James Morse <james.morse@arm.com>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Tested-by: Tyler Baicar <tbaicar@codeaurora.org>
+Tested-by: Toshi Kani <toshi.kani@hpe.com>
+[ For the arm64 bits: ]
+Acked-by: Will Deacon <will.deacon@arm.com>
+[ For the x86 bits: ]
+Acked-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: All applicable <stable@vger.kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/include/asm/fixmap.h | 7 ++++++
+ arch/x86/include/asm/fixmap.h | 6 +++++
+ drivers/acpi/apei/ghes.c | 44 ++++++++++++----------------------------
+ 3 files changed, 27 insertions(+), 30 deletions(-)
+
+--- a/arch/arm64/include/asm/fixmap.h
++++ b/arch/arm64/include/asm/fixmap.h
+@@ -51,6 +51,13 @@ enum fixed_addresses {
+
+ FIX_EARLYCON_MEM_BASE,
+ FIX_TEXT_POKE0,
++
++#ifdef CONFIG_ACPI_APEI_GHES
++ /* Used for GHES mapping from assorted contexts */
++ FIX_APEI_GHES_IRQ,
++ FIX_APEI_GHES_NMI,
++#endif /* CONFIG_ACPI_APEI_GHES */
++
+ __end_of_permanent_fixed_addresses,
+
+ /*
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -104,6 +104,12 @@ enum fixed_addresses {
+ FIX_GDT_REMAP_BEGIN,
+ FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
++#ifdef CONFIG_ACPI_APEI_GHES
++ /* Used for GHES mapping from assorted contexts */
++ FIX_APEI_GHES_IRQ,
++ FIX_APEI_GHES_NMI,
++#endif
++
+ __end_of_permanent_fixed_addresses,
+
+ /*
+--- a/drivers/acpi/apei/ghes.c
++++ b/drivers/acpi/apei/ghes.c
+@@ -51,6 +51,7 @@
+ #include <acpi/actbl1.h>
+ #include <acpi/ghes.h>
+ #include <acpi/apei.h>
++#include <asm/fixmap.h>
+ #include <asm/tlbflush.h>
+ #include <ras/ras_event.h>
+
+@@ -112,7 +113,7 @@ static DEFINE_MUTEX(ghes_list_mutex);
+ * Because the memory area used to transfer hardware error information
+ * from BIOS to Linux can be determined only in NMI, IRQ or timer
+ * handler, but general ioremap can not be used in atomic context, so
+- * a special version of atomic ioremap is implemented for that.
++ * the fixmap is used instead.
+ */
+
+ /*
+@@ -126,8 +127,8 @@ static DEFINE_MUTEX(ghes_list_mutex);
+ /* virtual memory area for atomic ioremap */
+ static struct vm_struct *ghes_ioremap_area;
+ /*
+- * These 2 spinlock is used to prevent atomic ioremap virtual memory
+- * area from being mapped simultaneously.
++ * These 2 spinlocks are used to prevent the fixmap entries from being used
++ * simultaneously.
+ */
+ static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
+ static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+@@ -159,53 +160,36 @@ static void ghes_ioremap_exit(void)
+
+ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
+ {
+- unsigned long vaddr;
+ phys_addr_t paddr;
+ pgprot_t prot;
+
+- vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
+-
+ paddr = pfn << PAGE_SHIFT;
+ prot = arch_apei_get_mem_attribute(paddr);
+- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot);
++ __set_fixmap(FIX_APEI_GHES_NMI, paddr, prot);
+
+- return (void __iomem *)vaddr;
++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_NMI);
+ }
+
+ static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
+ {
+- unsigned long vaddr;
+ phys_addr_t paddr;
+ pgprot_t prot;
+
+- vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
+-
+ paddr = pfn << PAGE_SHIFT;
+ prot = arch_apei_get_mem_attribute(paddr);
++ __set_fixmap(FIX_APEI_GHES_IRQ, paddr, prot);
+
+- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot);
+-
+- return (void __iomem *)vaddr;
++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_IRQ);
+ }
+
+-static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
++static void ghes_iounmap_nmi(void)
+ {
+- unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+- void *base = ghes_ioremap_area->addr;
+-
+- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
+- unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+- arch_apei_flush_tlb_one(vaddr);
++ clear_fixmap(FIX_APEI_GHES_NMI);
+ }
+
+-static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
++static void ghes_iounmap_irq(void)
+ {
+- unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+- void *base = ghes_ioremap_area->addr;
+-
+- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
+- unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+- arch_apei_flush_tlb_one(vaddr);
++ clear_fixmap(FIX_APEI_GHES_IRQ);
+ }
+
+ static int ghes_estatus_pool_init(void)
+@@ -361,10 +345,10 @@ static void ghes_copy_tofrom_phys(void *
+ paddr += trunk;
+ buffer += trunk;
+ if (in_nmi) {
+- ghes_iounmap_nmi(vaddr);
++ ghes_iounmap_nmi();
+ raw_spin_unlock(&ghes_ioremap_lock_nmi);
+ } else {
+- ghes_iounmap_irq(vaddr);
++ ghes_iounmap_irq();
+ spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
+ }
+ }
--- /dev/null
+From 1943dc07b45e347c52c1bfdd4a37e04a86e399aa Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 2 Nov 2017 13:30:03 +0100
+Subject: bitops: Revert cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h")
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1943dc07b45e347c52c1bfdd4a37e04a86e399aa upstream.
+
+These ops are not endian safe and may break on architectures which have
+alignment requirements.
+
+Reverts: cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h")
+Reported-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/bitops.h | 26 --------------------------
+ 1 file changed, 26 deletions(-)
+
+--- a/include/linux/bitops.h
++++ b/include/linux/bitops.h
+@@ -228,32 +228,6 @@ static inline unsigned long __ffs64(u64
+ return __ffs((unsigned long)word);
+ }
+
+-/*
+- * clear_bit32 - Clear a bit in memory for u32 array
+- * @nr: Bit to clear
+- * @addr: u32 * address of bitmap
+- *
+- * Same as clear_bit, but avoids needing casts for u32 arrays.
+- */
+-
+-static __always_inline void clear_bit32(long nr, volatile u32 *addr)
+-{
+- clear_bit(nr, (volatile unsigned long *)addr);
+-}
+-
+-/*
+- * set_bit32 - Set a bit in memory for u32 array
+- * @nr: Bit to clear
+- * @addr: u32 * address of bitmap
+- *
+- * Same as set_bit, but avoids needing casts for u32 arrays.
+- */
+-
+-static __always_inline void set_bit32(long nr, volatile u32 *addr)
+-{
+- set_bit(nr, (volatile unsigned long *)addr);
+-}
+-
+ #ifdef __KERNEL__
+
+ #ifndef set_mask_bits
--- /dev/null
+From ab95477e7cb35557ecfc837687007b646bab9a9f Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Tue, 12 Dec 2017 02:25:31 +0100
+Subject: bpf: fix build issues on um due to mising bpf_perf_event.h
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit ab95477e7cb35557ecfc837687007b646bab9a9f upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+Since c895f6f703ad ("bpf: correct broken uapi for
+BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build
+on i386 or x86_64:
+
+ [...]
+ CC init/main.o
+ In file included from ../include/linux/perf_event.h:18:0,
+ from ../include/linux/trace_events.h:10,
+ from ../include/trace/syscall.h:7,
+ from ../include/linux/syscalls.h:82,
+ from ../init/main.c:20:
+ ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error:
+ asm/bpf_perf_event.h: No such file or directory #include
+ <asm/bpf_perf_event.h>
+ [...]
+
+Let's add missing bpf_perf_event.h also to um arch. This seems
+to be the only one still missing.
+
+Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type")
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Suggested-by: Richard Weinberger <richard@sigma-star.at>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Tested-by: Randy Dunlap <rdunlap@infradead.org>
+Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+Cc: Richard Weinberger <richard@sigma-star.at>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/um/include/asm/Kbuild | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/um/include/asm/Kbuild
++++ b/arch/um/include/asm/Kbuild
+@@ -1,4 +1,5 @@
+ generic-y += barrier.h
++generic-y += bpf_perf_event.h
+ generic-y += bug.h
+ generic-y += clkdev.h
+ generic-y += current.h
--- /dev/null
+From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001
+From: Ingo Molnar <mingo@kernel.org>
+Date: Tue, 5 Dec 2017 14:14:47 +0100
+Subject: drivers/misc/intel/pti: Rename the header file to free up the namespace
+
+From: Ingo Molnar <mingo@kernel.org>
+
+commit 1784f9144b143a1e8b19fe94083b040aa559182b upstream.
+
+We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the
+namespace by renaming the <linux/pti.h> driver header to <linux/intel-pti.h>.
+
+(Also standardize the header guard name while at it.)
+
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: J Freyensee <james_p_freyensee@linux.intel.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/misc/pti.c | 2 +-
+ include/linux/intel-pti.h | 43 +++++++++++++++++++++++++++++++++++++++++++
+ include/linux/pti.h | 43 -------------------------------------------
+ 3 files changed, 44 insertions(+), 44 deletions(-)
+
+--- a/drivers/misc/pti.c
++++ b/drivers/misc/pti.c
+@@ -32,7 +32,7 @@
+ #include <linux/pci.h>
+ #include <linux/mutex.h>
+ #include <linux/miscdevice.h>
+-#include <linux/pti.h>
++#include <linux/intel-pti.h>
+ #include <linux/slab.h>
+ #include <linux/uaccess.h>
+
+--- /dev/null
++++ b/include/linux/intel-pti.h
+@@ -0,0 +1,43 @@
++/*
++ * Copyright (C) Intel 2011
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * The PTI (Parallel Trace Interface) driver directs trace data routed from
++ * various parts in the system out through the Intel Penwell PTI port and
++ * out of the mobile device for analysis with a debugging tool
++ * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
++ * compact JTAG, standard.
++ *
++ * This header file will allow other parts of the OS to use the
++ * interface to write out it's contents for debugging a mobile system.
++ */
++
++#ifndef LINUX_INTEL_PTI_H_
++#define LINUX_INTEL_PTI_H_
++
++/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
++#define PTI_LASTDWORD_DTS 0x30
++
++/* basic structure used as a write address to the PTI HW */
++struct pti_masterchannel {
++ u8 master;
++ u8 channel;
++};
++
++/* the following functions are defined in misc/pti.c */
++void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
++struct pti_masterchannel *pti_request_masterchannel(u8 type,
++ const char *thread_name);
++void pti_release_masterchannel(struct pti_masterchannel *mc);
++
++#endif /* LINUX_INTEL_PTI_H_ */
+--- a/include/linux/pti.h
++++ /dev/null
+@@ -1,43 +0,0 @@
+-/*
+- * Copyright (C) Intel 2011
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- * This program is distributed in the hope that it will be useful,
+- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- * GNU General Public License for more details.
+- *
+- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- *
+- * The PTI (Parallel Trace Interface) driver directs trace data routed from
+- * various parts in the system out through the Intel Penwell PTI port and
+- * out of the mobile device for analysis with a debugging tool
+- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
+- * compact JTAG, standard.
+- *
+- * This header file will allow other parts of the OS to use the
+- * interface to write out it's contents for debugging a mobile system.
+- */
+-
+-#ifndef PTI_H_
+-#define PTI_H_
+-
+-/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
+-#define PTI_LASTDWORD_DTS 0x30
+-
+-/* basic structure used as a write address to the PTI HW */
+-struct pti_masterchannel {
+- u8 master;
+- u8 channel;
+-};
+-
+-/* the following functions are defined in misc/pti.c */
+-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
+-struct pti_masterchannel *pti_request_masterchannel(u8 type,
+- const char *thread_name);
+-void pti_release_masterchannel(struct pti_masterchannel *mc);
+-
+-#endif /*PTI_H_*/
--- /dev/null
+From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Tue, 24 Oct 2017 11:22:47 +0100
+Subject: locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit c2bc66082e1048c7573d72e62f597bdc5ce13fea upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+In preparation for the removal of lockless_dereference(), which is the
+same as READ_ONCE() on all architectures other than Alpha, add an
+implicit smp_read_barrier_depends() to READ_ONCE() so that it can be
+used to head dependency chains on all architectures.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/compiler.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/linux/compiler.h
++++ b/include/linux/compiler.h
+@@ -341,6 +341,7 @@ static __always_inline void __write_once
+ __read_once_size(&(x), __u.__c, sizeof(x)); \
+ else \
+ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \
++ smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
+ __u.__val; \
+ })
+ #define READ_ONCE(x) __READ_ONCE(x, 1)
--- /dev/null
+From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Tue, 24 Oct 2017 11:22:48 +0100
+Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE()
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 3382290ed2d5e275429cef510ab21889d3ccd164 upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it
+can be used instead of lockless_dereference() without any change in
+semantics.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/events/core.c | 2 +-
+ arch/x86/include/asm/mmu_context.h | 4 ++--
+ arch/x86/kernel/ldt.c | 2 +-
+ drivers/md/dm-mpath.c | 20 ++++++++++----------
+ fs/dcache.c | 4 ++--
+ fs/overlayfs/ovl_entry.h | 2 +-
+ fs/overlayfs/readdir.c | 2 +-
+ include/linux/rculist.h | 4 ++--
+ include/linux/rcupdate.h | 4 ++--
+ kernel/events/core.c | 4 ++--
+ kernel/seccomp.c | 2 +-
+ kernel/task_work.c | 2 +-
+ mm/slab.h | 2 +-
+ 13 files changed, 27 insertions(+), 27 deletions(-)
+
+--- a/arch/x86/events/core.c
++++ b/arch/x86/events/core.c
+@@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(un
+ struct ldt_struct *ldt;
+
+ /* IRQs are off, so this synchronizes with smp_store_release */
+- ldt = lockless_dereference(current->active_mm->context.ldt);
++ ldt = READ_ONCE(current->active_mm->context.ldt);
+ if (!ldt || idx >= ldt->nr_entries)
+ return 0;
+
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm
+ #ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+- /* lockless_dereference synchronizes with smp_store_release */
+- ldt = lockless_dereference(mm->context.ldt);
++ /* READ_ONCE synchronizes with smp_store_release */
++ ldt = READ_ONCE(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct l
+ static void install_ldt(struct mm_struct *current_mm,
+ struct ldt_struct *ldt)
+ {
+- /* Synchronizes with lockless_dereference in load_mm_ldt. */
++ /* Synchronizes with READ_ONCE in load_mm_ldt. */
+ smp_store_release(¤t_mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current_mm. */
+--- a/drivers/md/dm-mpath.c
++++ b/drivers/md/dm-mpath.c
+@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(
+
+ pgpath = path_to_pgpath(path);
+
+- if (unlikely(lockless_dereference(m->current_pg) != pg)) {
++ if (unlikely(READ_ONCE(m->current_pg) != pg)) {
+ /* Only update current_pgpath if pg changed */
+ spin_lock_irqsave(&m->lock, flags);
+ m->current_pgpath = pgpath;
+@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(stru
+ }
+
+ /* Were we instructed to switch PG? */
+- if (lockless_dereference(m->next_pg)) {
++ if (READ_ONCE(m->next_pg)) {
+ spin_lock_irqsave(&m->lock, flags);
+ pg = m->next_pg;
+ if (!pg) {
+@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(stru
+
+ /* Don't change PG until it has no remaining paths */
+ check_current_pg:
+- pg = lockless_dereference(m->current_pg);
++ pg = READ_ONCE(m->current_pg);
+ if (pg) {
+ pgpath = choose_path_in_pg(m, pg, nr_bytes);
+ if (!IS_ERR_OR_NULL(pgpath))
+@@ -473,7 +473,7 @@ static int multipath_clone_and_map(struc
+ struct request *clone;
+
+ /* Do we need to select a new pgpath? */
+- pgpath = lockless_dereference(m->current_pgpath);
++ pgpath = READ_ONCE(m->current_pgpath);
+ if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+ pgpath = choose_pgpath(m, nr_bytes);
+
+@@ -533,7 +533,7 @@ static int __multipath_map_bio(struct mu
+ bool queue_io;
+
+ /* Do we need to select a new pgpath? */
+- pgpath = lockless_dereference(m->current_pgpath);
++ pgpath = READ_ONCE(m->current_pgpath);
+ queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
+ if (!pgpath || !queue_io)
+ pgpath = choose_pgpath(m, nr_bytes);
+@@ -1802,7 +1802,7 @@ static int multipath_prepare_ioctl(struc
+ struct pgpath *current_pgpath;
+ int r;
+
+- current_pgpath = lockless_dereference(m->current_pgpath);
++ current_pgpath = READ_ONCE(m->current_pgpath);
+ if (!current_pgpath)
+ current_pgpath = choose_pgpath(m, 0);
+
+@@ -1824,7 +1824,7 @@ static int multipath_prepare_ioctl(struc
+ }
+
+ if (r == -ENOTCONN) {
+- if (!lockless_dereference(m->current_pg)) {
++ if (!READ_ONCE(m->current_pg)) {
+ /* Path status changed, redo selection */
+ (void) choose_pgpath(m, 0);
+ }
+@@ -1893,9 +1893,9 @@ static int multipath_busy(struct dm_targ
+ return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
+
+ /* Guess which priority_group will be used at next mapping time */
+- pg = lockless_dereference(m->current_pg);
+- next_pg = lockless_dereference(m->next_pg);
+- if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
++ pg = READ_ONCE(m->current_pg);
++ next_pg = READ_ONCE(m->next_pg);
++ if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
+ pg = next_pg;
+
+ if (!pg) {
+--- a/fs/dcache.c
++++ b/fs/dcache.c
+@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struc
+ {
+ /*
+ * Be careful about RCU walk racing with rename:
+- * use 'lockless_dereference' to fetch the name pointer.
++ * use 'READ_ONCE' to fetch the name pointer.
+ *
+ * NOTE! Even if a rename will mean that the length
+ * was not loaded atomically, we don't care. The
+@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struc
+ * early because the data cannot match (there can
+ * be no NUL in the ct/tcount data)
+ */
+- const unsigned char *cs = lockless_dereference(dentry->d_name.name);
++ const unsigned char *cs = READ_ONCE(dentry->d_name.name);
+
+ return dentry_string_cmp(cs, ct, tcount);
+ }
+--- a/fs/overlayfs/ovl_entry.h
++++ b/fs/overlayfs/ovl_entry.h
+@@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(st
+
+ static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
+ {
+- return lockless_dereference(oi->__upperdentry);
++ return READ_ONCE(oi->__upperdentry);
+ }
+--- a/fs/overlayfs/readdir.c
++++ b/fs/overlayfs/readdir.c
+@@ -757,7 +757,7 @@ static int ovl_dir_fsync(struct file *fi
+ if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
+ struct inode *inode = file_inode(file);
+
+- realfile = lockless_dereference(od->upperfile);
++ realfile = READ_ONCE(od->upperfile);
+ if (!realfile) {
+ struct path upperpath;
+
+--- a/include/linux/rculist.h
++++ b/include/linux/rculist.h
+@@ -275,7 +275,7 @@ static inline void list_splice_tail_init
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+ #define list_entry_rcu(ptr, type, member) \
+- container_of(lockless_dereference(ptr), type, member)
++ container_of(READ_ONCE(ptr), type, member)
+
+ /*
+ * Where are list_empty_rcu() and list_first_entry_rcu()?
+@@ -368,7 +368,7 @@ static inline void list_splice_tail_init
+ * example is when items are added to the list, but never deleted.
+ */
+ #define list_entry_lockless(ptr, type, member) \
+- container_of((typeof(ptr))lockless_dereference(ptr), type, member)
++ container_of((typeof(ptr))READ_ONCE(ptr), type, member)
+
+ /**
+ * list_for_each_entry_lockless - iterate over rcu list of given type
+--- a/include/linux/rcupdate.h
++++ b/include/linux/rcupdate.h
+@@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_che
+ #define __rcu_dereference_check(p, c, space) \
+ ({ \
+ /* Dependency order vs. p above. */ \
+- typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
++ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
+ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
+ rcu_dereference_sparse(p, space); \
+ ((typeof(*p) __force __kernel *)(________p1)); \
+@@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_che
+ #define rcu_dereference_raw(p) \
+ ({ \
+ /* Dependency order vs. p above. */ \
+- typeof(p) ________p1 = lockless_dereference(p); \
++ typeof(p) ________p1 = READ_ONCE(p); \
+ ((typeof(*p) __force __kernel *)(________p1)); \
+ })
+
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struc
+ * indeed free this event, otherwise we need to serialize on
+ * owner->perf_event_mutex.
+ */
+- owner = lockless_dereference(event->owner);
++ owner = READ_ONCE(event->owner);
+ if (owner) {
+ /*
+ * Since delayed_put_task_struct() also drops the last
+@@ -4330,7 +4330,7 @@ again:
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+- ctx = lockless_dereference(child->ctx);
++ ctx = READ_ONCE(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const str
+ u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+- lockless_dereference(current->seccomp.filter);
++ READ_ONCE(current->seccomp.filter);
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (unlikely(WARN_ON(f == NULL)))
+--- a/kernel/task_work.c
++++ b/kernel/task_work.c
+@@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *tas
+ * we raced with task_work_run(), *pprev == NULL/exited.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+- while ((work = lockless_dereference(*pprev))) {
++ while ((work = READ_ONCE(*pprev))) {
+ if (work->func != func)
+ pprev = &work->next;
+ else if (cmpxchg(pprev, work, work->next) == work)
+--- a/mm/slab.h
++++ b/mm/slab.h
+@@ -259,7 +259,7 @@ cache_from_memcg_idx(struct kmem_cache *
+ * memcg_caches issues a write barrier to match this (see
+ * memcg_create_kmem_cache()).
+ */
+- cachep = lockless_dereference(arr->entries[idx]);
++ cachep = READ_ONCE(arr->entries[idx]);
+ rcu_read_unlock();
+
+ return cachep;
--- /dev/null
+From 2fe1bc1f501d55e5925b4035bcd85781adc76c63 Mon Sep 17 00:00:00 2001
+From: Andi Kleen <ak@linux.intel.com>
+Date: Thu, 31 Aug 2017 14:46:30 -0700
+Subject: perf/x86: Enable free running PEBS for REGS_USER/INTR
+
+From: Andi Kleen <ak@linux.intel.com>
+
+commit 2fe1bc1f501d55e5925b4035bcd85781adc76c63 upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+Currently free running PEBS is disabled when user or interrupt
+registers are requested. Most of the registers are actually
+available in the PEBS record and can be supported.
+
+So we just need to check for the supported registers and then
+allow it: it is all except for the segment register.
+
+For user registers this only works when the counter is limited
+to ring 3 only, so this also needs to be checked.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/events/intel/core.c | 4 ++++
+ arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++-
+ 2 files changed, 27 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/events/intel/core.c
++++ b/arch/x86/events/intel/core.c
+@@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_runn
+
+ if (event->attr.use_clockid)
+ flags &= ~PERF_SAMPLE_TIME;
++ if (!event->attr.exclude_kernel)
++ flags &= ~PERF_SAMPLE_REGS_USER;
++ if (event->attr.sample_regs_user & ~PEBS_REGS)
++ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
+ return flags;
+ }
+
+--- a/arch/x86/events/perf_event.h
++++ b/arch/x86/events/perf_event.h
+@@ -85,13 +85,15 @@ struct amd_nb {
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
++ * REGS_USER can be handled for events limited to ring 3.
+ *
+ */
+ #define PEBS_FREERUNNING_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+- PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
++ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
++ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
+
+ /*
+ * A debug store configuration.
+@@ -110,6 +112,26 @@ struct debug_store {
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
+ };
+
++/* Registers available in a PEBS record, as a sample_regs_{user,intr} bitmask */
++#define PEBS_REGS \
++ ((1ULL << PERF_REG_X86_AX) | (1ULL << PERF_REG_X86_BX) | \
++ (1ULL << PERF_REG_X86_CX) | \
++ (1ULL << PERF_REG_X86_DX) | \
++ (1ULL << PERF_REG_X86_DI) | \
++ (1ULL << PERF_REG_X86_SI) | \
++ (1ULL << PERF_REG_X86_SP) | \
++ (1ULL << PERF_REG_X86_BP) | \
++ (1ULL << PERF_REG_X86_IP) | \
++ (1ULL << PERF_REG_X86_FLAGS) | \
++ (1ULL << PERF_REG_X86_R8) | \
++ (1ULL << PERF_REG_X86_R9) | \
++ (1ULL << PERF_REG_X86_R10) | \
++ (1ULL << PERF_REG_X86_R11) | \
++ (1ULL << PERF_REG_X86_R12) | \
++ (1ULL << PERF_REG_X86_R13) | \
++ (1ULL << PERF_REG_X86_R14) | \
++ (1ULL << PERF_REG_X86_R15))
++
+ /*
+ * Per register state.
+ */
--- /dev/null
+From d744dcad39094c9187075e274d1cdef79c57c8b5 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sat, 4 Nov 2017 04:19:50 -0700
+Subject: selftests/x86/ldt_gdt: Add infrastructure to test set_thread_area()
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit d744dcad39094c9187075e274d1cdef79c57c8b5 upstream.
+
+Much of the test design could apply to set_thread_area() (i.e. GDT),
+not just modify_ldt(). Add set_thread_area() to the
+install_valid_mode() helper.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/02c23f8fba5547007f741dc24c3926e5284ede02.1509794321.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/testing/selftests/x86/ldt_gdt.c | 53 +++++++++++++++++++++++-----------
+ 1 file changed, 37 insertions(+), 16 deletions(-)
+
+--- a/tools/testing/selftests/x86/ldt_gdt.c
++++ b/tools/testing/selftests/x86/ldt_gdt.c
+@@ -137,30 +137,51 @@ static void check_valid_segment(uint16_t
+ }
+ }
+
+-static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
+- bool oldmode)
++static bool install_valid_mode(const struct user_desc *d, uint32_t ar,
++ bool oldmode, bool ldt)
+ {
+- int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+- desc, sizeof(*desc));
+- if (ret < -1)
+- errno = -ret;
++ struct user_desc desc = *d;
++ int ret;
++
++ if (!ldt) {
++#ifndef __i386__
++ /* No point testing set_thread_area in a 64-bit build */
++ return false;
++#endif
++ if (!gdt_entry_num)
++ return false;
++ desc.entry_number = gdt_entry_num;
++
++ ret = syscall(SYS_set_thread_area, &desc);
++ } else {
++ ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
++ &desc, sizeof(desc));
++
++ if (ret < -1)
++ errno = -ret;
++
++ if (ret != 0 && errno == ENOSYS) {
++ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
++ return false;
++ }
++ }
++
+ if (ret == 0) {
+- uint32_t limit = desc->limit;
+- if (desc->limit_in_pages)
++ uint32_t limit = desc.limit;
++ if (desc.limit_in_pages)
+ limit = (limit << 12) + 4095;
+- check_valid_segment(desc->entry_number, 1, ar, limit, true);
++ check_valid_segment(desc.entry_number, ldt, ar, limit, true);
+ return true;
+- } else if (errno == ENOSYS) {
+- printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+- return false;
+ } else {
+- if (desc->seg_32bit) {
+- printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
++ if (desc.seg_32bit) {
++ printf("[FAIL]\tUnexpected %s failure %d\n",
++ ldt ? "modify_ldt" : "set_thread_area",
+ errno);
+ nerrs++;
+ return false;
+ } else {
+- printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
++ printf("[OK]\t%s rejected 16 bit segment\n",
++ ldt ? "modify_ldt" : "set_thread_area");
+ return false;
+ }
+ }
+@@ -168,7 +189,7 @@ static bool install_valid_mode(const str
+
+ static bool install_valid(const struct user_desc *desc, uint32_t ar)
+ {
+- return install_valid_mode(desc, ar, false);
++ return install_valid_mode(desc, ar, false, true);
+ }
+
+ static void install_invalid(const struct user_desc *desc, bool oldmode)
--- /dev/null
+From adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sat, 4 Nov 2017 04:19:51 -0700
+Subject: selftests/x86/ldt_gdt: Run most existing LDT test cases against the GDT as well
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d upstream.
+
+Now that the main test infrastructure supports the GDT, run tests
+that will pass the kernel's GDT permission tests against the GDT.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/686a1eda63414da38fcecc2412db8dba1ae40581.1509794321.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/x86/ldt_gdt.c
++++ b/tools/testing/selftests/x86/ldt_gdt.c
+@@ -189,7 +189,15 @@ static bool install_valid_mode(const str
+
+ static bool install_valid(const struct user_desc *desc, uint32_t ar)
+ {
+- return install_valid_mode(desc, ar, false, true);
++ bool ret = install_valid_mode(desc, ar, false, true);
++
++ if (desc->contents <= 1 && desc->seg_32bit &&
++ !desc->seg_not_present) {
++ /* Should work in the GDT, too. */
++ install_valid_mode(desc, ar, false, false);
++ }
++
++ return ret;
+ }
+
+ static void install_invalid(const struct user_desc *desc, bool oldmode)
x86-mm-relocate-page-fault-error-codes-to-traps.h.patch
x86-boot-relocate-definition-of-the-initial-state-of-cr0.patch
ptrace-x86-make-user_64bit_mode-available-to-32-bit-builds.patch
+x86-entry-64-remove-the-restore_c_regs_and_iret-label.patch
+x86-entry-64-split-the-iret-to-user-and-iret-to-kernel-paths.patch
+x86-entry-64-move-swapgs-into-the-common-iret-to-usermode-path.patch
+x86-entry-64-simplify-reg-restore-code-in-the-standard-iret-paths.patch
+x86-entry-64-shrink-paranoid_exit_restore-and-make-labels-local.patch
+x86-entry-64-use-pop-instead-of-movq-in-syscall_return_via_sysret.patch
+x86-entry-64-merge-the-fast-and-slow-sysret-paths.patch
+x86-entry-64-use-pop-instead-of-mov-to-restore-regs-on-nmi-return.patch
+x86-entry-64-remove-the-restore_..._regs-infrastructure.patch
+xen-x86-entry-64-add-xen-nmi-trap-entry.patch
+x86-entry-64-de-xen-ify-our-nmi-code.patch
+x86-entry-32-pull-the-msr_ia32_sysenter_cs-update-code-out-of-native_load_sp0.patch
+x86-entry-64-pass-sp0-directly-to-load_sp0.patch
+x86-entry-add-task_top_of_stack-to-find-the-top-of-a-task-s-stack.patch
+x86-xen-64-x86-entry-64-clean-up-sp-code-in-cpu_initialize_context.patch
+x86-entry-64-stop-initializing-tss.sp0-at-boot.patch
+x86-entry-64-remove-all-remaining-direct-thread_struct-sp0-reads.patch
+x86-entry-32-fix-cpu_current_top_of_stack-initialization-at-boot.patch
+x86-entry-64-remove-thread_struct-sp0.patch
+x86-traps-use-a-new-on_thread_stack-helper-to-clean-up-an-assertion.patch
+x86-entry-64-shorten-test-instructions.patch
+x86-cpuid-replace-set-clear_bit32.patch
+bitops-revert-cbe96375025e-bitops-add-clear-set_bit32-to-linux-bitops.h.patch
+x86-mm-define-_page_table-using-_kernpg_table.patch
+x86-cpufeatures-re-tabulate-the-x86_feature-definitions.patch
+x86-cpufeatures-fix-various-details-in-the-feature-definitions.patch
+selftests-x86-ldt_gdt-add-infrastructure-to-test-set_thread_area.patch
+selftests-x86-ldt_gdt-run-most-existing-ldt-test-cases-against-the-gdt-as-well.patch
+acpi-apei-replace-ioremap_page_range-with-fixmap.patch
+x86-virt-x86-platform-merge-struct-x86_hyper-into-struct-x86_platform-and-struct-x86_init.patch
+x86-virt-add-enum-for-hypervisors-to-replace-x86_hyper.patch
+drivers-misc-intel-pti-rename-the-header-file-to-free-up-the-namespace.patch
+x86-cpufeature-add-user-mode-instruction-prevention-definitions.patch
+x86-make-x86_bug_fxsave_leak-detectable-in-cpuid-on-amd.patch
+perf-x86-enable-free-running-pebs-for-regs_user-intr.patch
+bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_event.h.patch
+locking-barriers-add-implicit-smp_read_barrier_depends-to-read_once.patch
+locking-barriers-convert-users-of-lockless_dereference-to-read_once.patch
+x86-mm-kasan-don-t-use-vmemmap_populate-to-initialize-shadow.patch
+x86-entry-64-paravirt-use-paravirt-safe-macro-to-access-eflags.patch
+x86-unwinder-orc-dont-bail-on-stack-overflow.patch
+x86-unwinder-handle-stack-overflows-more-gracefully.patch
+x86-irq-remove-an-old-outdated-comment-about-context-tracking-races.patch
+x86-irq-64-print-the-offending-ip-in-the-stack-overflow-warning.patch
+x86-entry-64-allocate-and-enable-the-sysenter-stack.patch
+x86-dumpstack-add-get_stack_info-support-for-the-sysenter-stack.patch
+x86-entry-gdt-put-per-cpu-gdt-remaps-in-ascending-order.patch
+x86-mm-fixmap-generalize-the-gdt-fixmap-mechanism-introduce-struct-cpu_entry_area.patch
+x86-kasan-64-teach-kasan-about-the-cpu_entry_area.patch
+x86-entry-fix-assumptions-that-the-hw-tss-is-at-the-beginning-of-cpu_tss.patch
+x86-dumpstack-handle-stack-overflow-on-all-stacks.patch
+x86-entry-move-sysenter_stack-to-the-beginning-of-struct-tss_struct.patch
+x86-entry-remap-the-tss-into-the-cpu-entry-area.patch
+x86-entry-64-separate-cpu_current_top_of_stack-from-tss.sp0.patch
+x86-espfix-64-stop-assuming-that-pt_regs-is-on-the-entry-stack.patch
+x86-entry-64-use-a-per-cpu-trampoline-stack-for-idt-entries.patch
+x86-entry-64-return-to-userspace-from-the-trampoline-stack.patch
+x86-entry-64-create-a-per-cpu-syscall-entry-trampoline.patch
+x86-entry-64-move-the-ist-stacks-into-struct-cpu_entry_area.patch
+x86-entry-64-remove-the-sysenter-stack-canary.patch
+x86-entry-clean-up-the-sysenter_stack-code.patch
+x86-entry-64-make-cpu_entry_area.tss-read-only.patch
+x86-paravirt-dont-patch-flush_tlb_single.patch
+x86-paravirt-provide-a-way-to-check-for-hypervisors.patch
+x86-cpufeatures-make-cpu-bugs-sticky.patch
optee-fix-invalid-of_node_put-in-optee_driver_init.patch
backlight-pwm_bl-fix-overflow-condition.patch
drm-add-retries-for-lspcon-mode-detection.patch
--- /dev/null
+From a8b4db562e7283a1520f9e9730297ecaab7622ea Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Sun, 5 Nov 2017 18:27:51 -0800
+Subject: x86/cpufeature: Add User-Mode Instruction Prevention definitions
+
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+
+commit a8b4db562e7283a1520f9e9730297ecaab7622ea upstream.
+
+[ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file)
+
+ 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+User-Mode Instruction Prevention is a security feature present in new
+Intel processors that, when set, prevents the execution of a subset of
+instructions if such instructions are executed in user mode (CPL > 0).
+Attempting to execute such instructions causes a general protection
+exception.
+
+The subset of instructions comprises:
+
+ * SGDT - Store Global Descriptor Table
+ * SIDT - Store Interrupt Descriptor Table
+ * SLDT - Store Local Descriptor Table
+ * SMSW - Store Machine Status Word
+ * STR - Store Task Register
+
+This feature is also added to the list of disabled-features to allow
+a cleaner handling of build-time configuration.
+
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Chen Yucong <slaoub@gmail.com>
+Cc: Chris Metcalf <cmetcalf@mellanox.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Huang Rui <ray.huang@amd.com>
+Cc: Jiri Slaby <jslaby@suse.cz>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Michael S. Tsirkin <mst@redhat.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Ravi V. Shankar <ravi.v.shankar@intel.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: ricardo.neri@intel.com
+Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeatures.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -296,6 +296,7 @@
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP (16*32+ 2) /* User-Mode Instruction Prevention */
+ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
--- /dev/null
+From f3a624e901c633593156f7b00ca743a6204a29bc Mon Sep 17 00:00:00 2001
+From: Ingo Molnar <mingo@kernel.org>
+Date: Tue, 31 Oct 2017 13:17:23 +0100
+Subject: x86/cpufeatures: Fix various details in the feature definitions
+
+From: Ingo Molnar <mingo@kernel.org>
+
+commit f3a624e901c633593156f7b00ca743a6204a29bc upstream.
+
+Kept this commit separate from the re-tabulation changes, to make
+the changes easier to review:
+
+ - add better explanation for entries with no explanation
+ - fix/enhance the text of some of the entries
+ - fix the vertical alignment of some of the feature number definitions
+ - fix inconsistent capitalization
+ - ... and lots of other small details
+
+i.e. make it all more of a coherent unit, instead of a patchwork of years of additions.
+
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20171031121723.28524-4-mingo@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeatures.h | 149 ++++++++++++++++++-------------------
+ 1 file changed, 74 insertions(+), 75 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -20,14 +20,12 @@
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+- */
+-
+-/*
++ *
+ * When adding new features here that depend on other features,
+- * please update the table in kernel/cpu/cpuid-deps.c
++ * please update the table in kernel/cpu/cpuid-deps.c as well.
+ */
+
+-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
++/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
+ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+ #define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+@@ -42,8 +40,7 @@
+ #define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+ #define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+ #define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
+- /* (plus FCMOVcc, FCOMI with FPU) */
++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+ #define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+ #define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+ #define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+@@ -63,15 +60,15 @@
+ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+ /* Don't duplicate feature flags which are redundant with Intel! */
+ #define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
+ #define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+ #define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+ #define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+ #define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
+-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
+-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
+
+ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+ #define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+@@ -84,66 +81,67 @@
+ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+-/* cpu types for specific tunings: */
++
++/* CPU types for specific tunings: */
+ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+ #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
+-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
++#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
++#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
+ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
+-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
+-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */
++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
+ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
+ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+ #define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
+-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
+-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
+-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+
+-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
++/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
+ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+ #define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+ #define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
+ #define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
+ #define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+ #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+ #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+ #define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+ #define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
+ #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
+ #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
++#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
+ #define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+ #define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
++#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
+ #define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+ #define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
++#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
+ #define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
+ #define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
+-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
++#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
++#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
+ #define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+
+ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+@@ -158,10 +156,10 @@
+ #define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+ #define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+
+-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
++/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
+ #define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+ #define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
+ #define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+ #define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+ #define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+@@ -175,16 +173,16 @@
+ #define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+ #define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+ #define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
++#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
+ #define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
+-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
+-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
++#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
+ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
+-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
++#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
++#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
+ #define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
+-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
+
+ /*
+ * Auxiliary flags: Linux defined - For features scattered in various
+@@ -192,7 +190,7 @@
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
+ #define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
+ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+@@ -206,8 +204,8 @@
+
+ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
+ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
++#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
++#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
+ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+
+@@ -218,19 +216,19 @@
+ #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+ #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
+ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
+
+
+-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
+ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+@@ -238,8 +236,8 @@
+ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
+ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+ #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
+-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
++#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
++#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
+ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+ #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
+ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+@@ -251,25 +249,25 @@
+ #define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+ #define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+
+-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
+-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
+-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
+-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
++/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
++#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
+
+-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
+ #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+
+-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */
+ #define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+ #define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
+
+-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
+-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
+-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
++/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
++#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
++#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
+
+-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
++/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+ #define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+ #define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+@@ -281,7 +279,7 @@
+ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
++/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+ #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+ #define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+@@ -296,24 +294,24 @@
+ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+
+-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
++/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
++#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
++#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
+ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+
+-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
+-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
+-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
+-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
++/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
++#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
++#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
++#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
+
+ /*
+ * BUG word(s)
+@@ -340,4 +338,5 @@
+ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
+ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
++
+ #endif /* _ASM_X86_CPUFEATURES_H */
--- /dev/null
+From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:32 +0100
+Subject: x86/cpufeatures: Make CPU bugs sticky
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 6cbd2171e89b13377261d15e64384df60ecb530e upstream.
+
+There is currently no way to force CPU bug bits the way CPU feature bits
+can be forced. That makes it impossible to set a bug bit once at boot and
+have it stick for all upcoming CPUs.
+
+Extend the force set/clear arrays to handle bug bits as well.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeature.h | 2 ++
+ arch/x86/include/asm/processor.h | 4 ++--
+ arch/x86/kernel/cpu/common.c | 6 +++---
+ 3 files changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo
+ set_bit(bit, (unsigned long *)cpu_caps_set); \
+ } while (0)
+
++#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
++
+ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+ /*
+ * Static testing of CPU features. Used the same as boot_cpu_has().
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -163,8 +163,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
+ extern struct cpuinfo_x86 new_cpu_data;
+
+ extern struct x86_hw_tss doublefault_tss;
+-extern __u32 cpu_caps_cleared[NCAPINTS];
+-extern __u32 cpu_caps_set[NCAPINTS];
++extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
++extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+
+ #ifdef CONFIG_SMP
+ DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -452,8 +452,8 @@ static const char *table_lookup_model(st
+ return NULL; /* Not found */
+ }
+
+-__u32 cpu_caps_cleared[NCAPINTS];
+-__u32 cpu_caps_set[NCAPINTS];
++__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
++__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+
+ void load_percpu_segment(int cpu)
+ {
+@@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpu
+ {
+ int i;
+
+- for (i = 0; i < NCAPINTS; i++) {
++ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
--- /dev/null
+From acbc845ffefd9fb70466182cd8555a26189462b2 Mon Sep 17 00:00:00 2001
+From: Ingo Molnar <mingo@kernel.org>
+Date: Tue, 31 Oct 2017 13:17:22 +0100
+Subject: x86/cpufeatures: Re-tabulate the X86_FEATURE definitions
+
+From: Ingo Molnar <mingo@kernel.org>
+
+commit acbc845ffefd9fb70466182cd8555a26189462b2 upstream.
+
+Over the years asm/cpufeatures.h has become somewhat of a mess: the original
+tabulation style was too narrow, while x86 feature names also kept growing
+in length, creating frequent field width overflows.
+
+Re-tabulate it to make it wider and easier to read/modify. Also harmonize
+the tabulation of the other defines in this file to match it.
+
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20171031121723.28524-3-mingo@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeatures.h | 512 ++++++++++++++++++-------------------
+ 1 file changed, 256 insertions(+), 256 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -13,8 +13,8 @@
+ /*
+ * Defines x86 CPU feature bits
+ */
+-#define NCAPINTS 18 /* N 32-bit words worth of info */
+-#define NBUGINTS 1 /* N 32-bit bug flags */
++#define NCAPINTS 18 /* N 32-bit words worth of info */
++#define NBUGINTS 1 /* N 32-bit bug flags */
+
+ /*
+ * Note: If the comment begins with a quoted string, that string is used
+@@ -28,163 +28,163 @@
+ */
+
+ /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
+-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
+-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
+-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
+-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
+-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
+-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
+-#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
+-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
++#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
++#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
++#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
++#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
++#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
++#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
++#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
++#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
++#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
++#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
++#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
++#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
++#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
++#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
+ /* (plus FCMOVcc, FCOMI with FPU) */
+-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+-#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
+-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
+-#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+-#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+-#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+-#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+-#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
+-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
++#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
++#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
++#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
++#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
++#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
++#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
++#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
++#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
++#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
++#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
++#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
++#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
++#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
++#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
++#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
+
+ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+ /* Don't duplicate feature flags which are redundant with Intel! */
+-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
+-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+-#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
+-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
+-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
++#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
++#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
++#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
++#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
++#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
++#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
+
+ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
+-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
++#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
++#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
++#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
+
+ /* Other features, Linux-defined mapping, word 3 */
+ /* This range is used for feature bits which conflict or are synthesized */
+-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
+-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
++#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
++#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
++#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
++#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+ /* cpu types for specific tunings: */
+-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+-#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
+-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
+-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
+-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
+-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
+-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
+-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
+-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
+-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
+-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
+-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
+-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
++#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
++#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
++#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
++#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
++#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
++#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
++#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
++#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
++#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
++#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
++#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
++#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
++#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
++#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
++#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
++#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
++#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
++#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+
+ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+-#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+-#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
+-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
+-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
+-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
+-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+-#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+-#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
+-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
++#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
++#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
++#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
++#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
++#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
++#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
++#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
++#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
++#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
++#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
++#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
++#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
++#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
++#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
++#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
++#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
++#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
++#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
++#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+ #define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
+-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
+-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
+-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
+-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
++#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
++#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
++#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
++#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
++#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+
+ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+-#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+-#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+-#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+-#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
+-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
+-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
+-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
++#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
++#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
++#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
++#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
++#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
++#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
++#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
++#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
++#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
++#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+
+ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
+-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
+-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
+-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
+-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
+-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
+-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
+-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
+-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
+-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
+-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
+-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
+-#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
+-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
++#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
++#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
++#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
++#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
++#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
++#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
++#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
++#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
++#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
++#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
++#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
++#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
++#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
++#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
++#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
++#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
++#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
++#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
++#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
++#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
++#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
++#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
+
+ /*
+ * Auxiliary flags: Linux defined - For features scattered in various
+@@ -192,152 +192,152 @@
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
+-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
+-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+-
+-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+-
+-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
+-#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
++#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
++#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
++#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
++#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
++#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
++#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
++
++#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
++#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
++#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
++
++#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
++#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
++#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
++#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
+-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
++#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+
+ /* Virtualization flags: Linux defined, word 8 */
+-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
++#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
++#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
++#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
++#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
++#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
+-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
++#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
+
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
+-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
+-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
+-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
+-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
+-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
+-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
+-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
++#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
++#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
++#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
++#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
++#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
++#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
++#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
++#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
++#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
++#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
++#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
++#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
++#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
++#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
++#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
++#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
++#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
++#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
++#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
++#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
++#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
++#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
++#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
++#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+
+ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
+-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
+-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
+-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
++#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
+
+ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+-#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
++#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+
+ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
++#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
++#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
+
+ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
+-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
+-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
++#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
++#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
+
+ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
+-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
+-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
+-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
+-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
++#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
++#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
++#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
++#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
++#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
++#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
++#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
++#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
++#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
++#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+ /* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
+-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+-#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+-#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+-#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+-#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+-#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
++#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
++#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
++#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
++#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
++#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
++#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
++#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
++#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
++#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
++#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
++#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
++#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
++#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+-#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+-#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+-#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
+-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
+-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
++#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
++#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
++#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
++#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
++#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
++#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
++#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */
++#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
++#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
++#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
++#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+
+ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
+-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
+-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
+-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
++#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
++#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
++#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
+
+ /*
+ * BUG word(s)
+ */
+-#define X86_BUG(x) (NCAPINTS*32 + (x))
++#define X86_BUG(x) (NCAPINTS*32 + (x))
+
+-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
+-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
+-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
++#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
++#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
++#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
++#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
++#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
++#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
++#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
++#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
++#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+ #ifdef CONFIG_X86_32
+ /*
+ * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
+ * to avoid confusion.
+ */
+-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
++#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+ #endif
+-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
+-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
+-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
++#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
++#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
++#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
++#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+ #endif /* _ASM_X86_CPUFEATURES_H */
--- /dev/null
+From 06dd688ddda5819025e014b79aea9af6ab475fa2 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 2 Nov 2017 13:22:35 +0100
+Subject: x86/cpuid: Replace set/clear_bit32()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 06dd688ddda5819025e014b79aea9af6ab475fa2 upstream.
+
+Peter pointed out that the set/clear_bit32() variants are broken in various
+aspects.
+
+Replace them with open coded set/clear_bit() and type cast
+cpu_info::x86_capability as it's done in all other places throughout x86.
+
+Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies")
+Reported-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/cpuid-deps.c | 26 +++++++++++---------------
+ 1 file changed, 11 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/kernel/cpu/cpuid-deps.c
++++ b/arch/x86/kernel/cpu/cpuid-deps.c
+@@ -62,23 +62,19 @@ const static struct cpuid_dep cpuid_deps
+ {}
+ };
+
+-static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit)
+-{
+- clear_bit32(bit, c->x86_capability);
+-}
+-
+-static inline void __setup_clear_cpu_cap(unsigned int bit)
+-{
+- clear_cpu_cap(&boot_cpu_data, bit);
+- set_bit32(bit, cpu_caps_cleared);
+-}
+-
+ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+ {
+- if (!c)
+- __setup_clear_cpu_cap(feature);
+- else
+- __clear_cpu_cap(c, feature);
++ /*
++ * Note: This could use the non atomic __*_bit() variants, but the
++ * rest of the cpufeature code uses atomics as well, so keep it for
++ * consistency. Cleanup all of it separately.
++ */
++ if (!c) {
++ clear_cpu_cap(&boot_cpu_data, feature);
++ set_bit(feature, (unsigned long *)cpu_caps_cleared);
++ } else {
++ clear_bit(feature, (unsigned long *)c->x86_capability);
++ }
+ }
+
+ /* Take the capabilities and the BUG bits into account */
--- /dev/null
+From 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:13 +0100
+Subject: x86/dumpstack: Add get_stack_info() support for the SYSENTER stack
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb upstream.
+
+get_stack_info() doesn't currently know about the SYSENTER stack, so
+unwinding will fail if we entered the kernel on the SYSENTER stack
+and haven't fully switched off. Teach get_stack_info() about the
+SYSENTER stack.
+
+With future patches applied that run part of the entry code on the
+SYSENTER stack and introduce an intentional BUG(), I would get:
+
+ PANIC: double fault, error_code: 0x0
+ ...
+ RIP: 0010:do_error_trap+0x33/0x1c0
+ ...
+ Call Trace:
+ Code: ...
+
+With this patch, I get:
+
+ PANIC: double fault, error_code: 0x0
+ ...
+ Call Trace:
+ <SYSENTER>
+ ? async_page_fault+0x36/0x60
+ ? invalid_op+0x22/0x40
+ ? async_page_fault+0x36/0x60
+ ? sync_regs+0x3c/0x40
+ ? sync_regs+0x2e/0x40
+ ? error_entry+0x6c/0xd0
+ ? async_page_fault+0x36/0x60
+ </SYSENTER>
+ Code: ...
+
+which is a lot more informative.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
+index 8da111b3c342..f8062bfd43a0 100644
+--- a/arch/x86/include/asm/stacktrace.h
++++ b/arch/x86/include/asm/stacktrace.h
+@@ -16,6 +16,7 @@ enum stack_type {
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_SOFTIRQ,
++ STACK_TYPE_SYSENTER,
+ STACK_TYPE_EXCEPTION,
+ STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+ };
+@@ -28,6 +29,8 @@ struct stack_info {
+ bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
++
+ int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask);
+
+diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
+index 0bc95be5c638..a33a1373a252 100644
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ return true;
+ }
+
++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
++{
++ struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
++
++ /* Treat the canary as part of the stack for unwinding purposes. */
++ void *begin = &tss->SYSENTER_stack_canary;
++ void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
++
++ if ((void *)stack < begin || (void *)stack >= end)
++ return false;
++
++ info->type = STACK_TYPE_SYSENTER;
++ info->begin = begin;
++ info->end = end;
++ info->next_sp = NULL;
++
++ return true;
++}
++
+ static void printk_stack_address(unsigned long address, int reliable,
+ char *log_lvl)
+ {
+diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
+index daefae83a3aa..5ff13a6b3680 100644
+--- a/arch/x86/kernel/dumpstack_32.c
++++ b/arch/x86/kernel/dumpstack_32.c
+@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
++ if (type == STACK_TYPE_SYSENTER)
++ return "SYSENTER";
++
+ return NULL;
+ }
+
+@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
+ if (task != current)
+ goto unknown;
+
++ if (in_sysenter_stack(stack, info))
++ goto recursion_check;
++
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
+
+diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
+index 88ce2ffdb110..abc828f8c297 100644
+--- a/arch/x86/kernel/dumpstack_64.c
++++ b/arch/x86/kernel/dumpstack_64.c
+@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
++ if (type == STACK_TYPE_SYSENTER)
++ return "SYSENTER";
++
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
+ if (in_irq_stack(stack, info))
+ goto recursion_check;
+
++ if (in_sysenter_stack(stack, info))
++ goto recursion_check;
++
+ goto unknown;
+
+ recursion_check:
--- /dev/null
+From 6e60e583426c2f8751c22c2dfe5c207083b4483a Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:18 +0100
+Subject: x86/dumpstack: Handle stack overflow on all stacks
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 6e60e583426c2f8751c22c2dfe5c207083b4483a upstream.
+
+We currently special-case stack overflow on the task stack. We're
+going to start putting special stacks in the fixmap with a custom
+layout, so they'll have guard pages, too. Teach the unwinder to be
+able to unwind an overflow of any of the stacks.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/dumpstack.c | 24 ++++++++++++++----------
+ 1 file changed, 14 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_stru
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
++ * - SYSENTER stack
+ *
+- * x86-32 can have up to three stacks:
++ * x86-32 can have up to four stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
++ * - SYSENTER stack
+ */
+ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ const char *stack_name;
+
+- /*
+- * If we overflowed the task stack into a guard page, jump back
+- * to the bottom of the usable stack.
+- */
+- if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
+- stack = task_stack_page(task);
+-
+- if (get_stack_info(stack, task, &stack_info, &visit_mask))
+- break;
++ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
++ /*
++ * We weren't on a valid stack. It's possible that
++ * we overflowed a valid stack into a guard page.
++ * See if the next page up is valid so that we can
++ * generate some kind of backtrace if this happens.
++ */
++ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
++ if (get_stack_info(stack, task, &stack_info, &visit_mask))
++ break;
++ }
+
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
--- /dev/null
+From cd493a6deb8b78eca280d05f7fa73fd69403ae29 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:15 -0700
+Subject: x86/entry/32: Fix cpu_current_top_of_stack initialization at boot
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit cd493a6deb8b78eca280d05f7fa73fd69403ae29 upstream.
+
+cpu_current_top_of_stack's initialization forgot about
+TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the
+idle threads never enter user mode.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/smpboot.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -962,8 +962,7 @@ void common_cpu_up(unsigned int cpu, str
+ #ifdef CONFIG_X86_32
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ irq_ctx_init(cpu);
+- per_cpu(cpu_current_top_of_stack, cpu) =
+- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
++ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
+ #else
+ initial_gs = per_cpu_offset(cpu);
+ #endif
--- /dev/null
+From bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:09 -0700
+Subject: x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out of native_load_sp0()
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 upstream.
+
+This causes the MSR_IA32_SYSENTER_CS write to move out of the
+paravirt callback. This shouldn't affect Xen PV: Xen already ignores
+MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support
+vm86() in a useful way.
+
+Note to any potential backporters: This patch won't break lguest, as
+lguest didn't have any SYSENTER support at all.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 7 -------
+ arch/x86/include/asm/switch_to.h | 12 ++++++++++++
+ arch/x86/kernel/process_32.c | 4 +++-
+ arch/x86/kernel/process_64.c | 2 +-
+ arch/x86/kernel/vm86_32.c | 6 +++++-
+ 5 files changed, 21 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -521,13 +521,6 @@ static inline void
+ native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+ {
+ tss->x86_tss.sp0 = thread->sp0;
+-#ifdef CONFIG_X86_32
+- /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+- tss->x86_tss.ss1 = thread->sysenter_cs;
+- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+- }
+-#endif
+ }
+
+ static inline void native_swapgs(void)
+--- a/arch/x86/include/asm/switch_to.h
++++ b/arch/x86/include/asm/switch_to.h
+@@ -73,4 +73,16 @@ do { \
+ ((last) = __switch_to_asm((prev), (next))); \
+ } while (0)
+
++#ifdef CONFIG_X86_32
++static inline void refresh_sysenter_cs(struct thread_struct *thread)
++{
++ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
++ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
++ return;
++
++ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
++ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
++}
++#endif
++
+ #endif /* _ASM_X86_SWITCH_TO_H */
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p,
+
+ /*
+ * Reload esp0 and cpu_current_top_of_stack. This changes
+- * current_thread_info().
++ * current_thread_info(). Refresh the SYSENTER configuration in
++ * case prev or next is vm86.
+ */
+ load_sp0(tss, next);
++ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -464,7 +464,7 @@ __switch_to(struct task_struct *prev_p,
+ */
+ this_cpu_write(current_task, next_p);
+
+- /* Reload esp0 and ss1. This changes current_thread_info(). */
++ /* Reload sp0. */
+ load_sp0(tss, next);
+
+ /*
+--- a/arch/x86/kernel/vm86_32.c
++++ b/arch/x86/kernel/vm86_32.c
+@@ -55,6 +55,7 @@
+ #include <asm/irq.h>
+ #include <asm/traps.h>
+ #include <asm/vm86.h>
++#include <asm/switch_to.h>
+
+ /*
+ * Known problems:
+@@ -150,6 +151,7 @@ void save_v86_state(struct kernel_vm86_r
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+ load_sp0(tss, &tsk->thread);
++ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+ put_cpu();
+
+@@ -369,8 +371,10 @@ static long do_sys_vm86(struct vm86plus_
+ /* make room for real-mode segments */
+ tsk->thread.sp0 += 16;
+
+- if (static_cpu_has(X86_FEATURE_SEP))
++ if (static_cpu_has(X86_FEATURE_SEP)) {
+ tsk->thread.sysenter_cs = 0;
++ refresh_sysenter_cs(&tsk->thread);
++ }
+
+ load_sp0(tss, &tsk->thread);
+ put_cpu();
--- /dev/null
+From 1a79797b58cddfa948420a7553241c79c013e3ca Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:12 +0100
+Subject: x86/entry/64: Allocate and enable the SYSENTER stack
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 1a79797b58cddfa948420a7553241c79c013e3ca upstream.
+
+This will simplify future changes that want scratch variables early in
+the SYSENTER handler -- they'll be able to spill registers to the
+stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user.
+
+This does not depend on CONFIG_IA32_EMULATION=y because we'll want the
+stack space even without IA32 emulation.
+
+As far as I can tell, the reason that this wasn't done from day 1 is
+that we use IST for #DB and #BP, which is IMO rather nasty and causes
+a lot more problems than it solves. But, since #DB uses IST, we don't
+actually need a real stack for SYSENTER (because SYSENTER with TF set
+will invoke #DB on the IST stack rather than the SYSENTER stack).
+
+I want to remove IST usage from these vectors some day, and this patch
+is a prerequisite for that as well.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64_compat.S | 2 +-
+ arch/x86/include/asm/processor.h | 3 ---
+ arch/x86/kernel/asm-offsets.c | 5 +++++
+ arch/x86/kernel/asm-offsets_32.c | 5 -----
+ arch/x86/kernel/cpu/common.c | 4 +++-
+ arch/x86/kernel/process.c | 2 --
+ arch/x86/kernel/traps.c | 3 +--
+ 7 files changed, 10 insertions(+), 14 deletions(-)
+
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -48,7 +48,7 @@
+ */
+ ENTRY(entry_SYSENTER_compat)
+ /* Interrupts are off on entry. */
+- SWAPGS_UNSAFE_STACK
++ SWAPGS
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /*
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -339,14 +339,11 @@ struct tss_struct {
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+
+-#ifdef CONFIG_X86_32
+ /*
+ * Space for the temporary SYSENTER stack.
+ */
+ unsigned long SYSENTER_stack_canary;
+ unsigned long SYSENTER_stack[64];
+-#endif
+-
+ } ____cacheline_aligned;
+
+ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -93,4 +93,9 @@ void common(void) {
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
++
++ /* Offset from cpu_tss to SYSENTER_stack */
++ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
++ /* Size of SYSENTER_stack */
++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+ }
+--- a/arch/x86/kernel/asm-offsets_32.c
++++ b/arch/x86/kernel/asm-offsets_32.c
+@@ -50,11 +50,6 @@ void foo(void)
+ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+ offsetofend(struct tss_struct, SYSENTER_stack));
+
+- /* Offset from cpu_tss to SYSENTER_stack */
+- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+- /* Size of SYSENTER_stack */
+- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+-
+ #ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1361,7 +1361,9 @@ void syscall_init(void)
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
++ (unsigned long)this_cpu_ptr(&cpu_tss) +
++ offsetofend(struct tss_struct, SYSENTER_stack));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ #else
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -71,9 +71,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+ #endif
+-#ifdef CONFIG_X86_32
+ .SYSENTER_stack_canary = STACK_END_MAGIC,
+-#endif
+ };
+ EXPORT_PER_CPU_SYMBOL(cpu_tss);
+
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -794,14 +794,13 @@ dotraplinkage void do_debug(struct pt_re
+ debug_stack_usage_dec();
+
+ exit:
+-#if defined(CONFIG_X86_32)
+ /*
+ * This is the most likely code path that involves non-trivial use
+ * of the SYSENTER stack. Check that we haven't overrun it.
+ */
+ WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
+ "Overran or corrupted SYSENTER stack\n");
+-#endif
++
+ ist_exit(regs);
+ }
+ NOKPROBE_SYMBOL(do_debug);
--- /dev/null
+From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:25 +0100
+Subject: x86/entry/64: Create a per-CPU SYSCALL entry trampoline
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a upstream.
+
+Handling SYSCALL is tricky: the SYSCALL handler is entered with every
+single register (except FLAGS), including RSP, live. It somehow needs
+to set RSP to point to a valid stack, which means it needs to save the
+user RSP somewhere and find its own stack pointer. The canonical way
+to do this is with SWAPGS, which lets us access percpu data using the
+%gs prefix.
+
+With PAGE_TABLE_ISOLATION-like pagetable switching, this is
+problematic. Without a scratch register, switching CR3 is impossible, so
+%gs-based percpu memory would need to be mapped in the user pagetables.
+Doing that without information leaks is difficult or impossible.
+
+Instead, use a different sneaky trick. Map a copy of the first part
+of the SYSCALL asm at a different address for each CPU. Now RIP
+varies depending on the CPU, so we can use RIP-relative memory access
+to access percpu memory. By putting the relevant information (one
+scratch slot and the stack address) at a constant offset relative to
+RIP, we can make SYSCALL work without relying on %gs.
+
+A nice thing about this approach is that we can easily switch it on
+and off if we want pagetable switching to be configurable.
+
+The compat variant of SYSCALL doesn't have this problem in the first
+place -- there are plenty of scratch registers, since we don't care
+about preserving r8-r15. This patch therefore doesn't touch SYSCALL32
+at all.
+
+This patch actually seems to be a small speedup. With this patch,
+SYSCALL touches an extra cache line and an extra virtual page, but
+the pipeline no longer stalls waiting for SWAPGS. It seems that, at
+least in a tight loop, the latter outweighs the former.
+
+Thanks to David Laight for an optimization tip.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 58 ++++++++++++++++++++++++++++++++++++++++++
+ arch/x86/include/asm/fixmap.h | 2 +
+ arch/x86/kernel/asm-offsets.c | 1
+ arch/x86/kernel/cpu/common.c | 15 ++++++++++
+ arch/x86/kernel/vmlinux.lds.S | 9 ++++++
+ 5 files changed, 84 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -136,6 +136,64 @@ END(native_usergs_sysret64)
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
++ .pushsection .entry_trampoline, "ax"
++
++/*
++ * The code in here gets remapped into cpu_entry_area's trampoline. This means
++ * that the assembler and linker have the wrong idea as to where this code
++ * lives (and, in fact, it's mapped more than once, so it's not even at a
++ * fixed address). So we can't reference any symbols outside the entry
++ * trampoline and expect it to work.
++ *
++ * Instead, we carefully abuse %rip-relative addressing.
++ * _entry_trampoline(%rip) refers to the start of the remapped) entry
++ * trampoline. We can thus find cpu_entry_area with this macro:
++ */
++
++#define CPU_ENTRY_AREA \
++ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
++
++/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
++ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
++
++ENTRY(entry_SYSCALL_64_trampoline)
++ UNWIND_HINT_EMPTY
++ swapgs
++
++ /* Stash the user RSP. */
++ movq %rsp, RSP_SCRATCH
++
++ /* Load the top of the task stack into RSP */
++ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
++
++ /* Start building the simulated IRET frame. */
++ pushq $__USER_DS /* pt_regs->ss */
++ pushq RSP_SCRATCH /* pt_regs->sp */
++ pushq %r11 /* pt_regs->flags */
++ pushq $__USER_CS /* pt_regs->cs */
++ pushq %rcx /* pt_regs->ip */
++
++ /*
++ * x86 lacks a near absolute jump, and we can't jump to the real
++ * entry text with a relative jump. We could push the target
++ * address and then use retq, but this destroys the pipeline on
++ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
++ * spill RDI and restore it in a second-stage trampoline.
++ */
++ pushq %rdi
++ movq $entry_SYSCALL_64_stage2, %rdi
++ jmp *%rdi
++END(entry_SYSCALL_64_trampoline)
++
++ .popsection
++
++ENTRY(entry_SYSCALL_64_stage2)
++ UNWIND_HINT_EMPTY
++ popq %rdi
++ jmp entry_SYSCALL_64_after_hwframe
++END(entry_SYSCALL_64_stage2)
++
+ ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
+ /*
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -61,6 +61,8 @@ struct cpu_entry_area {
+ * of the TSS region.
+ */
+ struct tss_struct tss;
++
++ char entry_trampoline[PAGE_SIZE];
+ };
+
+ #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -101,4 +101,5 @@ void common(void) {
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
++ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ }
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *,
+ static inline void setup_cpu_entry_area(int cpu)
+ {
+ #ifdef CONFIG_X86_64
++ extern char _entry_trampoline[];
++
+ /* On 64-bit systems, we use a read-only fixmap GDT. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ #else
+@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(
+ #ifdef CONFIG_X86_32
+ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
+ #endif
++
++#ifdef CONFIG_X86_64
++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
++#endif
+ }
+
+ /* Load the original GDT from the per-cpu structure */
+@@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
+ /* May not be marked __init: used by software suspend */
+ void syscall_init(void)
+ {
++ extern char _entry_trampoline[];
++ extern char entry_SYSCALL_64_trampoline[];
++
+ int cpu = smp_processor_id();
++ unsigned long SYSCALL64_entry_trampoline =
++ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
++ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+
+ #ifdef CONFIG_IA32_EMULATION
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+--- a/arch/x86/kernel/vmlinux.lds.S
++++ b/arch/x86/kernel/vmlinux.lds.S
+@@ -107,6 +107,15 @@ SECTIONS
+ SOFTIRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
++
++#ifdef CONFIG_X86_64
++ . = ALIGN(PAGE_SIZE);
++ _entry_trampoline = .;
++ *(.entry_trampoline)
++ . = ALIGN(PAGE_SIZE);
++ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
++#endif
++
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
--- /dev/null
+From 929bacec21478a72c78e4f29f98fb799bd00105a Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:08 -0700
+Subject: x86/entry/64: De-Xen-ify our NMI code
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 929bacec21478a72c78e4f29f98fb799bd00105a upstream.
+
+Xen PV is fundamentally incompatible with our fancy NMI code: it
+doesn't use IST at all, and Xen entries clobber two stack slots
+below the hardware frame.
+
+Drop Xen PV support from our NMI code entirely.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Acked-by: Juergen Gross <jgross@suse.com>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------
+ 1 file changed, 18 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1241,9 +1241,13 @@ ENTRY(error_exit)
+ jmp retint_user
+ END(error_exit)
+
+-/* Runs on exception stack */
++/*
++ * Runs on exception stack. Xen PV does not go through this path at all,
++ * so we can use real assembly here.
++ */
+ ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
++
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+@@ -1301,7 +1305,7 @@ ENTRY(nmi)
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+
+- SWAPGS_UNSAFE_STACK
++ swapgs
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+@@ -1466,7 +1470,7 @@ nested_nmi_out:
+ popq %rdx
+
+ /* We are returning to kernel mode, so this cannot result in a fault. */
+- INTERRUPT_RETURN
++ iretq
+
+ first_nmi:
+ /* Restore rdx. */
+@@ -1497,7 +1501,7 @@ first_nmi:
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+- INTERRUPT_RETURN /* continues at repeat_nmi below */
++ iretq /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
+ 1:
+ #endif
+@@ -1572,20 +1576,22 @@ nmi_restore:
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+- * the SYSCALL entry and exit paths. On a native kernel, we
+- * could just inspect RIP, but, on paravirt kernels,
+- * INTERRUPT_RETURN can translate into a jump into a
+- * hypercall page.
++ * the SYSCALL entry and exit paths.
++ *
++ * We arguably should just inspect RIP instead, but I (Andy) wrote
++ * this code when I had the misapprehension that Xen PV supported
++ * NMIs, and Xen PV would break that approach.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+- * stack in a single instruction. We are returning to kernel
+- * mode, so this cannot result in a fault.
++ * iretq reads the "iret" frame and exits the NMI stack in a
++ * single instruction. We are returning to kernel mode, so this
++ * cannot result in a fault. Similarly, we don't need to worry
++ * about espfix64 on the way back to kernel mode.
+ */
+- INTERRUPT_RETURN
++ iretq
+ END(nmi)
+
+ ENTRY(ignore_sysret)
--- /dev/null
+From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:29 +0100
+Subject: x86/entry/64: Make cpu_entry_area.tss read-only
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit c482feefe1aeb150156248ba0fd3e029bc886605 upstream.
+
+The TSS is a fairly juicy target for exploits, and, now that the TSS
+is in the cpu_entry_area, it's no longer protected by kASLR. Make it
+read-only on x86_64.
+
+On x86_32, it can't be RO because it's written by the CPU during task
+switches, and we use a task gate for double faults. I'd also be
+nervous about errata if we tried to make it RO even on configurations
+without double fault handling.
+
+[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So
+ it's probably safe to assume that it's a non issue, though Intel
+ might have been creative in that area. Still waiting for
+ confirmation. ]
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_32.S | 4 ++--
+ arch/x86/entry/entry_64.S | 8 ++++----
+ arch/x86/include/asm/fixmap.h | 13 +++++++++----
+ arch/x86/include/asm/processor.h | 17 ++++++++---------
+ arch/x86/include/asm/switch_to.h | 4 ++--
+ arch/x86/include/asm/thread_info.h | 2 +-
+ arch/x86/kernel/asm-offsets.c | 5 ++---
+ arch/x86/kernel/asm-offsets_32.c | 4 ++--
+ arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++----------
+ arch/x86/kernel/ioport.c | 2 +-
+ arch/x86/kernel/process.c | 6 +++---
+ arch/x86/kernel/process_32.c | 2 +-
+ arch/x86/kernel/process_64.c | 2 +-
+ arch/x86/kernel/traps.c | 4 ++--
+ arch/x86/lib/delay.c | 4 ++--
+ arch/x86/xen/enlighten_pv.c | 2 +-
+ 16 files changed, 60 insertions(+), 48 deletions(-)
+
+--- a/arch/x86/entry/entry_32.S
++++ b/arch/x86/entry/entry_32.S
+@@ -942,7 +942,7 @@ ENTRY(debug)
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Ldebug_from_sysenter_stack
+@@ -986,7 +986,7 @@ ENTRY(nmi)
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -154,7 +154,7 @@ END(native_usergs_sysret64)
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+ /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
++#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ ENTRY(entry_SYSCALL_64_trampoline)
+@@ -390,7 +390,7 @@ syscall_return_via_sysret:
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+@@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+@@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
+ /*
+ * Exception entry points.
+ */
+-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
++#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+ /*
+ * Switch to the thread stack. This is called with the IRET frame and
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -56,9 +56,14 @@ struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+- * The GDT is just below cpu_tss and thus serves (on x86_64) as a
+- * a read-only guard page for the SYSENTER stack at the bottom
+- * of the TSS region.
++ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
++ * a a read-only guard page.
++ */
++ struct SYSENTER_stack_page SYSENTER_stack_page;
++
++ /*
++ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
++ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+@@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get
+
+ static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+ {
+- return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
++ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+ }
+
+ #endif /* !__ASSEMBLY__ */
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -340,13 +340,11 @@ struct SYSENTER_stack {
+ unsigned long words[64];
+ };
+
+-struct tss_struct {
+- /*
+- * Space for the temporary SYSENTER stack, used for SYSENTER
+- * and the entry trampoline as well.
+- */
+- struct SYSENTER_stack SYSENTER_stack;
++struct SYSENTER_stack_page {
++ struct SYSENTER_stack stack;
++} __aligned(PAGE_SIZE);
+
++struct tss_struct {
+ /*
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+@@ -363,7 +361,7 @@ struct tss_struct {
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ } __aligned(PAGE_SIZE);
+
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+@@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_
+ #ifdef CONFIG_X86_32
+ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+ #else
+-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
++/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
++#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
+ #endif
+
+ /*
+@@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(
+ static inline void
+ native_load_sp0(unsigned long sp0)
+ {
+- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
+ }
+
+ static inline void native_swapgs(void)
+--- a/arch/x86/include/asm/switch_to.h
++++ b/arch/x86/include/asm/switch_to.h
+@@ -79,10 +79,10 @@ do { \
+ static inline void refresh_sysenter_cs(struct thread_struct *thread)
+ {
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
++ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
++ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+ #endif
+--- a/arch/x86/include/asm/thread_info.h
++++ b/arch/x86/include/asm/thread_info.h
+@@ -207,7 +207,7 @@ static inline int arch_within_stack_fram
+ #else /* !__ASSEMBLY__ */
+
+ #ifdef CONFIG_X86_64
+-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
++# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
+ #endif
+
+ #endif
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -94,10 +94,9 @@ void common(void) {
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+- OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
+- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+-
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
++ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+ }
+--- a/arch/x86/kernel/asm-offsets_32.c
++++ b/arch/x86/kernel/asm-offsets_32.c
+@@ -47,8 +47,8 @@ void foo(void)
+ BLANK();
+
+ /* Offset from the sysenter stack to tss.sp0 */
+- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+- offsetofend(struct tss_struct, SYSENTER_stack));
++ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
++ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
+
+ #ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+ #endif
+
++static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
++ SYSENTER_stack_storage);
++
+ static void __init
+ set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+ {
+@@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(
+ #ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+- /* On 64-bit systems, we use a read-only fixmap GDT. */
++ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
++ pgprot_t tss_prot = PAGE_KERNEL_RO;
+ #else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+- * a task gate needs to change an available TSS to busy. If the GDT
+- * is read-only, that will triple fault.
++ * a task gate needs to change an available TSS to busy. If the
++ * GDT is read-only, that will triple fault. The TSS cannot be
++ * read-only because the CPU writes to it on task switches.
+ *
+- * On Xen PV, the GDT must be read-only because the hypervisor requires
+- * it.
++ * On Xen PV, the GDT must be read-only because the hypervisor
++ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
++ pgprot_t tss_prot = PAGE_KERNEL;
+ #endif
+
+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
++ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
++ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+@@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+- &per_cpu(cpu_tss, cpu),
++ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE,
+- PAGE_KERNEL);
++ tss_prot);
+
+ #ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+@@ -1305,7 +1314,7 @@ void enable_sep_cpu(void)
+ return;
+
+ cpu = get_cpu();
+- tss = &per_cpu(cpu_tss, cpu);
++ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+@@ -1575,7 +1584,7 @@ void cpu_init(void)
+ if (cpu)
+ load_ucode_ap();
+
+- t = &per_cpu(cpu_tss, cpu);
++ t = &per_cpu(cpu_tss_rw, cpu);
+ oist = &per_cpu(orig_ist, cpu);
+
+ #ifdef CONFIG_NUMA
+@@ -1667,7 +1676,7 @@ void cpu_init(void)
+ {
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
++ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
+
+ wait_for_master_cpu(cpu);
+
+--- a/arch/x86/kernel/ioport.c
++++ b/arch/x86/kernel/ioport.c
+@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+- tss = &per_cpu(cpu_tss, get_cpu());
++ tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ if (turn_on)
+ bitmap_clear(t->io_bitmap_ptr, from, num);
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -47,7 +47,7 @@
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+ .x86_tss = {
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+@@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+ #endif
+ };
+-EXPORT_PER_CPU_SYMBOL(cpu_tss);
++EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
+
+ DEFINE_PER_CPU(bool, __tss_limit_invalid);
+ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
+@@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk
+ struct fpu *fpu = &t->fpu;
+
+ if (bp) {
+- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p,
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p,
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struc
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+- struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /*
+ * regs->sp points to the failing IRET frame on the
+@@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(st
+ * exception came from the IRET target.
+ */
+ struct bad_iret_stack *new_stack =
+- (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
++ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+--- a/arch/x86/lib/delay.c
++++ b/arch/x86/lib/delay.c
+@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long _
+ delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+ /*
+- * Use cpu_tss as a cacheline-aligned, seldomly
++ * Use cpu_tss_rw as a cacheline-aligned, seldomly
+ * accessed per-cpu variable as the monitor target.
+ */
+- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
++ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+ /*
+ * AMD, like Intel, supports the EAX hint and EAX=0xf
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long s
+ mcs = xen_mc_entry(0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
+ }
+
+ void xen_set_iopl_mask(unsigned mask)
--- /dev/null
+From a512210643da8082cb44181dba8b18e752bd68f0 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:04 -0700
+Subject: x86/entry/64: Merge the fast and slow SYSRET paths
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit a512210643da8082cb44181dba8b18e752bd68f0 upstream.
+
+They did almost the same thing. Remove a bunch of pointless
+instructions (mostly hidden in macros) and reduce cognitive load by
+merging them.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -221,10 +221,9 @@ entry_SYSCALL_64_fastpath:
+ TRACE_IRQS_ON /* user mode is traced as IRQs on */
+ movq RIP(%rsp), %rcx
+ movq EFLAGS(%rsp), %r11
+- RESTORE_C_REGS_EXCEPT_RCX_R11
+- movq RSP(%rsp), %rsp
++ addq $6*8, %rsp /* skip extra regs -- they were preserved */
+ UNWIND_HINT_EMPTY
+- USERGS_SYSRET64
++ jmp .Lpop_c_regs_except_rcx_r11_and_sysret
+
+ 1:
+ /*
+@@ -318,6 +317,7 @@ syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+ UNWIND_HINT_EMPTY
+ POP_EXTRA_REGS
++.Lpop_c_regs_except_rcx_r11_and_sysret:
+ popq %rsi /* skip r11 */
+ popq %r10
+ popq %r9
--- /dev/null
+From 8a055d7f411d41755ce30db5bb65b154777c4b78 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:00 -0700
+Subject: x86/entry/64: Move SWAPGS into the common IRET-to-usermode path
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 8a055d7f411d41755ce30db5bb65b154777c4b78 upstream.
+
+All of the code paths that ended up doing IRET to usermode did
+SWAPGS immediately beforehand. Move the SWAPGS into the common
+code.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------
+ arch/x86/entry/entry_64_compat.S | 3 +--
+ 2 files changed, 15 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -250,12 +250,14 @@ return_from_SYSCALL_64:
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+- * a completely clean 64-bit userspace context.
++ * a completely clean 64-bit userspace context. If we're not,
++ * go to the slow exit path.
+ */
+ movq RCX(%rsp), %rcx
+ movq RIP(%rsp), %r11
+- cmpq %rcx, %r11 /* RCX == RIP */
+- jne opportunistic_sysret_failed
++
++ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
++ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+@@ -273,14 +275,14 @@ return_from_SYSCALL_64:
+
+ /* If this changed %rcx, it was not canonical */
+ cmpq %rcx, %r11
+- jne opportunistic_sysret_failed
++ jne swapgs_restore_regs_and_return_to_usermode
+
+ cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
+- jne opportunistic_sysret_failed
++ jne swapgs_restore_regs_and_return_to_usermode
+
+ movq R11(%rsp), %r11
+ cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
+- jne opportunistic_sysret_failed
++ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+@@ -301,12 +303,12 @@ return_from_SYSCALL_64:
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+- jnz opportunistic_sysret_failed
++ jnz swapgs_restore_regs_and_return_to_usermode
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
+- jne opportunistic_sysret_failed
++ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * We win! This label is here just for ease of understanding
+@@ -319,10 +321,6 @@ syscall_return_via_sysret:
+ movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
+ USERGS_SYSRET64
+-
+-opportunistic_sysret_failed:
+- SWAPGS
+- jmp restore_regs_and_return_to_usermode
+ END(entry_SYSCALL_64)
+
+ ENTRY(stub_ptregs_64)
+@@ -423,8 +421,7 @@ ENTRY(ret_from_fork)
+ movq %rsp, %rdi
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ TRACE_IRQS_ON /* user mode is traced as IRQS on */
+- SWAPGS
+- jmp restore_regs_and_return_to_usermode
++ jmp swapgs_restore_regs_and_return_to_usermode
+
+ 1:
+ /* kernel thread */
+@@ -612,9 +609,8 @@ GLOBAL(retint_user)
+ mov %rsp,%rdi
+ call prepare_exit_to_usermode
+ TRACE_IRQS_IRETQ
+- SWAPGS
+
+-GLOBAL(restore_regs_and_return_to_usermode)
++GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+ #ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+ testl $3, CS(%rsp)
+@@ -622,6 +618,7 @@ GLOBAL(restore_regs_and_return_to_usermo
+ ud2
+ 1:
+ #endif
++ SWAPGS
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+@@ -1343,8 +1340,7 @@ ENTRY(nmi)
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts.
+ */
+- SWAPGS
+- jmp restore_regs_and_return_to_usermode
++ jmp swapgs_restore_regs_and_return_to_usermode
+
+ .Lnmi_from_kernel:
+ /*
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat)
+
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
+- SWAPGS
+- jmp restore_regs_and_return_to_usermode
++ jmp swapgs_restore_regs_and_return_to_usermode
+ END(entry_INT80_compat)
+
+ ENTRY(stub32_clone)
--- /dev/null
+From 40e7f949e0d9a33968ebde5d67f7e3a47c97742a Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:26 +0100
+Subject: x86/entry/64: Move the IST stacks into struct cpu_entry_area
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 40e7f949e0d9a33968ebde5d67f7e3a47c97742a upstream.
+
+The IST stacks are needed when an IST exception occurs and are accessed
+before any kernel code at all runs. Move them into struct cpu_entry_area.
+
+The IST stacks are unlike the rest of cpu_entry_area: they're used even for
+entries from kernel mode. This means that they should be set up before we
+load the final IDT. Move cpu_entry_area setup to trap_init() for the boot
+CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus().
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/fixmap.h | 12 ++++++
+ arch/x86/kernel/cpu/common.c | 74 +++++++++++++++++++++++-------------------
+ arch/x86/kernel/traps.c | 3 +
+ 3 files changed, 57 insertions(+), 32 deletions(-)
+
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -63,10 +63,22 @@ struct cpu_entry_area {
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
++
++#ifdef CONFIG_X86_64
++ /*
++ * Exception stacks used for IST entries.
++ *
++ * In the future, this should have a separate slot for each stack
++ * with guard pages between them.
++ */
++ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
++#endif
+ };
+
+ #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
++extern void setup_cpu_entry_areas(void);
++
+ /*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -466,24 +466,36 @@ void load_percpu_segment(int cpu)
+ load_stack_canary_segment();
+ }
+
+-static void set_percpu_fixmap_pages(int fixmap_index, void *ptr,
+- int pages, pgprot_t prot)
+-{
+- int i;
+-
+- for (i = 0; i < pages; i++) {
+- __set_fixmap(fixmap_index - i,
+- per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot);
+- }
+-}
+-
+ #ifdef CONFIG_X86_32
+ /* The 32-bit entry code needs to find cpu_entry_area. */
+ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+ #endif
+
++#ifdef CONFIG_X86_64
++/*
++ * Special IST stacks which the CPU switches to when it calls
++ * an IST-marked descriptor entry. Up to 7 stacks (hardware
++ * limit), all of them are 4K, except the debug stack which
++ * is 8K.
++ */
++static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
++ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
++ [DEBUG_STACK - 1] = DEBUG_STKSZ
++};
++
++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
++#endif
++
++static void __init
++set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
++{
++ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
++ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
++}
++
+ /* Setup the fixmap mappings only once per-processor */
+-static inline void setup_cpu_entry_area(int cpu)
++static void __init setup_cpu_entry_area(int cpu)
+ {
+ #ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+@@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(
+ PAGE_KERNEL);
+
+ #ifdef CONFIG_X86_32
+- this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
++ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+ #endif
+
+ #ifdef CONFIG_X86_64
++ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
++ BUILD_BUG_ON(sizeof(exception_stacks) !=
++ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
++ &per_cpu(exception_stacks, cpu),
++ sizeof(exception_stacks) / PAGE_SIZE,
++ PAGE_KERNEL);
++
+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+ #endif
+ }
+
++void __init setup_cpu_entry_areas(void)
++{
++ unsigned int cpu;
++
++ for_each_possible_cpu(cpu)
++ setup_cpu_entry_area(cpu);
++}
++
+ /* Load the original GDT from the per-cpu structure */
+ void load_direct_gdt(int cpu)
+ {
+@@ -1385,20 +1413,6 @@ DEFINE_PER_CPU(unsigned int, irq_count)
+ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+ EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+-/*
+- * Special IST stacks which the CPU switches to when it calls
+- * an IST-marked descriptor entry. Up to 7 stacks (hardware
+- * limit), all of them are 4K, except the debug stack which
+- * is 8K.
+- */
+-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+- [DEBUG_STACK - 1] = DEBUG_STKSZ
+-};
+-
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+-
+ /* May not be marked __init: used by software suspend */
+ void syscall_init(void)
+ {
+@@ -1607,7 +1621,7 @@ void cpu_init(void)
+ * set up and load the per-CPU TSS
+ */
+ if (!oist->ist[0]) {
+- char *estacks = per_cpu(exception_stacks, cpu);
++ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
+
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ estacks += exception_stack_sizes[v];
+@@ -1633,8 +1647,6 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
+- setup_cpu_entry_area(cpu);
+-
+ /*
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
+@@ -1694,8 +1706,6 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
+- setup_cpu_entry_area(cpu);
+-
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -947,6 +947,9 @@ dotraplinkage void do_iret_error(struct
+
+ void __init trap_init(void)
+ {
++ /* Init cpu_entry_area before IST entries are set up */
++ setup_cpu_entry_areas();
++
+ idt_setup_traps();
+
+ /*
--- /dev/null
+From e17f8234538d1ff708673f287a42457c4dee720d Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Mon, 4 Dec 2017 15:07:07 +0100
+Subject: x86/entry/64/paravirt: Use paravirt-safe macro to access eflags
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit e17f8234538d1ff708673f287a42457c4dee720d upstream.
+
+Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them
+NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that accesses eflags
+using 'pushfq' instruction when testing for IF bit. On PV Xen guests
+looking at IF flag directly will always see it set, resulting in 'ud2'.
+
+Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when
+running paravirt.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: xen-devel@lists.xenproject.org
+Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 7 ++++---
+ arch/x86/include/asm/irqflags.h | 3 +++
+ arch/x86/include/asm/paravirt.h | 9 +++++++++
+ arch/x86/kernel/asm-offsets_64.c | 3 +++
+ 4 files changed, 19 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -462,12 +462,13 @@ END(irq_entries_start)
+
+ .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+ #ifdef CONFIG_DEBUG_ENTRY
+- pushfq
+- testl $X86_EFLAGS_IF, (%rsp)
++ pushq %rax
++ SAVE_FLAGS(CLBR_RAX)
++ testl $X86_EFLAGS_IF, %eax
+ jz .Lokay_\@
+ ud2
+ .Lokay_\@:
+- addq $8, %rsp
++ popq %rax
+ #endif
+ .endm
+
+--- a/arch/x86/include/asm/irqflags.h
++++ b/arch/x86/include/asm/irqflags.h
+@@ -142,6 +142,9 @@ static inline notrace unsigned long arch
+ swapgs; \
+ sysretl
+
++#ifdef CONFIG_DEBUG_ENTRY
++#define SAVE_FLAGS(x) pushfq; popq %rax
++#endif
+ #else
+ #define INTERRUPT_RETURN iret
+ #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+--- a/arch/x86/include/asm/paravirt.h
++++ b/arch/x86/include/asm/paravirt.h
+@@ -927,6 +927,15 @@ extern void default_banner(void);
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
++
++#ifdef CONFIG_DEBUG_ENTRY
++#define SAVE_FLAGS(clobbers) \
++ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
++ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
++ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
++ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
++#endif
++
+ #endif /* CONFIG_X86_32 */
+
+ #endif /* __ASSEMBLY__ */
+--- a/arch/x86/kernel/asm-offsets_64.c
++++ b/arch/x86/kernel/asm-offsets_64.c
+@@ -23,6 +23,9 @@ int main(void)
+ #ifdef CONFIG_PARAVIRT
+ OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+ OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
++#ifdef CONFIG_DEBUG_ENTRY
++ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
++#endif
+ BLANK();
+ #endif
+
--- /dev/null
+From da51da189a24bb9b7e2d5a123be096e51a4695a5 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:10 -0700
+Subject: x86/entry/64: Pass SP0 directly to load_sp0()
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit da51da189a24bb9b7e2d5a123be096e51a4695a5 upstream.
+
+load_sp0() had an odd signature:
+
+ void load_sp0(struct tss_struct *tss, struct thread_struct *thread);
+
+Simplify it to:
+
+ void load_sp0(unsigned long sp0);
+
+Also simplify a few get_cpu()/put_cpu() sequences to
+preempt_disable()/preempt_enable().
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/paravirt.h | 5 ++---
+ arch/x86/include/asm/paravirt_types.h | 2 +-
+ arch/x86/include/asm/processor.h | 9 ++++-----
+ arch/x86/kernel/cpu/common.c | 4 ++--
+ arch/x86/kernel/process_32.c | 2 +-
+ arch/x86/kernel/process_64.c | 2 +-
+ arch/x86/kernel/vm86_32.c | 14 ++++++--------
+ arch/x86/xen/enlighten_pv.c | 7 +++----
+ 8 files changed, 20 insertions(+), 25 deletions(-)
+
+--- a/arch/x86/include/asm/paravirt.h
++++ b/arch/x86/include/asm/paravirt.h
+@@ -16,10 +16,9 @@
+ #include <linux/cpumask.h>
+ #include <asm/frame.h>
+
+-static inline void load_sp0(struct tss_struct *tss,
+- struct thread_struct *thread)
++static inline void load_sp0(unsigned long sp0)
+ {
+- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
++ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
+ }
+
+ /* The paravirtualized CPUID instruction. */
+--- a/arch/x86/include/asm/paravirt_types.h
++++ b/arch/x86/include/asm/paravirt_types.h
+@@ -134,7 +134,7 @@ struct pv_cpu_ops {
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
++ void (*load_sp0)(unsigned long sp0);
+
+ void (*set_iopl_mask)(unsigned mask);
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -518,9 +518,9 @@ static inline void native_set_iopl_mask(
+ }
+
+ static inline void
+-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
++native_load_sp0(unsigned long sp0)
+ {
+- tss->x86_tss.sp0 = thread->sp0;
++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ }
+
+ static inline void native_swapgs(void)
+@@ -545,10 +545,9 @@ static inline unsigned long current_top_
+ #else
+ #define __cpuid native_cpuid
+
+-static inline void load_sp0(struct tss_struct *tss,
+- struct thread_struct *thread)
++static inline void load_sp0(unsigned long sp0)
+ {
+- native_load_sp0(tss, thread);
++ native_load_sp0(sp0);
+ }
+
+ #define set_iopl_mask native_set_iopl_mask
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1570,7 +1570,7 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
+- load_sp0(t, ¤t->thread);
++ load_sp0(current->thread.sp0);
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+ load_mm_ldt(&init_mm);
+@@ -1625,7 +1625,7 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
+- load_sp0(t, thread);
++ load_sp0(thread->sp0);
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+ load_mm_ldt(&init_mm);
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p,
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
+ */
+- load_sp0(tss, next);
++ load_sp0(next->sp0);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p,
+ this_cpu_write(current_task, next_p);
+
+ /* Reload sp0. */
+- load_sp0(tss, next);
++ load_sp0(next->sp0);
+
+ /*
+ * Now maybe reload the debug registers and handle I/O bitmaps
+--- a/arch/x86/kernel/vm86_32.c
++++ b/arch/x86/kernel/vm86_32.c
+@@ -95,7 +95,6 @@
+
+ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
+ {
+- struct tss_struct *tss;
+ struct task_struct *tsk = current;
+ struct vm86plus_struct __user *user;
+ struct vm86 *vm86 = current->thread.vm86;
+@@ -147,13 +146,13 @@ void save_v86_state(struct kernel_vm86_r
+ do_exit(SIGSEGV);
+ }
+
+- tss = &per_cpu(cpu_tss, get_cpu());
++ preempt_disable();
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+- load_sp0(tss, &tsk->thread);
++ load_sp0(tsk->thread.sp0);
+ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+- put_cpu();
++ preempt_enable();
+
+ memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs));
+
+@@ -239,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd
+
+ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
+ {
+- struct tss_struct *tss;
+ struct task_struct *tsk = current;
+ struct vm86 *vm86 = tsk->thread.vm86;
+ struct kernel_vm86_regs vm86regs;
+@@ -367,8 +365,8 @@ static long do_sys_vm86(struct vm86plus_
+ vm86->saved_sp0 = tsk->thread.sp0;
+ lazy_save_gs(vm86->regs32.gs);
+
+- tss = &per_cpu(cpu_tss, get_cpu());
+ /* make room for real-mode segments */
++ preempt_disable();
+ tsk->thread.sp0 += 16;
+
+ if (static_cpu_has(X86_FEATURE_SEP)) {
+@@ -376,8 +374,8 @@ static long do_sys_vm86(struct vm86plus_
+ refresh_sysenter_cs(&tsk->thread);
+ }
+
+- load_sp0(tss, &tsk->thread);
+- put_cpu();
++ load_sp0(tsk->thread.sp0);
++ preempt_enable();
+
+ if (vm86->flags & VM86_SCREEN_BITMAP)
+ mark_screen_rdonly(tsk->mm);
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -811,15 +811,14 @@ static void __init xen_write_gdt_entry_b
+ }
+ }
+
+-static void xen_load_sp0(struct tss_struct *tss,
+- struct thread_struct *thread)
++static void xen_load_sp0(unsigned long sp0)
+ {
+ struct multicall_space mcs;
+
+ mcs = xen_mc_entry(0);
+- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
++ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+- tss->x86_tss.sp0 = thread->sp0;
++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ }
+
+ void xen_set_iopl_mask(unsigned mask)
--- /dev/null
+From 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:14 -0700
+Subject: x86/entry/64: Remove all remaining direct thread_struct::sp0 reads
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 upstream.
+
+The only remaining readers are in context switch code or vm86(), and
+they all just want to update TSS.sp0 to match the current task.
+Replace them all with a new helper update_sp0().
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/switch_to.h | 6 ++++++
+ arch/x86/kernel/process_32.c | 2 +-
+ arch/x86/kernel/process_64.c | 2 +-
+ arch/x86/kernel/vm86_32.c | 4 ++--
+ 4 files changed, 10 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/include/asm/switch_to.h
++++ b/arch/x86/include/asm/switch_to.h
+@@ -85,4 +85,10 @@ static inline void refresh_sysenter_cs(s
+ }
+ #endif
+
++/* This is used when switching tasks or entering/exiting vm86 mode. */
++static inline void update_sp0(struct task_struct *task)
++{
++ load_sp0(task->thread.sp0);
++}
++
+ #endif /* _ASM_X86_SWITCH_TO_H */
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p,
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
+ */
+- load_sp0(next->sp0);
++ update_sp0(next_p);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p,
+ this_cpu_write(current_task, next_p);
+
+ /* Reload sp0. */
+- load_sp0(next->sp0);
++ update_sp0(next_p);
+
+ /*
+ * Now maybe reload the debug registers and handle I/O bitmaps
+--- a/arch/x86/kernel/vm86_32.c
++++ b/arch/x86/kernel/vm86_32.c
+@@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_r
+ preempt_disable();
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+- load_sp0(tsk->thread.sp0);
++ update_sp0(tsk);
+ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+ preempt_enable();
+@@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_
+ refresh_sysenter_cs(&tsk->thread);
+ }
+
+- load_sp0(tsk->thread.sp0);
++ update_sp0(tsk);
+ preempt_enable();
+
+ if (vm86->flags & VM86_SCREEN_BITMAP)
--- /dev/null
+From c39858de696f0cc160a544455e8403d663d577e9 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:06 -0700
+Subject: x86/entry/64: Remove the RESTORE_..._REGS infrastructure
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit c39858de696f0cc160a544455e8403d663d577e9 upstream.
+
+All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and
+REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/calling.h | 52 -----------------------------------------------
+ 1 file changed, 52 deletions(-)
+
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -142,16 +142,6 @@ For 32-bit we have the following convent
+ UNWIND_HINT_REGS offset=\offset
+ .endm
+
+- .macro RESTORE_EXTRA_REGS offset=0
+- movq 0*8+\offset(%rsp), %r15
+- movq 1*8+\offset(%rsp), %r14
+- movq 2*8+\offset(%rsp), %r13
+- movq 3*8+\offset(%rsp), %r12
+- movq 4*8+\offset(%rsp), %rbp
+- movq 5*8+\offset(%rsp), %rbx
+- UNWIND_HINT_REGS offset=\offset extra=0
+- .endm
+-
+ .macro POP_EXTRA_REGS
+ popq %r15
+ popq %r14
+@@ -173,48 +163,6 @@ For 32-bit we have the following convent
+ popq %rdi
+ .endm
+
+- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
+- .if \rstor_r11
+- movq 6*8(%rsp), %r11
+- .endif
+- .if \rstor_r8910
+- movq 7*8(%rsp), %r10
+- movq 8*8(%rsp), %r9
+- movq 9*8(%rsp), %r8
+- .endif
+- .if \rstor_rax
+- movq 10*8(%rsp), %rax
+- .endif
+- .if \rstor_rcx
+- movq 11*8(%rsp), %rcx
+- .endif
+- .if \rstor_rdx
+- movq 12*8(%rsp), %rdx
+- .endif
+- movq 13*8(%rsp), %rsi
+- movq 14*8(%rsp), %rdi
+- UNWIND_HINT_IRET_REGS offset=16*8
+- .endm
+- .macro RESTORE_C_REGS
+- RESTORE_C_REGS_HELPER 1,1,1,1,1
+- .endm
+- .macro RESTORE_C_REGS_EXCEPT_RAX
+- RESTORE_C_REGS_HELPER 0,1,1,1,1
+- .endm
+- .macro RESTORE_C_REGS_EXCEPT_RCX
+- RESTORE_C_REGS_HELPER 1,0,1,1,1
+- .endm
+- .macro RESTORE_C_REGS_EXCEPT_R11
+- RESTORE_C_REGS_HELPER 1,1,0,1,1
+- .endm
+- .macro RESTORE_C_REGS_EXCEPT_RCX_R11
+- RESTORE_C_REGS_HELPER 1,0,0,1,1
+- .endm
+-
+- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+- subq $-(15*8+\addskip), %rsp
+- .endm
+-
+ .macro icebp
+ .byte 0xf1
+ .endm
--- /dev/null
+From 9da78ba6b47b46428cfdfc0851511ab29c869798 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:58:58 -0700
+Subject: x86/entry/64: Remove the restore_c_regs_and_iret label
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 9da78ba6b47b46428cfdfc0851511ab29c869798 upstream.
+
+The only user was the 64-bit opportunistic SYSRET failure path, and
+that path didn't really need it. This change makes the
+opportunistic SYSRET code a bit more straightforward and gets rid of
+the label.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -246,7 +246,6 @@ entry_SYSCALL64_slow_path:
+ call do_syscall_64 /* returns with IRQs disabled */
+
+ return_from_SYSCALL_64:
+- RESTORE_EXTRA_REGS
+ TRACE_IRQS_IRETQ /* we're about to change IF */
+
+ /*
+@@ -315,6 +314,7 @@ return_from_SYSCALL_64:
+ */
+ syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
++ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
+@@ -322,7 +322,7 @@ syscall_return_via_sysret:
+
+ opportunistic_sysret_failed:
+ SWAPGS
+- jmp restore_c_regs_and_iret
++ jmp restore_regs_and_iret
+ END(entry_SYSCALL_64)
+
+ ENTRY(stub_ptregs_64)
+@@ -639,7 +639,6 @@ retint_kernel:
+ */
+ GLOBAL(restore_regs_and_iret)
+ RESTORE_EXTRA_REGS
+-restore_c_regs_and_iret:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
--- /dev/null
+From 7fbbd5cbebf118a9e09f5453f686656a167c3d1c Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:27 +0100
+Subject: x86/entry/64: Remove the SYSENTER stack canary
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 7fbbd5cbebf118a9e09f5453f686656a167c3d1c upstream.
+
+Now that the SYSENTER stack has a guard page, there's no need for a canary
+to detect overflow after the fact.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 1 -
+ arch/x86/kernel/dumpstack.c | 3 +--
+ arch/x86/kernel/process.c | 1 -
+ arch/x86/kernel/traps.c | 7 -------
+ 4 files changed, 1 insertion(+), 11 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -341,7 +341,6 @@ struct tss_struct {
+ * Space for the temporary SYSENTER stack, used for SYSENTER
+ * and the entry trampoline as well.
+ */
+- unsigned long SYSENTER_stack_canary;
+ unsigned long SYSENTER_stack[64];
+
+ /*
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *st
+ int cpu = smp_processor_id();
+ struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
+
+- /* Treat the canary as part of the stack for unwinding purposes. */
+- void *begin = &tss->SYSENTER_stack_canary;
++ void *begin = &tss->SYSENTER_stack;
+ void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
+
+ if ((void *)stack < begin || (void *)stack >= end)
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -81,7 +81,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+ #endif
+- .SYSENTER_stack_canary = STACK_END_MAGIC,
+ };
+ EXPORT_PER_CPU_SYMBOL(cpu_tss);
+
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -814,13 +814,6 @@ dotraplinkage void do_debug(struct pt_re
+ debug_stack_usage_dec();
+
+ exit:
+- /*
+- * This is the most likely code path that involves non-trivial use
+- * of the SYSENTER stack. Check that we haven't overrun it.
+- */
+- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
+- "Overran or corrupted SYSENTER stack\n");
+-
+ ist_exit(regs);
+ }
+ NOKPROBE_SYMBOL(do_debug);
--- /dev/null
+From d375cf1530595e33961a8844192cddab913650e3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:16 -0700
+Subject: x86/entry/64: Remove thread_struct::sp0
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit d375cf1530595e33961a8844192cddab913650e3 upstream.
+
+On x86_64, we can easily calculate sp0 when needed instead of
+storing it in thread_struct.
+
+On x86_32, a similar cleanup would be possible, but it would require
+cleaning up the vm86 code first, and that can wait for a later
+cleanup series.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/compat.h | 1 +
+ arch/x86/include/asm/processor.h | 28 +++++++++-------------------
+ arch/x86/include/asm/switch_to.h | 6 ++++++
+ arch/x86/kernel/process_64.c | 1 -
+ 4 files changed, 16 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/include/asm/compat.h
++++ b/arch/x86/include/asm/compat.h
+@@ -7,6 +7,7 @@
+ */
+ #include <linux/types.h>
+ #include <linux/sched.h>
++#include <linux/sched/task_stack.h>
+ #include <asm/processor.h>
+ #include <asm/user32.h>
+ #include <asm/unistd.h>
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -431,7 +431,9 @@ typedef struct {
+ struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
++#ifdef CONFIG_X86_32
+ unsigned long sp0;
++#endif
+ unsigned long sp;
+ #ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+@@ -798,6 +800,13 @@ static inline void spin_lock_prefetch(co
+
+ #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
++#define task_pt_regs(task) \
++({ \
++ unsigned long __ptr = (unsigned long)task_stack_page(task); \
++ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
++ ((struct pt_regs *)__ptr) - 1; \
++})
++
+ #ifdef CONFIG_X86_32
+ /*
+ * User space process size: 3GB (default).
+@@ -817,23 +826,6 @@ static inline void spin_lock_prefetch(co
+ .addr_limit = KERNEL_DS, \
+ }
+
+-/*
+- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
+- * This is necessary to guarantee that the entire "struct pt_regs"
+- * is accessible even if the CPU haven't stored the SS/ESP registers
+- * on the stack (interrupt gate does not save these registers
+- * when switching to the same priv ring).
+- * Therefore beware: accessing the ss/esp fields of the
+- * "struct pt_regs" is possible, but they may contain the
+- * completely wrong values.
+- */
+-#define task_pt_regs(task) \
+-({ \
+- unsigned long __ptr = (unsigned long)task_stack_page(task); \
+- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+- ((struct pt_regs *)__ptr) - 1; \
+-})
+-
+ #define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+ #else
+@@ -867,11 +859,9 @@ static inline void spin_lock_prefetch(co
+ #define STACK_TOP_MAX TASK_SIZE_MAX
+
+ #define INIT_THREAD { \
+- .sp0 = TOP_OF_INIT_STACK, \
+ .addr_limit = KERNEL_DS, \
+ }
+
+-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+ extern unsigned long KSTK_ESP(struct task_struct *task);
+
+ #endif /* CONFIG_X86_64 */
+--- a/arch/x86/include/asm/switch_to.h
++++ b/arch/x86/include/asm/switch_to.h
+@@ -2,6 +2,8 @@
+ #ifndef _ASM_X86_SWITCH_TO_H
+ #define _ASM_X86_SWITCH_TO_H
+
++#include <linux/sched/task_stack.h>
++
+ struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+ struct task_struct *__switch_to_asm(struct task_struct *prev,
+@@ -88,7 +90,11 @@ static inline void refresh_sysenter_cs(s
+ /* This is used when switching tasks or entering/exiting vm86 mode. */
+ static inline void update_sp0(struct task_struct *task)
+ {
++#ifdef CONFIG_X86_32
+ load_sp0(task->thread.sp0);
++#else
++ load_sp0(task_top_of_stack(task));
++#endif
+ }
+
+ #endif /* _ASM_X86_SWITCH_TO_H */
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_
+ struct inactive_task_frame *frame;
+ struct task_struct *me = current;
+
+- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
--- /dev/null
+From 3e3b9293d392c577b62e24e4bc9982320438e749 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:24 +0100
+Subject: x86/entry/64: Return to userspace from the trampoline stack
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 3e3b9293d392c577b62e24e4bc9982320438e749 upstream.
+
+By itself, this is useless. It gives us the ability to run some final code
+before exit that cannot run on the kernel stack.  This could include a CR3
+switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for
+example. (Or even weird things like *changing* which kernel stack gets
+used as an ASLR-strengthening mechanism.)
+
+The SYSRET32 path is not covered yet. It could be in the future or
+we could just ignore it and force the slow path if needed.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 55 ++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 51 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -326,8 +326,24 @@ syscall_return_via_sysret:
+ popq %rsi /* skip rcx */
+ popq %rdx
+ popq %rsi
++
++ /*
++ * Now all regs are restored except RSP and RDI.
++ * Save old stack pointer and switch to trampoline stack.
++ */
++ movq %rsp, %rdi
++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
++
++ pushq RSP-RDI(%rdi) /* RSP */
++ pushq (%rdi) /* RDI */
++
++ /*
++ * We are on the trampoline stack. All regs except RDI are live.
++ * We can do future final exit work right here.
++ */
++
+ popq %rdi
+- movq RSP-ORIG_RAX(%rsp), %rsp
++ popq %rsp
+ USERGS_SYSRET64
+ END(entry_SYSCALL_64)
+
+@@ -630,10 +646,41 @@ GLOBAL(swapgs_restore_regs_and_return_to
+ ud2
+ 1:
+ #endif
+- SWAPGS
+ POP_EXTRA_REGS
+- POP_C_REGS
+- addq $8, %rsp /* skip regs->orig_ax */
++ popq %r11
++ popq %r10
++ popq %r9
++ popq %r8
++ popq %rax
++ popq %rcx
++ popq %rdx
++ popq %rsi
++
++ /*
++ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
++ * Save old stack pointer and switch to trampoline stack.
++ */
++ movq %rsp, %rdi
++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
++
++ /* Copy the IRET frame to the trampoline stack. */
++ pushq 6*8(%rdi) /* SS */
++ pushq 5*8(%rdi) /* RSP */
++ pushq 4*8(%rdi) /* EFLAGS */
++ pushq 3*8(%rdi) /* CS */
++ pushq 2*8(%rdi) /* RIP */
++
++ /* Push user RDI on the trampoline stack. */
++ pushq (%rdi)
++
++ /*
++ * We are on the trampoline stack. All regs except RDI are live.
++ * We can do future final exit work right here.
++ */
++
++ /* Restore RDI. */
++ popq %rdi
++ SWAPGS
+ INTERRUPT_RETURN
+
+
--- /dev/null
+From 9aaefe7b59ae00605256a7d6bd1c1456432495fc Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:21 +0100
+Subject: x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 9aaefe7b59ae00605256a7d6bd1c1456432495fc upstream.
+
+On 64-bit kernels, we used to assume that TSS.sp0 was the current
+top of stack. With the addition of an entry trampoline, this will
+no longer be the case. Store the current top of stack in TSS.sp1,
+which is otherwise unused but shares the same cacheline.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 18 +++++++++++++-----
+ arch/x86/include/asm/thread_info.h | 2 +-
+ arch/x86/kernel/asm-offsets_64.c | 1 +
+ arch/x86/kernel/process.c | 10 ++++++++++
+ arch/x86/kernel/process_64.c | 1 +
+ 5 files changed, 26 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -309,7 +309,13 @@ struct x86_hw_tss {
+ struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
++
++ /*
++ * We store cpu_current_top_of_stack in sp1 so it's always accessible.
++ * Linux does not use ring 1, so sp1 is not otherwise needed.
++ */
+ u64 sp1;
++
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+@@ -368,6 +374,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_
+
+ #ifdef CONFIG_X86_32
+ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
++#else
++#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
+ #endif
+
+ /*
+@@ -539,12 +547,12 @@ static inline void native_swapgs(void)
+
+ static inline unsigned long current_top_of_stack(void)
+ {
+-#ifdef CONFIG_X86_64
+- return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+-#else
+- /* sp0 on x86_32 is special in and around vm86 mode. */
++ /*
++ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
++ * and around vm86 mode and sp0 on x86_64 is special because of the
++ * entry trampoline.
++ */
+ return this_cpu_read_stable(cpu_current_top_of_stack);
+-#endif
+ }
+
+ static inline bool on_thread_stack(void)
+--- a/arch/x86/include/asm/thread_info.h
++++ b/arch/x86/include/asm/thread_info.h
+@@ -207,7 +207,7 @@ static inline int arch_within_stack_fram
+ #else /* !__ASSEMBLY__ */
+
+ #ifdef CONFIG_X86_64
+-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
++# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
+ #endif
+
+ #endif
+--- a/arch/x86/kernel/asm-offsets_64.c
++++ b/arch/x86/kernel/asm-offsets_64.c
+@@ -66,6 +66,7 @@ int main(void)
+
+ OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
++ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+ BLANK();
+
+ #ifdef CONFIG_CC_STACKPROTECTOR
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
++
++#ifdef CONFIG_X86_64
++ /*
++ * .sp1 is cpu_current_top_of_stack. The init task never
++ * runs user code, but cpu_current_top_of_stack should still
++ * be well defined before the first context switch.
++ */
++ .sp1 = TOP_OF_INIT_STACK,
++#endif
++
+ #ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p,
+ * Switch the PDA and FPU contexts.
+ */
+ this_cpu_write(current_task, next_p);
++ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+
+ /* Reload sp0. */
+ update_sp0(next_p);
--- /dev/null
+From 1e4c4f610f774df6088d7c065b2dd4d22adba698 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Thu, 2 Nov 2017 13:09:26 +0100
+Subject: x86/entry/64: Shorten TEST instructions
+
+From: Borislav Petkov <bp@suse.de>
+
+commit 1e4c4f610f774df6088d7c065b2dd4d22adba698 upstream.
+
+Convert TESTL to TESTB and save 3 bytes per callsite.
+
+No functionality change.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -621,7 +621,7 @@ GLOBAL(retint_user)
+ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+ #ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+- testl $3, CS(%rsp)
++ testb $3, CS(%rsp)
+ jnz 1f
+ ud2
+ 1:
+@@ -654,7 +654,7 @@ retint_kernel:
+ GLOBAL(restore_regs_and_return_to_kernel)
+ #ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+- testl $3, CS(%rsp)
++ testb $3, CS(%rsp)
+ jz 1f
+ ud2
+ 1:
--- /dev/null
+From e53178328c9b96fbdbc719e78c93b5687ee007c3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:02 -0700
+Subject: x86/entry/64: Shrink paranoid_exit_restore and make labels local
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit e53178328c9b96fbdbc719e78c93b5687ee007c3 upstream.
+
+paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel.
+Merge them and make the paranoid_exit internal labels local.
+
+Keeping .Lparanoid_exit makes the code a bit shorter because it
+allows a 2-byte jnz instead of a 5-byte jnz.
+
+Saves 96 bytes of text.
+
+( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS
+ kernel, but fixing that would make the code rather messy. )
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 13 +++++--------
+ 1 file changed, 5 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1124,17 +1124,14 @@ ENTRY(paranoid_exit)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF_DEBUG
+ testl %ebx, %ebx /* swapgs needed? */
+- jnz paranoid_exit_no_swapgs
++ jnz .Lparanoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
+ SWAPGS_UNSAFE_STACK
+- jmp paranoid_exit_restore
+-paranoid_exit_no_swapgs:
++ jmp .Lparanoid_exit_restore
++.Lparanoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+-paranoid_exit_restore:
+- RESTORE_EXTRA_REGS
+- RESTORE_C_REGS
+- REMOVE_PT_GPREGS_FROM_STACK 8
+- INTERRUPT_RETURN
++.Lparanoid_exit_restore:
++ jmp restore_regs_and_return_to_kernel
+ END(paranoid_exit)
+
+ /*
--- /dev/null
+From e872045bfd9c465a8555bab4b8567d56a4d2d3bb Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:01 -0700
+Subject: x86/entry/64: Simplify reg restore code in the standard IRET paths
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit e872045bfd9c465a8555bab4b8567d56a4d2d3bb upstream.
+
+The old code restored all the registers with movq instead of pop.
+
+In theory, this was done because some CPUs have higher movq
+throughput, but any gain there would be tiny and is almost certainly
+outweighed by the higher text size.
+
+This saves 96 bytes of text.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/calling.h | 21 +++++++++++++++++++++
+ arch/x86/entry/entry_64.S | 12 ++++++------
+ 2 files changed, 27 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -152,6 +152,27 @@ For 32-bit we have the following convent
+ UNWIND_HINT_REGS offset=\offset extra=0
+ .endm
+
++ .macro POP_EXTRA_REGS
++ popq %r15
++ popq %r14
++ popq %r13
++ popq %r12
++ popq %rbp
++ popq %rbx
++ .endm
++
++ .macro POP_C_REGS
++ popq %r11
++ popq %r10
++ popq %r9
++ popq %r8
++ popq %rax
++ popq %rcx
++ popq %rdx
++ popq %rsi
++ popq %rdi
++ .endm
++
+ .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
+ .if \rstor_r11
+ movq 6*8(%rsp), %r11
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -619,9 +619,9 @@ GLOBAL(swapgs_restore_regs_and_return_to
+ 1:
+ #endif
+ SWAPGS
+- RESTORE_EXTRA_REGS
+- RESTORE_C_REGS
+- REMOVE_PT_GPREGS_FROM_STACK 8
++ POP_EXTRA_REGS
++ POP_C_REGS
++ addq $8, %rsp /* skip regs->orig_ax */
+ INTERRUPT_RETURN
+
+
+@@ -651,9 +651,9 @@ GLOBAL(restore_regs_and_return_to_kernel
+ ud2
+ 1:
+ #endif
+- RESTORE_EXTRA_REGS
+- RESTORE_C_REGS
+- REMOVE_PT_GPREGS_FROM_STACK 8
++ POP_EXTRA_REGS
++ POP_C_REGS
++ addq $8, %rsp /* skip regs->orig_ax */
+ INTERRUPT_RETURN
+
+ ENTRY(native_iret)
--- /dev/null
+From 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:58:59 -0700
+Subject: x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 upstream.
+
+These code paths will diverge soon.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++++---------
+ arch/x86/entry/entry_64_compat.S | 2 +-
+ arch/x86/kernel/head_64.S | 2 +-
+ 3 files changed, 27 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -322,7 +322,7 @@ syscall_return_via_sysret:
+
+ opportunistic_sysret_failed:
+ SWAPGS
+- jmp restore_regs_and_iret
++ jmp restore_regs_and_return_to_usermode
+ END(entry_SYSCALL_64)
+
+ ENTRY(stub_ptregs_64)
+@@ -424,7 +424,7 @@ ENTRY(ret_from_fork)
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWAPGS
+- jmp restore_regs_and_iret
++ jmp restore_regs_and_return_to_usermode
+
+ 1:
+ /* kernel thread */
+@@ -613,7 +613,20 @@ GLOBAL(retint_user)
+ call prepare_exit_to_usermode
+ TRACE_IRQS_IRETQ
+ SWAPGS
+- jmp restore_regs_and_iret
++
++GLOBAL(restore_regs_and_return_to_usermode)
++#ifdef CONFIG_DEBUG_ENTRY
++ /* Assert that pt_regs indicates user mode. */
++ testl $3, CS(%rsp)
++ jnz 1f
++ ud2
++1:
++#endif
++ RESTORE_EXTRA_REGS
++ RESTORE_C_REGS
++ REMOVE_PT_GPREGS_FROM_STACK 8
++ INTERRUPT_RETURN
++
+
+ /* Returning to kernel space */
+ retint_kernel:
+@@ -633,11 +646,14 @@ retint_kernel:
+ */
+ TRACE_IRQS_IRETQ
+
+-/*
+- * At this label, code paths which return to kernel and to user,
+- * which come from interrupts/exception and from syscalls, merge.
+- */
+-GLOBAL(restore_regs_and_iret)
++GLOBAL(restore_regs_and_return_to_kernel)
++#ifdef CONFIG_DEBUG_ENTRY
++ /* Assert that pt_regs indicates kernel mode. */
++ testl $3, CS(%rsp)
++ jz 1f
++ ud2
++1:
++#endif
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+@@ -1328,7 +1344,7 @@ ENTRY(nmi)
+ * work, because we don't want to enable interrupts.
+ */
+ SWAPGS
+- jmp restore_regs_and_iret
++ jmp restore_regs_and_return_to_usermode
+
+ .Lnmi_from_kernel:
+ /*
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -338,7 +338,7 @@ ENTRY(entry_INT80_compat)
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
+ SWAPGS
+- jmp restore_regs_and_iret
++ jmp restore_regs_and_return_to_usermode
+ END(entry_INT80_compat)
+
+ ENTRY(stub32_clone)
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -328,7 +328,7 @@ early_idt_handler_common:
+
+ 20:
+ decl early_recursion_flag(%rip)
+- jmp restore_regs_and_iret
++ jmp restore_regs_and_return_to_kernel
+ END(early_idt_handler_common)
+
+ __INITDATA
--- /dev/null
+From 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:13 -0700
+Subject: x86/entry/64: Stop initializing TSS.sp0 at boot
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb upstream.
+
+In my quest to get rid of thread_struct::sp0, I want to clean up or
+remove all of its readers. Two of them are in cpu_init() (32-bit and
+64-bit), and they aren't needed. This is because we never enter
+userspace at all on the threads that CPUs are initialized in.
+
+Poison the initial TSS.sp0 and stop initializing it on CPU init.
+
+The comment text mostly comes from Dave Hansen. Thanks!
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/common.c | 13 ++++++++++---
+ arch/x86/kernel/process.c | 8 +++++++-
+ 2 files changed, 17 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1570,9 +1570,13 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
+- load_sp0(current->thread.sp0);
++ /*
++ * Initialize the TSS. Don't bother initializing sp0, as the initial
++ * task never enters user mode.
++ */
+ set_tss_desc(cpu, t);
+ load_TR_desc();
++
+ load_mm_ldt(&init_mm);
+
+ clear_all_debug_regs();
+@@ -1594,7 +1598,6 @@ void cpu_init(void)
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+- struct thread_struct *thread = &curr->thread;
+
+ wait_for_master_cpu(cpu);
+
+@@ -1625,9 +1628,13 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
+- load_sp0(thread->sp0);
++ /*
++ * Initialize the TSS. Don't bother initializing sp0, as the initial
++ * task never enters user mode.
++ */
+ set_tss_desc(cpu, t);
+ load_TR_desc();
++
+ load_mm_ldt(&init_mm);
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -49,7 +49,13 @@
+ */
+ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+ .x86_tss = {
+- .sp0 = TOP_OF_INIT_STACK,
++ /*
++ * .sp0 is only used when entering ring 0 from a lower
++ * privilege level. Since the init task never runs anything
++ * but ring 0 code, there is no need for a valid value here.
++ * Poison it.
++ */
++ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+ #ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
--- /dev/null
+From 7f2590a110b837af5679d08fc25c6227c5a8c497 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:23 +0100
+Subject: x86/entry/64: Use a per-CPU trampoline stack for IDT entries
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 7f2590a110b837af5679d08fc25c6227c5a8c497 upstream.
+
+Historically, IDT entries from usermode have always gone directly
+to the running task's kernel stack. Rearrange it so that we enter on
+a per-CPU trampoline stack and then manually switch to the task's stack.
+This touches a couple of extra cachelines, but it gives us a chance
+to run some code before we touch the kernel stack.
+
+The asm isn't exactly beautiful, but I think that fully refactoring
+it can wait.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 67 +++++++++++++++++++++++++++++----------
+ arch/x86/entry/entry_64_compat.S | 5 ++
+ arch/x86/include/asm/switch_to.h | 4 +-
+ arch/x86/include/asm/traps.h | 1
+ arch/x86/kernel/cpu/common.c | 6 ++-
+ arch/x86/kernel/traps.c | 21 ++++++------
+ 6 files changed, 72 insertions(+), 32 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -560,6 +560,13 @@ END(irq_entries_start)
+ /* 0(%rsp): ~(interrupt number) */
+ .macro interrupt func
+ cld
++
++ testb $3, CS-ORIG_RAX(%rsp)
++ jz 1f
++ SWAPGS
++ call switch_to_thread_stack
++1:
++
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+@@ -569,12 +576,8 @@ END(irq_entries_start)
+ jz 1f
+
+ /*
+- * IRQ from user mode. Switch to kernel gsbase and inform context
+- * tracking that we're in kernel mode.
+- */
+- SWAPGS
+-
+- /*
++ * IRQ from user mode.
++ *
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks). Since TRACE_IRQS_OFF idempotent,
+@@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
+ */
+ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+
++/*
++ * Switch to the thread stack. This is called with the IRET frame and
++ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and
++ * space has not been allocated for them.)
++ */
++ENTRY(switch_to_thread_stack)
++ UNWIND_HINT_FUNC
++
++ pushq %rdi
++ movq %rsp, %rdi
++ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
++ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
++
++ pushq 7*8(%rdi) /* regs->ss */
++ pushq 6*8(%rdi) /* regs->rsp */
++ pushq 5*8(%rdi) /* regs->eflags */
++ pushq 4*8(%rdi) /* regs->cs */
++ pushq 3*8(%rdi) /* regs->ip */
++ pushq 2*8(%rdi) /* regs->orig_ax */
++ pushq 8(%rdi) /* return address */
++ UNWIND_HINT_FUNC
++
++ movq (%rdi), %rdi
++ ret
++END(switch_to_thread_stack)
++
+ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+ ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+@@ -845,11 +874,12 @@ ENTRY(\sym)
+
+ ALLOC_PT_GPREGS_ON_STACK
+
+- .if \paranoid
+- .if \paranoid == 1
++ .if \paranoid < 2
+ testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
+- jnz 1f
++ jnz .Lfrom_usermode_switch_stack_\@
+ .endif
++
++ .if \paranoid
+ call paranoid_entry
+ .else
+ call error_entry
+@@ -891,20 +921,15 @@ ENTRY(\sym)
+ jmp error_exit
+ .endif
+
+- .if \paranoid == 1
++ .if \paranoid < 2
+ /*
+- * Paranoid entry from userspace. Switch stacks and treat it
++ * Entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+-1:
++.Lfrom_usermode_switch_stack_\@:
+ call error_entry
+
+-
+- movq %rsp, %rdi /* pt_regs pointer */
+- call sync_regs
+- movq %rax, %rsp /* switch stack */
+-
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ .if \has_error_code
+@@ -1165,6 +1190,14 @@ ENTRY(error_entry)
+ SWAPGS
+
+ .Lerror_entry_from_usermode_after_swapgs:
++ /* Put us onto the real thread stack. */
++ popq %r12 /* save return addr in %12 */
++ movq %rsp, %rdi /* arg0 = pt_regs pointer */
++ call sync_regs
++ movq %rax, %rsp /* switch stack */
++ ENCODE_FRAME_POINTER
++ pushq %r12
++
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
+ */
+ movl %eax, %eax
+
+- /* Construct struct pt_regs on stack (iret frame is already on stack) */
+ pushq %rax /* pt_regs->orig_ax */
++
++ /* switch to thread stack expects orig_ax to be pushed */
++ call switch_to_thread_stack
++
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+--- a/arch/x86/include/asm/switch_to.h
++++ b/arch/x86/include/asm/switch_to.h
+@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(s
+ /* This is used when switching tasks or entering/exiting vm86 mode. */
+ static inline void update_sp0(struct task_struct *task)
+ {
++ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
+ #ifdef CONFIG_X86_32
+ load_sp0(task->thread.sp0);
+ #else
+- load_sp0(task_top_of_stack(task));
++ if (static_cpu_has(X86_FEATURE_XENPV))
++ load_sp0(task_top_of_stack(task));
+ #endif
+ }
+
+--- a/arch/x86/include/asm/traps.h
++++ b/arch/x86/include/asm/traps.h
+@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_presen
+ dotraplinkage void do_stack_segment(struct pt_regs *, long);
+ #ifdef CONFIG_X86_64
+ dotraplinkage void do_double_fault(struct pt_regs *, long);
+-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
+ #endif
+ dotraplinkage void do_general_protection(struct pt_regs *, long);
+ dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1623,11 +1623,13 @@ void cpu_init(void)
+ setup_cpu_entry_area(cpu);
+
+ /*
+- * Initialize the TSS. Don't bother initializing sp0, as the initial
+- * task never enters user mode.
++ * Initialize the TSS. sp0 points to the entry trampoline stack
++ * regardless of what task is running.
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
++ load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss +
++ offsetofend(struct tss_struct, SYSENTER_stack));
+
+ load_mm_ldt(&init_mm);
+
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3);
+
+ #ifdef CONFIG_X86_64
+ /*
+- * Help handler running on IST stack to switch off the IST stack if the
+- * interrupted code was in user mode. The actual stack switch is done in
+- * entry_64.S
++ * Help handler running on a per-cpu (IST or entry trampoline) stack
++ * to switch to the normal thread stack if the interrupted code was in
++ * user mode. The actual stack switch is done in entry_64.S
+ */
+ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
+ {
+- struct pt_regs *regs = task_pt_regs(current);
+- *regs = *eregs;
++ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
++ if (regs != eregs)
++ *regs = *eregs;
+ return regs;
+ }
+ NOKPROBE_SYMBOL(sync_regs);
+@@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(st
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+- * correctly, we want move our stack frame to task_pt_regs
+- * and we want to pretend that the exception came from the
+- * iret target.
++ * correctly, we want to move our stack frame to where it would
++ * be had we entered directly on the entry stack (rather than
++ * just below the IRET frame) and we want to pretend that the
++ * exception came from the IRET target.
+ */
+ struct bad_iret_stack *new_stack =
+- container_of(task_pt_regs(current),
+- struct bad_iret_stack, regs);
++ (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
--- /dev/null
+From 471ee4832209e986029b9fabdaad57b1eecb856b Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:05 -0700
+Subject: x86/entry/64: Use POP instead of MOV to restore regs on NMI return
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 471ee4832209e986029b9fabdaad57b1eecb856b upstream.
+
+This gets rid of the last user of the old RESTORE_..._REGS infrastructure.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1560,11 +1560,14 @@ end_repeat_nmi:
+ nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+- RESTORE_EXTRA_REGS
+- RESTORE_C_REGS
++ POP_EXTRA_REGS
++ POP_C_REGS
+
+- /* Point RSP at the "iret" frame. */
+- REMOVE_PT_GPREGS_FROM_STACK 6*8
++ /*
++ * Skip orig_ax and the "outermost" frame to point RSP
++ * at the "iret" frame.
++ */
++ addq $6*8, %rsp
+
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
--- /dev/null
+From 4fbb39108f972437c44e5ffa781b56635d496826 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:03 -0700
+Subject: x86/entry/64: Use pop instead of movq in syscall_return_via_sysret
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 4fbb39108f972437c44e5ffa781b56635d496826 upstream.
+
+Saves 64 bytes.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -316,10 +316,18 @@ return_from_SYSCALL_64:
+ */
+ syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+- RESTORE_EXTRA_REGS
+- RESTORE_C_REGS_EXCEPT_RCX_R11
+- movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
++ POP_EXTRA_REGS
++ popq %rsi /* skip r11 */
++ popq %r10
++ popq %r9
++ popq %r8
++ popq %rax
++ popq %rsi /* skip rcx */
++ popq %rdx
++ popq %rsi
++ popq %rdi
++ movq RSP-ORIG_RAX(%rsp), %rsp
+ USERGS_SYSRET64
+ END(entry_SYSCALL_64)
+
--- /dev/null
+From 3500130b84a3cdc5b6796eba1daf178944935efe Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:11 -0700
+Subject: x86/entry: Add task_top_of_stack() to find the top of a task's stack
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 3500130b84a3cdc5b6796eba1daf178944935efe upstream.
+
+This will let us get rid of a few places that hardcode accesses to
+thread.sp0.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -796,6 +796,8 @@ static inline void spin_lock_prefetch(co
+ #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+ TOP_OF_KERNEL_STACK_PADDING)
+
++#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
++
+ #ifdef CONFIG_X86_32
+ /*
+ * User space process size: 3GB (default).
--- /dev/null
+From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:28 +0100
+Subject: x86/entry: Clean up the SYSENTER_stack code
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 0f9a48100fba3f189724ae88a450c2261bf91c80 upstream.
+
+The existing code was a mess, mainly because C arrays are nasty. Turn
+SYSENTER_stack into a struct, add a helper to find it, and do all the
+obvious cleanups this enables.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_32.S | 4 ++--
+ arch/x86/entry/entry_64.S | 2 +-
+ arch/x86/include/asm/fixmap.h | 5 +++++
+ arch/x86/include/asm/processor.h | 6 +++++-
+ arch/x86/kernel/asm-offsets.c | 6 ++----
+ arch/x86/kernel/cpu/common.c | 14 +++-----------
+ arch/x86/kernel/dumpstack.c | 7 +++----
+ 7 files changed, 21 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/entry/entry_32.S
++++ b/arch/x86/entry/entry_32.S
+@@ -942,7 +942,7 @@ ENTRY(debug)
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Ldebug_from_sysenter_stack
+@@ -986,7 +986,7 @@ ENTRY(nmi)
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -154,7 +154,7 @@ END(native_usergs_sysret64)
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+ /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ ENTRY(entry_SYSCALL_64_trampoline)
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get
+ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+ }
+
++static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
++{
++ return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
++}
++
+ #endif /* !__ASSEMBLY__ */
+ #endif /* _ASM_X86_FIXMAP_H */
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -336,12 +336,16 @@ struct x86_hw_tss {
+ #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
+ #define INVALID_IO_BITMAP_OFFSET 0x8000
+
++struct SYSENTER_stack {
++ unsigned long words[64];
++};
++
+ struct tss_struct {
+ /*
+ * Space for the temporary SYSENTER stack, used for SYSENTER
+ * and the entry trampoline as well.
+ */
+- unsigned long SYSENTER_stack[64];
++ struct SYSENTER_stack SYSENTER_stack;
+
+ /*
+ * The fixed hardware portion. This must not cross a page boundary
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -94,10 +94,8 @@ void common(void) {
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+- /* Offset from cpu_tss to SYSENTER_stack */
+- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+- /* Size of SYSENTER_stack */
+- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
++ OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1314,12 +1314,7 @@ void enable_sep_cpu(void)
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+-
+- wrmsr(MSR_IA32_SYSENTER_ESP,
+- (unsigned long)&get_cpu_entry_area(cpu)->tss +
+- offsetofend(struct tss_struct, SYSENTER_stack),
+- 0);
+-
++ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+
+ put_cpu();
+@@ -1436,9 +1431,7 @@ void syscall_init(void)
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+- (unsigned long)&get_cpu_entry_area(cpu)->tss +
+- offsetofend(struct tss_struct, SYSENTER_stack));
++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ #else
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+@@ -1653,8 +1646,7 @@ void cpu_init(void)
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+- load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss +
+- offsetofend(struct tss_struct, SYSENTER_stack));
++ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+
+ load_mm_ldt(&init_mm);
+
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack,
+
+ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+ {
+- int cpu = smp_processor_id();
+- struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
++ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+
+- void *begin = &tss->SYSENTER_stack;
+- void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
++ void *begin = ss;
++ void *end = ss + 1;
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
--- /dev/null
+From 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:17 +0100
+Subject: x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 upstream.
+
+A future patch will move SYSENTER_stack to the beginning of cpu_tss
+to help detect overflow. Before this can happen, fix several code
+paths that hardcode assumptions about the old layout.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Dave Hansen <dave.hansen@intel.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/desc.h | 2 +-
+ arch/x86/include/asm/processor.h | 9 +++++++--
+ arch/x86/kernel/cpu/common.c | 8 ++++----
+ arch/x86/kernel/doublefault.c | 32 +++++++++++++++-----------------
+ arch/x86/kvm/vmx.c | 2 +-
+ arch/x86/power/cpu.c | 13 +++++++------
+ 6 files changed, 35 insertions(+), 31 deletions(-)
+
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -178,7 +178,7 @@ static inline void set_tssldt_descriptor
+ #endif
+ }
+
+-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
++static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
+ {
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
+ tss_desc tss;
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -162,7 +162,7 @@ enum cpuid_regs_idx {
+ extern struct cpuinfo_x86 boot_cpu_data;
+ extern struct cpuinfo_x86 new_cpu_data;
+
+-extern struct tss_struct doublefault_tss;
++extern struct x86_hw_tss doublefault_tss;
+ extern __u32 cpu_caps_cleared[NCAPINTS];
+ extern __u32 cpu_caps_set[NCAPINTS];
+
+@@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir
+ write_cr3(__sme_pa(pgdir));
+ }
+
++/*
++ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
++ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
++ * unrelated to the task-switch mechanism:
++ */
+ #ifdef CONFIG_X86_32
+ /* This is the TSS defined by the hardware. */
+ struct x86_hw_tss {
+@@ -322,7 +327,7 @@ struct x86_hw_tss {
+ #define IO_BITMAP_BITS 65536
+ #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+ #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
++#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
+ #define INVALID_IO_BITMAP_OFFSET 0x8000
+
+ struct tss_struct {
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1557,7 +1557,7 @@ void cpu_init(void)
+ }
+ }
+
+- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+ /*
+ * <= is required because the CPU will access up to
+@@ -1576,7 +1576,7 @@ void cpu_init(void)
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
+- set_tss_desc(cpu, t);
++ set_tss_desc(cpu, &t->x86_tss);
+ load_TR_desc();
+
+ load_mm_ldt(&init_mm);
+@@ -1634,12 +1634,12 @@ void cpu_init(void)
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
+- set_tss_desc(cpu, t);
++ set_tss_desc(cpu, &t->x86_tss);
+ load_TR_desc();
+
+ load_mm_ldt(&init_mm);
+
+- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+ #ifdef CONFIG_DOUBLEFAULT
+ /* Set up doublefault TSS pointer in the GDT */
+--- a/arch/x86/kernel/doublefault.c
++++ b/arch/x86/kernel/doublefault.c
+@@ -50,25 +50,23 @@ static void doublefault_fn(void)
+ cpu_relax();
+ }
+
+-struct tss_struct doublefault_tss __cacheline_aligned = {
+- .x86_tss = {
+- .sp0 = STACK_START,
+- .ss0 = __KERNEL_DS,
+- .ldt = 0,
+- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
++struct x86_hw_tss doublefault_tss __cacheline_aligned = {
++ .sp0 = STACK_START,
++ .ss0 = __KERNEL_DS,
++ .ldt = 0,
++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+- .ip = (unsigned long) doublefault_fn,
+- /* 0x2 bit is always set */
+- .flags = X86_EFLAGS_SF | 0x2,
+- .sp = STACK_START,
+- .es = __USER_DS,
+- .cs = __KERNEL_CS,
+- .ss = __KERNEL_DS,
+- .ds = __USER_DS,
+- .fs = __KERNEL_PERCPU,
++ .ip = (unsigned long) doublefault_fn,
++ /* 0x2 bit is always set */
++ .flags = X86_EFLAGS_SF | 0x2,
++ .sp = STACK_START,
++ .es = __USER_DS,
++ .cs = __KERNEL_CS,
++ .ss = __KERNEL_DS,
++ .ds = __USER_DS,
++ .fs = __KERNEL_PERCPU,
+
+- .__cr3 = __pa_nodebug(swapper_pg_dir),
+- }
++ .__cr3 = __pa_nodebug(swapper_pg_dir),
+ };
+
+ /* dummy for do_double_fault() call */
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -2295,7 +2295,7 @@ static void vmx_vcpu_load(struct kvm_vcp
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+- (unsigned long)this_cpu_ptr(&cpu_tss));
++ (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss));
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ /*
+--- a/arch/x86/power/cpu.c
++++ b/arch/x86/power/cpu.c
+@@ -165,12 +165,13 @@ static void fix_processor_context(void)
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
+ tss_desc tss;
+ #endif
+- set_tss_desc(cpu, t); /*
+- * This just modifies memory; should not be
+- * necessary. But... This is necessary, because
+- * 386 hardware has concept of busy TSS or some
+- * similar stupidity.
+- */
++
++ /*
++ * This just modifies memory; should not be necessary. But... This is
++ * necessary, because 386 hardware has concept of busy TSS or some
++ * similar stupidity.
++ */
++ set_tss_desc(cpu, &t->x86_tss);
+
+ #ifdef CONFIG_X86_64
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
--- /dev/null
+From aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:14 +0100
+Subject: x86/entry/gdt: Put per-CPU GDT remaps in ascending order
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 upstream.
+
+We currently have CPU 0's GDT at the top of the GDT range and
+higher-numbered CPUs at lower addresses. This happens because the
+fixmap is upside down (index 0 is the top of the fixmap).
+
+Flip it so that GDTs are in ascending order by virtual address.
+This will simplify a future patch that will generalize the GDT
+remap to contain multiple pages.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/desc.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -63,7 +63,7 @@ static inline struct desc_struct *get_cu
+ /* Get the fixmap index for a specific processor */
+ static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+ {
+- return FIX_GDT_REMAP_BEGIN + cpu;
++ return FIX_GDT_REMAP_END - cpu;
+ }
+
+ /* Provide the fixmap address of the remapped GDT */
--- /dev/null
+From 1a935bc3d4ea61556461a9e92a68ca3556232efd Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:19 +0100
+Subject: x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 1a935bc3d4ea61556461a9e92a68ca3556232efd upstream.
+
+SYSENTER_stack should have reliable overflow detection, which
+means that it needs to be at the bottom of a page, not the top.
+Move it to the beginning of struct tss_struct and page-align it.
+
+Also add an assertion to make sure that the fixed hardware TSS
+doesn't cross a page boundary.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 21 ++++++++++++---------
+ arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++
+ 2 files changed, 33 insertions(+), 9 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -332,7 +332,16 @@ struct x86_hw_tss {
+
+ struct tss_struct {
+ /*
+- * The hardware state:
++ * Space for the temporary SYSENTER stack, used for SYSENTER
++ * and the entry trampoline as well.
++ */
++ unsigned long SYSENTER_stack_canary;
++ unsigned long SYSENTER_stack[64];
++
++ /*
++ * The fixed hardware portion. This must not cross a page boundary
++ * at risk of violating the SDM's advice and potentially triggering
++ * errata.
+ */
+ struct x86_hw_tss x86_tss;
+
+@@ -343,15 +352,9 @@ struct tss_struct {
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
++} __aligned(PAGE_SIZE);
+
+- /*
+- * Space for the temporary SYSENTER stack.
+- */
+- unsigned long SYSENTER_stack_canary;
+- unsigned long SYSENTER_stack[64];
+-} ____cacheline_aligned;
+-
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(
+ #endif
+
+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
++
++ /*
++ * The Intel SDM says (Volume 3, 7.2.1):
++ *
++ * Avoid placing a page boundary in the part of the TSS that the
++ * processor reads during a task switch (the first 104 bytes). The
++ * processor may not correctly perform address translations if a
++ * boundary occurs in this area. During a task switch, the processor
++ * reads and writes into the first 104 bytes of each TSS (using
++ * contiguous physical addresses beginning with the physical address
++ * of the first byte of the TSS). So, after TSS access begins, if
++ * part of the 104 bytes is not physically contiguous, the processor
++ * will access incorrect information without generating a page-fault
++ * exception.
++ *
++ * There are also a lot of errata involving the TSS spanning a page
++ * boundary. Assert that we're not doing that.
++ */
++ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
++ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
++
+ }
+
+ /* Load the original GDT from the per-cpu structure */
--- /dev/null
+From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:20 +0100
+Subject: x86/entry: Remap the TSS into the CPU entry area
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 72f5e08dbba2d01aa90b592cf76c378ea233b00b upstream.
+
+This has a secondary purpose: it puts the entry stack into a region
+with a well-controlled layout. A subsequent patch will take
+advantage of this to streamline the SYSCALL entry code to be able to
+find it more easily.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_32.S | 6 ++++--
+ arch/x86/include/asm/fixmap.h | 7 +++++++
+ arch/x86/kernel/asm-offsets.c | 3 +++
+ arch/x86/kernel/cpu/common.c | 41 +++++++++++++++++++++++++++++++++++------
+ arch/x86/kernel/dumpstack.c | 3 ++-
+ arch/x86/kvm/vmx.c | 2 +-
+ arch/x86/power/cpu.c | 11 ++++++-----
+ 7 files changed, 58 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/entry/entry_32.S
++++ b/arch/x86/entry/entry_32.S
+@@ -941,7 +941,8 @@ ENTRY(debug)
+ movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
++ movl PER_CPU_VAR(cpu_entry_area), %ecx
++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Ldebug_from_sysenter_stack
+@@ -984,7 +985,8 @@ ENTRY(nmi)
+ movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
++ movl PER_CPU_VAR(cpu_entry_area), %ecx
++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP;
+ */
+ struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
++
++ /*
++ * The GDT is just below cpu_tss and thus serves (on x86_64) as
++ * a read-only guard page for the SYSENTER stack at the bottom
++ * of the TSS region.
++ */
++ struct tss_struct tss;
+ };
+
+ #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -98,4 +98,7 @@ void common(void) {
+ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+ /* Size of SYSENTER_stack */
+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
++
++ /* Layout info for cpu_entry_area */
++ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ }
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -466,6 +466,22 @@ void load_percpu_segment(int cpu)
+ load_stack_canary_segment();
+ }
+
++static void set_percpu_fixmap_pages(int fixmap_index, void *ptr,
++ int pages, pgprot_t prot)
++{
++ int i;
++
++ for (i = 0; i < pages; i++) {
++ __set_fixmap(fixmap_index - i,
++ per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot);
++ }
++}
++
++#ifdef CONFIG_X86_32
++/* The 32-bit entry code needs to find cpu_entry_area. */
++DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
++#endif
++
+ /* Setup the fixmap mappings only once per-processor */
+ static inline void setup_cpu_entry_area(int cpu)
+ {
+@@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
++ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
++ &per_cpu(cpu_tss, cpu),
++ sizeof(struct tss_struct) / PAGE_SIZE,
++ PAGE_KERNEL);
+
++#ifdef CONFIG_X86_32
++ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
++#endif
+ }
+
+ /* Load the original GDT from the per-cpu structure */
+@@ -1257,7 +1281,8 @@ void enable_sep_cpu(void)
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_ESP,
+- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
++ (unsigned long)&get_cpu_entry_area(cpu)->tss +
++ offsetofend(struct tss_struct, SYSENTER_stack),
+ 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+@@ -1370,6 +1395,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
+ /* May not be marked __init: used by software suspend */
+ void syscall_init(void)
+ {
++ int cpu = smp_processor_id();
++
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+@@ -1383,7 +1410,7 @@ void syscall_init(void)
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+- (unsigned long)this_cpu_ptr(&cpu_tss) +
++ (unsigned long)&get_cpu_entry_area(cpu)->tss +
+ offsetofend(struct tss_struct, SYSENTER_stack));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ #else
+@@ -1593,11 +1620,13 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
++ setup_cpu_entry_area(cpu);
++
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
+- set_tss_desc(cpu, &t->x86_tss);
++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+
+ load_mm_ldt(&init_mm);
+@@ -1610,7 +1639,6 @@ void cpu_init(void)
+ if (is_uv_system())
+ uv_cpu_init();
+
+- setup_cpu_entry_area(cpu);
+ load_fixmap_gdt(cpu);
+ }
+
+@@ -1651,11 +1679,13 @@ void cpu_init(void)
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
++ setup_cpu_entry_area(cpu);
++
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
+- set_tss_desc(cpu, &t->x86_tss);
++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+
+ load_mm_ldt(&init_mm);
+@@ -1672,7 +1702,6 @@ void cpu_init(void)
+
+ fpu__init_cpu();
+
+- setup_cpu_entry_area(cpu);
+ load_fixmap_gdt(cpu);
+ }
+ #endif
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack,
+
+ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+ {
+- struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
++ int cpu = smp_processor_id();
++ struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
+
+ /* Treat the canary as part of the stack for unwinding purposes. */
+ void *begin = &tss->SYSENTER_stack_canary;
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -2295,7 +2295,7 @@ static void vmx_vcpu_load(struct kvm_vcp
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+- (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss));
++ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ /*
+--- a/arch/x86/power/cpu.c
++++ b/arch/x86/power/cpu.c
+@@ -160,18 +160,19 @@ static void do_fpu_end(void)
+ static void fix_processor_context(void)
+ {
+ int cpu = smp_processor_id();
+- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+ #ifdef CONFIG_X86_64
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
+ tss_desc tss;
+ #endif
+
+ /*
+- * This just modifies memory; should not be necessary. But... This is
+- * necessary, because 386 hardware has concept of busy TSS or some
+- * similar stupidity.
++ * We need to reload TR, which requires that we change the
++ * GDT entry to indicate "available" first.
++ *
++ * XXX: This could probably all be replaced by a call to
++ * force_reload_TR().
+ */
+- set_tss_desc(cpu, &t->x86_tss);
++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+ #ifdef CONFIG_X86_64
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
--- /dev/null
+From 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:22 +0100
+Subject: x86/espfix/64: Stop assuming that pt_regs is on the entry stack
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb upstream.
+
+When we start using an entry trampoline, a #GP from userspace will
+be delivered on the entry stack, not on the task stack. Fix the
+espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than
+assuming that pt_regs + 1 == SP0. This won't change anything
+without an entry stack, but it will make the code continue to work
+when an entry stack is added.
+
+While we're at it, improve the comments to explain what's actually
+going on.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++---------
+ 1 file changed, 28 insertions(+), 9 deletions(-)
+
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struc
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+- * end up promoting it to a doublefault. In that case, modify
+- * the stack to make it look like we just entered the #GP
+- * handler from user space, similar to bad_iret.
++ * end up promoting it to a doublefault. In that case, take
++ * advantage of the fact that we're not using the normal (TSS.sp0)
++ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
++ * and then modify our own IRET frame so that, when we return,
++ * we land directly at the #GP(0) vector with the stack already
++ * set up according to its expectations.
++ *
++ * The net result is that our #GP handler will think that we
++ * entered from usermode with the bad user context.
+ *
+ * No need for ist_enter here because we don't use RCU.
+ */
+@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struc
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+- struct pt_regs *normal_regs = task_pt_regs(current);
++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+
+- /* Fake a #GP(0) from userspace. */
+- memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
+- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
++ /*
++ * regs->sp points to the failing IRET frame on the
++ * ESPFIX64 stack. Copy it to the entry stack. This fills
++ * in gpregs->ss through gpregs->ip.
++ *
++ */
++ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
++ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
++
++ /*
++ * Adjust our frame so that we return straight to the #GP
++ * vector with the expected RSP value. This is safe because
++ * we won't enable interrupts or schedule before we invoke
++ * general_protection, so nothing will clobber the stack
++ * frame we just set up.
++ */
+ regs->ip = (unsigned long)general_protection;
+- regs->sp = (unsigned long)&normal_regs->orig_ax;
++ regs->sp = (unsigned long)&gpregs->orig_ax;
+
+ return;
+ }
+@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struc
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+- * deliv- ered, the faulting linear address of the second fault will
++ * delivered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
--- /dev/null
+From 4f3789e792296e21405f708cf3cb409d7c7d5683 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:11 +0100
+Subject: x86/irq/64: Print the offending IP in the stack overflow warning
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 4f3789e792296e21405f708cf3cb409d7c7d5683 upstream.
+
+In case something goes wrong with unwind (not unlikely in case of
+overflow), print the offending IP where we detected the overflow.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/irq_64.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/irq_64.c
++++ b/arch/x86/kernel/irq_64.c
+@@ -57,10 +57,10 @@ static inline void stack_overflow_check(
+ if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+ return;
+
+- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
++ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
+ current->comm, curbase, regs->sp,
+ irq_stack_top, irq_stack_bottom,
+- estack_top, estack_bottom);
++ estack_top, estack_bottom, (void *)regs->ip);
+
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
--- /dev/null
+From 6669a692605547892a026445e460bf233958bd7f Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:10 +0100
+Subject: x86/irq: Remove an old outdated comment about context tracking races
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 6669a692605547892a026445e460bf233958bd7f upstream.
+
+That race has been fixed and code cleaned up for a while now.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/irq.c | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+--- a/arch/x86/kernel/irq.c
++++ b/arch/x86/kernel/irq.c
+@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IR
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+
+- /*
+- * NB: Unlike exception entries, IRQ entries do not reliably
+- * handle context tracking in the low-level entry code. This is
+- * because syscall entries execute briefly with IRQs on before
+- * updating context tracking state, so we can take an IRQ from
+- * kernel mode with CONTEXT_USER. The low-level entry code only
+- * updates the context if we came from user mode, so we won't
+- * switch to CONTEXT_KERNEL. We'll fix that once the syscall
+- * code is cleaned up enough that we can cleanly defer enabling
+- * IRQs.
+- */
+-
+ entering_irq();
+
+ /* entering_irq() tells RCU that we're not quiescent. Check it. */
--- /dev/null
+From 21506525fb8ddb0342f2a2370812d47f6a1f3833 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:16 +0100
+Subject: x86/kasan/64: Teach KASAN about the cpu_entry_area
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 21506525fb8ddb0342f2a2370812d47f6a1f3833 upstream.
+
+The cpu_entry_area will contain stacks. Make sure that KASAN has
+appropriate shadow mappings for them.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: kasan-dev@googlegroups.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -277,6 +277,7 @@ void __init kasan_early_init(void)
+ void __init kasan_init(void)
+ {
+ int i;
++ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
+
+ #ifdef CONFIG_KASAN_INLINE
+ register_die_notifier(&kasan_die_notifier);
+@@ -329,8 +330,23 @@ void __init kasan_init(void)
+ (unsigned long)kasan_mem_to_shadow(_end),
+ early_pfn_to_nid(__pa(_stext)));
+
++ shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
++ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
++ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
++ PAGE_SIZE);
++
++ shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
++ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
++ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
++ PAGE_SIZE);
++
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+- (void *)KASAN_SHADOW_END);
++ shadow_cpu_entry_begin);
++
++ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
++ (unsigned long)shadow_cpu_entry_end, 0);
++
++ kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
+
+ load_cr3(init_top_pgt);
+ __flush_tlb_all();
--- /dev/null
+From f2dbad36c55e5d3a91dccbde6e8cae345fe5632f Mon Sep 17 00:00:00 2001
+From: Rudolf Marek <r.marek@assembler.cz>
+Date: Tue, 28 Nov 2017 22:01:06 +0100
+Subject: x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD
+
+From: Rudolf Marek <r.marek@assembler.cz>
+
+commit f2dbad36c55e5d3a91dccbde6e8cae345fe5632f upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+The latest AMD AMD64 Architecture Programmer's Manual
+adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]).
+
+If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES
+/ FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers,
+thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs.
+
+Signed-Off-By: Rudolf Marek <r.marek@assembler.cz>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Tested-by: Borislav Petkov <bp@suse.de>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/cpufeatures.h | 1 +
+ arch/x86/kernel/cpu/amd.c | 7 +++++--
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -266,6 +266,7 @@
+ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
+ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
+ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
++#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
+
+ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86
+ case 0x17: init_amd_zn(c); break;
+ }
+
+- /* Enable workaround for FXSAVE leak */
+- if (c->x86 >= 6)
++ /*
++ * Enable workaround for FXSAVE leak on CPUs
++ * without a XSaveErPtr feature
++ */
++ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
--- /dev/null
+From c7da092a1f243bfd1bfb4124f538e69e941882da Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Fri, 3 Nov 2017 11:20:28 +0100
+Subject: x86/mm: Define _PAGE_TABLE using _KERNPG_TABLE
+
+From: Borislav Petkov <bp@suse.de>
+
+commit c7da092a1f243bfd1bfb4124f538e69e941882da upstream.
+
+... so that the difference is obvious.
+
+No functionality change.
+
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20171103102028.20284-1-bp@alien8.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pgtable_types.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -200,10 +200,9 @@ enum page_cache_mode {
+
+ #define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
+
+-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+ #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | _PAGE_ENC)
++#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
+
+ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
+ #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
--- /dev/null
+From ef8813ab280507972bb57e4b1b502811ad4411e9 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:15 +0100
+Subject: x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit ef8813ab280507972bb57e4b1b502811ad4411e9 upstream.
+
+Currently, the GDT is an ad-hoc array of pages, one per CPU, in the
+fixmap. Generalize it to be an array of a new 'struct cpu_entry_area'
+so that we can cleanly add new things to it.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/desc.h | 9 +--------
+ arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++++--
+ arch/x86/kernel/cpu/common.c | 14 +++++++-------
+ arch/x86/xen/mmu_pv.c | 2 +-
+ 4 files changed, 44 insertions(+), 18 deletions(-)
+
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -60,17 +60,10 @@ static inline struct desc_struct *get_cu
+ return this_cpu_ptr(&gdt_page)->gdt;
+ }
+
+-/* Get the fixmap index for a specific processor */
+-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+-{
+- return FIX_GDT_REMAP_END - cpu;
+-}
+-
+ /* Provide the fixmap address of the remapped GDT */
+ static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+ {
+- unsigned int idx = get_cpu_gdt_ro_index(cpu);
+- return (struct desc_struct *)__fix_to_virt(idx);
++ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
+ }
+
+ /* Provide the current read-only GDT */
+--- a/arch/x86/include/asm/fixmap.h
++++ b/arch/x86/include/asm/fixmap.h
+@@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP;
+ PAGE_SIZE)
+ #endif
+
++/*
++ * cpu_entry_area is a percpu region in the fixmap that contains things
++ * needed by the CPU and early entry/exit code. Real types aren't used
++ * for all fields here to avoid circular header dependencies.
++ *
++ * Every field is a virtual alias of some other allocated backing store.
++ * There is no direct allocation of a struct cpu_entry_area.
++ */
++struct cpu_entry_area {
++ char gdt[PAGE_SIZE];
++};
++
++#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+ /*
+ * Here we define all the compile-time 'special' virtual
+@@ -101,8 +114,8 @@ enum fixed_addresses {
+ FIX_LNW_VRTC,
+ #endif
+ /* Fixmap entries to remap the GDTs, one per processor. */
+- FIX_GDT_REMAP_BEGIN,
+- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
++ FIX_CPU_ENTRY_AREA_TOP,
++ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
+
+ #ifdef CONFIG_ACPI_APEI_GHES
+ /* Used for GHES mapping from assorted contexts */
+@@ -191,5 +204,25 @@ void __init *early_memremap_decrypted_wp
+ void __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags);
+
++static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
++{
++ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
++
++ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
++}
++
++#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
++ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
++ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
++ })
++
++#define get_cpu_entry_area_index(cpu, field) \
++ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
++
++static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
++{
++ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
++}
++
+ #endif /* !__ASSEMBLY__ */
+ #endif /* _ASM_X86_FIXMAP_H */
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -466,12 +466,12 @@ void load_percpu_segment(int cpu)
+ load_stack_canary_segment();
+ }
+
+-/* Setup the fixmap mapping only once per-processor */
+-static inline void setup_fixmap_gdt(int cpu)
++/* Setup the fixmap mappings only once per-processor */
++static inline void setup_cpu_entry_area(int cpu)
+ {
+ #ifdef CONFIG_X86_64
+ /* On 64-bit systems, we use a read-only fixmap GDT. */
+- pgprot_t prot = PAGE_KERNEL_RO;
++ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ #else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+@@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int
+ * On Xen PV, the GDT must be read-only because the hypervisor requires
+ * it.
+ */
+- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
++ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ #endif
+
+- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
++ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+ }
+
+ /* Load the original GDT from the per-cpu structure */
+@@ -1589,7 +1589,7 @@ void cpu_init(void)
+ if (is_uv_system())
+ uv_cpu_init();
+
+- setup_fixmap_gdt(cpu);
++ setup_cpu_entry_area(cpu);
+ load_fixmap_gdt(cpu);
+ }
+
+@@ -1651,7 +1651,7 @@ void cpu_init(void)
+
+ fpu__init_cpu();
+
+- setup_fixmap_gdt(cpu);
++ setup_cpu_entry_area(cpu);
+ load_fixmap_gdt(cpu);
+ }
+ #endif
+--- a/arch/x86/xen/mmu_pv.c
++++ b/arch/x86/xen/mmu_pv.c
+@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx,
+ #endif
+ case FIX_TEXT_POKE0:
+ case FIX_TEXT_POKE1:
+- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
++ case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
+ /* All local page mappings */
+ pte = pfn_pte(phys, prot);
+ break;
--- /dev/null
+From 2aeb07365bcd489620f71390a7d2031cd4dfb83e Mon Sep 17 00:00:00 2001
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Date: Wed, 15 Nov 2017 17:36:35 -0800
+Subject: x86/mm/kasan: Don't use vmemmap_populate() to initialize shadow
+
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+
+commit 2aeb07365bcd489620f71390a7d2031cd4dfb83e upstream.
+
+[ Note, this is a Git cherry-pick of the following commit:
+
+ d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow")
+
+ ... for easier x86 PTI code testing and back-porting. ]
+
+The KASAN shadow is currently mapped using vmemmap_populate() since that
+provides a semi-convenient way to map pages into init_top_pgt. However,
+since that no longer zeroes the mapped pages, it is not suitable for
+KASAN, which requires zeroed shadow memory.
+
+Add kasan_populate_shadow() interface and use it instead of
+vmemmap_populate(). Besides, this allows us to take advantage of
+gigantic pages and use them to populate the shadow, which should save us
+some memory wasted on page tables and reduce TLB pressure.
+
+Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com
+Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Steven Sistare <steven.sistare@oracle.com>
+Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Bob Picco <bob.picco@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: David S. Miller <davem@davemloft.net>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Sam Ravnborg <sam@ravnborg.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/Kconfig | 2
+ arch/x86/mm/kasan_init_64.c | 143 +++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 137 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -108,7 +108,7 @@ config X86
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_JUMP_LABEL
+- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
++ select HAVE_ARCH_KASAN if X86_64
+ select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -4,12 +4,14 @@
+ #include <linux/bootmem.h>
+ #include <linux/kasan.h>
+ #include <linux/kdebug.h>
++#include <linux/memblock.h>
+ #include <linux/mm.h>
+ #include <linux/sched.h>
+ #include <linux/sched/task.h>
+ #include <linux/vmalloc.h>
+
+ #include <asm/e820/types.h>
++#include <asm/pgalloc.h>
+ #include <asm/tlbflush.h>
+ #include <asm/sections.h>
+ #include <asm/pgtable.h>
+@@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_
+
+ static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
+-static int __init map_range(struct range *range)
++static __init void *early_alloc(size_t size, int nid)
++{
++ return memblock_virt_alloc_try_nid_nopanic(size, size,
++ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
++}
++
++static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
++ unsigned long end, int nid)
++{
++ pte_t *pte;
++
++ if (pmd_none(*pmd)) {
++ void *p;
++
++ if (boot_cpu_has(X86_FEATURE_PSE) &&
++ ((end - addr) == PMD_SIZE) &&
++ IS_ALIGNED(addr, PMD_SIZE)) {
++ p = early_alloc(PMD_SIZE, nid);
++ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
++ return;
++ else if (p)
++ memblock_free(__pa(p), PMD_SIZE);
++ }
++
++ p = early_alloc(PAGE_SIZE, nid);
++ pmd_populate_kernel(&init_mm, pmd, p);
++ }
++
++ pte = pte_offset_kernel(pmd, addr);
++ do {
++ pte_t entry;
++ void *p;
++
++ if (!pte_none(*pte))
++ continue;
++
++ p = early_alloc(PAGE_SIZE, nid);
++ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
++ set_pte_at(&init_mm, addr, pte, entry);
++ } while (pte++, addr += PAGE_SIZE, addr != end);
++}
++
++static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
++ unsigned long end, int nid)
++{
++ pmd_t *pmd;
++ unsigned long next;
++
++ if (pud_none(*pud)) {
++ void *p;
++
++ if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
++ ((end - addr) == PUD_SIZE) &&
++ IS_ALIGNED(addr, PUD_SIZE)) {
++ p = early_alloc(PUD_SIZE, nid);
++ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
++ return;
++ else if (p)
++ memblock_free(__pa(p), PUD_SIZE);
++ }
++
++ p = early_alloc(PAGE_SIZE, nid);
++ pud_populate(&init_mm, pud, p);
++ }
++
++ pmd = pmd_offset(pud, addr);
++ do {
++ next = pmd_addr_end(addr, end);
++ if (!pmd_large(*pmd))
++ kasan_populate_pmd(pmd, addr, next, nid);
++ } while (pmd++, addr = next, addr != end);
++}
++
++static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
++ unsigned long end, int nid)
++{
++ pud_t *pud;
++ unsigned long next;
++
++ if (p4d_none(*p4d)) {
++ void *p = early_alloc(PAGE_SIZE, nid);
++
++ p4d_populate(&init_mm, p4d, p);
++ }
++
++ pud = pud_offset(p4d, addr);
++ do {
++ next = pud_addr_end(addr, end);
++ if (!pud_large(*pud))
++ kasan_populate_pud(pud, addr, next, nid);
++ } while (pud++, addr = next, addr != end);
++}
++
++static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
++ unsigned long end, int nid)
++{
++ void *p;
++ p4d_t *p4d;
++ unsigned long next;
++
++ if (pgd_none(*pgd)) {
++ p = early_alloc(PAGE_SIZE, nid);
++ pgd_populate(&init_mm, pgd, p);
++ }
++
++ p4d = p4d_offset(pgd, addr);
++ do {
++ next = p4d_addr_end(addr, end);
++ kasan_populate_p4d(p4d, addr, next, nid);
++ } while (p4d++, addr = next, addr != end);
++}
++
++static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
++ int nid)
++{
++ pgd_t *pgd;
++ unsigned long next;
++
++ addr = addr & PAGE_MASK;
++ end = round_up(end, PAGE_SIZE);
++ pgd = pgd_offset_k(addr);
++ do {
++ next = pgd_addr_end(addr, end);
++ kasan_populate_pgd(pgd, addr, next, nid);
++ } while (pgd++, addr = next, addr != end);
++}
++
++static void __init map_range(struct range *range)
+ {
+ unsigned long start;
+ unsigned long end;
+@@ -26,7 +155,7 @@ static int __init map_range(struct range
+ start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+ end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+- return vmemmap_populate(start, end, NUMA_NO_NODE);
++ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
+ }
+
+ static void __init clear_pgds(unsigned long start,
+@@ -189,16 +318,16 @@ void __init kasan_init(void)
+ if (pfn_mapped[i].end == 0)
+ break;
+
+- if (map_range(&pfn_mapped[i]))
+- panic("kasan: unable to allocate shadow!");
++ map_range(&pfn_mapped[i]);
+ }
++
+ kasan_populate_zero_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+- (unsigned long)kasan_mem_to_shadow(_end),
+- NUMA_NO_NODE);
++ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
++ (unsigned long)kasan_mem_to_shadow(_end),
++ early_pfn_to_nid(__pa(_stext)));
+
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
--- /dev/null
+From a035795499ca1c2bd1928808d1a156eda1420383 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:30 +0100
+Subject: x86/paravirt: Dont patch flush_tlb_single
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit a035795499ca1c2bd1928808d1a156eda1420383 upstream.
+
+native_flush_tlb_single() will be changed with the upcoming
+PAGE_TABLE_ISOLATION feature. This requires more code in there than
+just INVLPG.
+
+Remove the paravirt patching for it.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Cc: michael.schwarz@iaik.tugraz.at
+Cc: moritz.lipp@iaik.tugraz.at
+Cc: richard.fellner@student.tugraz.at
+Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/paravirt_patch_64.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/arch/x86/kernel/paravirt_patch_64.c
++++ b/arch/x86/kernel/paravirt_patch_64.c
+@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq;
+ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
+ DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+
+ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobb
+ PATCH_SITE(pv_mmu_ops, read_cr2);
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
+ PATCH_SITE(pv_cpu_ops, wbinvd);
+ #if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
--- /dev/null
+From 79cc74155218316b9a5d28577c7077b2adba8e58 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 4 Dec 2017 15:07:31 +0100
+Subject: x86/paravirt: Provide a way to check for hypervisors
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 79cc74155218316b9a5d28577c7077b2adba8e58 upstream.
+
+There is no generic way to test whether a kernel is running on a specific
+hypervisor. But that's required to prevent the upcoming user address space
+separation feature in certain guest modes.
+
+Make the hypervisor type enum unconditionally available and provide a
+helper function which allows testing for a specific type.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/include/asm/hypervisor.h
++++ b/arch/x86/include/asm/hypervisor.h
+@@ -20,16 +20,7 @@
+ #ifndef _ASM_X86_HYPERVISOR_H
+ #define _ASM_X86_HYPERVISOR_H
+
+-#ifdef CONFIG_HYPERVISOR_GUEST
+-
+-#include <asm/kvm_para.h>
+-#include <asm/x86_init.h>
+-#include <asm/xen/hypervisor.h>
+-
+-/*
+- * x86 hypervisor information
+- */
+-
++/* x86 hypervisor types */
+ enum x86_hypervisor_type {
+ X86_HYPER_NATIVE = 0,
+ X86_HYPER_VMWARE,
+@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
+ X86_HYPER_KVM,
+ };
+
++#ifdef CONFIG_HYPERVISOR_GUEST
++
++#include <asm/kvm_para.h>
++#include <asm/x86_init.h>
++#include <asm/xen/hypervisor.h>
++
+ struct hypervisor_x86 {
+ /* Hypervisor name */
+ const char *name;
+@@ -58,7 +55,15 @@ struct hypervisor_x86 {
+
+ extern enum x86_hypervisor_type x86_hyper_type;
+ extern void init_hypervisor_platform(void);
++static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
++{
++ return x86_hyper_type == type;
++}
+ #else
+ static inline void init_hypervisor_platform(void) { }
++static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
++{
++ return type == X86_HYPER_NATIVE;
++}
+ #endif /* CONFIG_HYPERVISOR_GUEST */
+ #endif /* _ASM_X86_HYPERVISOR_H */
--- /dev/null
+From 3383642c2f9d4f5b4fa37436db4a109a1a10018c Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:17 -0700
+Subject: x86/traps: Use a new on_thread_stack() helper to clean up an assertion
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 3383642c2f9d4f5b4fa37436db4a109a1a10018c upstream.
+
+Let's keep the stack-related logic together rather than open-coding
+a comparison in an assertion in the traps code.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/processor.h | 6 ++++++
+ arch/x86/kernel/traps.c | 3 +--
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -542,6 +542,12 @@ static inline unsigned long current_top_
+ #endif
+ }
+
++static inline bool on_thread_stack(void)
++{
++ return (unsigned long)(current_top_of_stack() -
++ current_stack_pointer) < THREAD_SIZE;
++}
++
+ #ifdef CONFIG_PARAVIRT
+ #include <asm/paravirt.h>
+ #else
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs
+ * will catch asm bugs and any attempt to use ist_preempt_enable
+ * from double_fault.
+ */
+- BUG_ON((unsigned long)(current_top_of_stack() -
+- current_stack_pointer) >= THREAD_SIZE);
++ BUG_ON(!on_thread_stack());
+
+ preempt_enable_no_resched();
+ }
--- /dev/null
+From b02fcf9ba1211097754b286043cd87a8b4907e75 Mon Sep 17 00:00:00 2001
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+Date: Mon, 4 Dec 2017 15:07:09 +0100
+Subject: x86/unwinder: Handle stack overflows more gracefully
+
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+
+commit b02fcf9ba1211097754b286043cd87a8b4907e75 upstream.
+
+There are at least two unwinder bugs hindering the debugging of
+stack-overflow crashes:
+
+- It doesn't deal gracefully with the case where the stack overflows and
+ the stack pointer itself isn't on a valid stack but the
+ to-be-dereferenced data *is*.
+
+- The ORC oops dump code doesn't know how to print partial pt_regs, for the
+ case where we get an interrupt/exception in *early* entry code
+ before the full pt_regs have been saved.
+
+Fix both issues.
+
+http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bpetkov@suse.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kdebug.h | 1
+ arch/x86/include/asm/unwind.h | 7 +++
+ arch/x86/kernel/dumpstack.c | 32 ++++++++++++++---
+ arch/x86/kernel/process_64.c | 11 ++----
+ arch/x86/kernel/unwind_orc.c | 76 ++++++++++++++----------------------------
+ 5 files changed, 66 insertions(+), 61 deletions(-)
+
+--- a/arch/x86/include/asm/kdebug.h
++++ b/arch/x86/include/asm/kdebug.h
+@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_
+ extern int __must_check __die(const char *, struct pt_regs *, long);
+ extern void show_stack_regs(struct pt_regs *regs);
+ extern void __show_regs(struct pt_regs *regs, int all);
++extern void show_iret_regs(struct pt_regs *regs);
+ extern unsigned long oops_begin(void);
+ extern void oops_end(unsigned long, struct pt_regs *, int signr);
+
+--- a/arch/x86/include/asm/unwind.h
++++ b/arch/x86/include/asm/unwind.h
+@@ -7,6 +7,9 @@
+ #include <asm/ptrace.h>
+ #include <asm/stacktrace.h>
+
++#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
++#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
++
+ struct unwind_state {
+ struct stack_info stack_info;
+ unsigned long stack_mask;
+@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *s
+ }
+
+ #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
++/*
++ * WARNING: The entire pt_regs may not be safe to dereference. In some cases,
++ * only the iret frame registers are accessible. Use with caution!
++ */
+ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+ {
+ if (unwind_done(state))
+--- a/arch/x86/kernel/dumpstack.c
++++ b/arch/x86/kernel/dumpstack.c
+@@ -50,6 +50,28 @@ static void printk_stack_address(unsigne
+ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
+ }
+
++void show_iret_regs(struct pt_regs *regs)
++{
++ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
++ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
++ regs->sp, regs->flags);
++}
++
++static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
++{
++ if (on_stack(info, regs, sizeof(*regs)))
++ __show_regs(regs, 0);
++ else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
++ IRET_FRAME_SIZE)) {
++ /*
++ * When an interrupt or exception occurs in entry code, the
++ * full pt_regs might not have been saved yet. In that case
++ * just print the iret frame.
++ */
++ show_iret_regs(regs);
++ }
++}
++
+ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl)
+ {
+@@ -94,8 +116,8 @@ void show_trace_log_lvl(struct task_stru
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
+
+- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+- __show_regs(regs, 0);
++ if (regs)
++ show_regs_safe(&stack_info, regs);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+@@ -119,7 +141,7 @@ void show_trace_log_lvl(struct task_stru
+
+ /*
+ * Don't print regs->ip again if it was already printed
+- * by __show_regs() below.
++ * by show_regs_safe() below.
+ */
+ if (regs && stack == &regs->ip)
+ goto next;
+@@ -155,8 +177,8 @@ next:
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state);
+- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+- __show_regs(regs, 0);
++ if (regs)
++ show_regs_safe(&stack_info, regs);
+ }
+
+ if (stack_name)
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, i
+ unsigned int fsindex, gsindex;
+ unsigned int ds, cs, es;
+
+- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
+- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
+- regs->sp, regs->flags);
++ show_iret_regs(regs);
++
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, i
+ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
++ if (!all)
++ return;
++
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
+@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, i
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+- if (!all)
+- return;
+-
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+--- a/arch/x86/kernel/unwind_orc.c
++++ b/arch/x86/kernel/unwind_orc.c
+@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address
+ return NULL;
+ }
+
+-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
++static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
+ size_t len)
+ {
+ struct stack_info *info = &state->stack_info;
++ void *addr = (void *)_addr;
+
+- /*
+- * If the address isn't on the current stack, switch to the next one.
+- *
+- * We may have to traverse multiple stacks to deal with the possibility
+- * that info->next_sp could point to an empty stack and the address
+- * could be on a subsequent stack.
+- */
+- while (!on_stack(info, (void *)addr, len))
+- if (get_stack_info(info->next_sp, state->task, info,
+- &state->stack_mask))
+- return false;
++ if (!on_stack(info, addr, len) &&
++ (get_stack_info(addr, state->task, info, &state->stack_mask)))
++ return false;
+
+ return true;
+ }
+@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwin
+ return true;
+ }
+
+-#define REGS_SIZE (sizeof(struct pt_regs))
+-#define SP_OFFSET (offsetof(struct pt_regs, sp))
+-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+-
+ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+- unsigned long *ip, unsigned long *sp, bool full)
++ unsigned long *ip, unsigned long *sp)
+ {
+- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+- size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+-
+- if (IS_ENABLED(CONFIG_X86_64)) {
+- if (!stack_access_ok(state, addr, regs_size))
+- return false;
+-
+- *ip = regs->ip;
+- *sp = regs->sp;
++ struct pt_regs *regs = (struct pt_regs *)addr;
+
+- return true;
+- }
++ /* x86-32 support will be more complicated due to the &regs->sp hack */
++ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
+
+- if (!stack_access_ok(state, addr, sp_offset))
++ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
+ return false;
+
+ *ip = regs->ip;
++ *sp = regs->sp;
++ return true;
++}
+
+- if (user_mode(regs)) {
+- if (!stack_access_ok(state, addr + sp_offset,
+- REGS_SIZE - SP_OFFSET))
+- return false;
+-
+- *sp = regs->sp;
+- } else
+- *sp = (unsigned long)&regs->sp;
++static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
++ unsigned long *ip, unsigned long *sp)
++{
++ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
+
++ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
++ return false;
++
++ *ip = regs->ip;
++ *sp = regs->sp;
+ return true;
+ }
+
+@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_sta
+ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+- struct pt_regs *ptregs;
+ bool indirect = false;
+
+ if (unwind_done(state))
+@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_sta
+ break;
+
+ case ORC_TYPE_REGS:
+- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
++ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn("can't dereference registers at %p for ip %pB\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_sta
+ break;
+
+ case ORC_TYPE_REGS_IRET:
+- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
++ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn("can't dereference iret registers at %p for ip %pB\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+- ptregs = container_of((void *)sp, struct pt_regs, ip);
+- if ((unsigned long)ptregs >= prev_sp &&
+- on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+- state->regs = ptregs;
+- state->full_regs = false;
+- } else
+- state->regs = NULL;
+-
++ state->regs = (void *)sp - IRET_FRAME_OFFSET;
++ state->full_regs = false;
+ state->signal = true;
+ break;
+
--- /dev/null
+From d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Mon, 4 Dec 2017 15:07:08 +0100
+Subject: x86/unwinder/orc: Dont bail on stack overflow
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 upstream.
+
+If the stack overflows into a guard page, the ORC unwinder should work
+well: by construction, there can't be any meaningful data in the guard page
+because no writes to the guard page will have succeeded.
+
+But there is a bug that prevents unwinding from working correctly: if the
+starting register state has RSP pointing into a stack guard page, the ORC
+unwinder bails out immediately.
+
+Instead of bailing out immediately, check whether the next page up is a
+valid stack page and, if so, analyze that. As a result, the ORC unwinder
+will start the unwind.
+
+Tested by intentionally overflowing the task stack. The result is an
+accurate call trace instead of a trace consisting purely of '?' entries.
+
+There are a few other bugs that are triggered if the unwinder encounters a
+stack overflow after the first step, but they are outside the scope of this
+fix.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/unwind_orc.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/unwind_orc.c
++++ b/arch/x86/kernel/unwind_orc.c
+@@ -553,8 +553,18 @@ void __unwind_start(struct unwind_state
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+- &state->stack_info, &state->stack_mask))
+- return;
++ &state->stack_info, &state->stack_mask)) {
++ /*
++ * We weren't on a valid stack. It's possible that
++ * we overflowed a valid stack into a guard page.
++ * See if the next page up is valid so that we can
++ * generate some kind of backtrace if this happens.
++ */
++ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
++ if (get_stack_info(next_page, state->task, &state->stack_info,
++ &state->stack_mask))
++ return;
++ }
+
+ /*
+ * The caller can provide the address of the first frame directly
--- /dev/null
+From 03b2a320b19f1424e9ac9c21696be9c60b6d0d93 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Thu, 9 Nov 2017 14:27:36 +0100
+Subject: x86/virt: Add enum for hypervisors to replace x86_hyper
+
+From: Juergen Gross <jgross@suse.com>
+
+commit 03b2a320b19f1424e9ac9c21696be9c60b6d0d93 upstream.
+
+The x86_hyper pointer is only used for checking whether a virtual
+device is supporting the hypervisor the system is running on.
+
+Use an enum for that purpose instead and drop the x86_hyper pointer.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Xavier Deguillard <xdeguillard@vmware.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: akataria@vmware.com
+Cc: arnd@arndb.de
+Cc: boris.ostrovsky@oracle.com
+Cc: devel@linuxdriverproject.org
+Cc: dmitry.torokhov@gmail.com
+Cc: gregkh@linuxfoundation.org
+Cc: haiyangz@microsoft.com
+Cc: kvm@vger.kernel.org
+Cc: kys@microsoft.com
+Cc: linux-graphics-maintainer@vmware.com
+Cc: linux-input@vger.kernel.org
+Cc: moltmann@vmware.com
+Cc: pbonzini@redhat.com
+Cc: pv-drivers@vmware.com
+Cc: rkrcmar@redhat.com
+Cc: sthemmin@microsoft.com
+Cc: virtualization@lists.linux-foundation.org
+Cc: xen-devel@lists.xenproject.org
+Link: http://lkml.kernel.org/r/20171109132739.23465-3-jgross@suse.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/hyperv/hv_init.c | 2 +-
+ arch/x86/include/asm/hypervisor.h | 23 ++++++++++++++---------
+ arch/x86/kernel/cpu/hypervisor.c | 12 +++++++++---
+ arch/x86/kernel/cpu/mshyperv.c | 4 ++--
+ arch/x86/kernel/cpu/vmware.c | 4 ++--
+ arch/x86/kernel/kvm.c | 4 ++--
+ arch/x86/xen/enlighten_hvm.c | 4 ++--
+ arch/x86/xen/enlighten_pv.c | 4 ++--
+ drivers/hv/vmbus_drv.c | 2 +-
+ drivers/input/mouse/vmmouse.c | 10 ++++------
+ drivers/misc/vmw_balloon.c | 2 +-
+ 11 files changed, 40 insertions(+), 31 deletions(-)
+
+--- a/arch/x86/hyperv/hv_init.c
++++ b/arch/x86/hyperv/hv_init.c
+@@ -113,7 +113,7 @@ void hyperv_init(void)
+ u64 guest_id;
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+- if (x86_hyper != &x86_hyper_ms_hyperv)
++ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return;
+
+ /* Allocate percpu VP index */
+--- a/arch/x86/include/asm/hypervisor.h
++++ b/arch/x86/include/asm/hypervisor.h
+@@ -29,6 +29,16 @@
+ /*
+ * x86 hypervisor information
+ */
++
++enum x86_hypervisor_type {
++ X86_HYPER_NATIVE = 0,
++ X86_HYPER_VMWARE,
++ X86_HYPER_MS_HYPERV,
++ X86_HYPER_XEN_PV,
++ X86_HYPER_XEN_HVM,
++ X86_HYPER_KVM,
++};
++
+ struct hypervisor_x86 {
+ /* Hypervisor name */
+ const char *name;
+@@ -36,6 +46,9 @@ struct hypervisor_x86 {
+ /* Detection routine */
+ uint32_t (*detect)(void);
+
++ /* Hypervisor type */
++ enum x86_hypervisor_type type;
++
+ /* init time callbacks */
+ struct x86_hyper_init init;
+
+@@ -43,15 +56,7 @@ struct hypervisor_x86 {
+ struct x86_hyper_runtime runtime;
+ };
+
+-extern const struct hypervisor_x86 *x86_hyper;
+-
+-/* Recognized hypervisors */
+-extern const struct hypervisor_x86 x86_hyper_vmware;
+-extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+-extern const struct hypervisor_x86 x86_hyper_xen_pv;
+-extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+-extern const struct hypervisor_x86 x86_hyper_kvm;
+-
++extern enum x86_hypervisor_type x86_hyper_type;
+ extern void init_hypervisor_platform(void);
+ #else
+ static inline void init_hypervisor_platform(void) { }
+--- a/arch/x86/kernel/cpu/hypervisor.c
++++ b/arch/x86/kernel/cpu/hypervisor.c
+@@ -26,6 +26,12 @@
+ #include <asm/processor.h>
+ #include <asm/hypervisor.h>
+
++extern const struct hypervisor_x86 x86_hyper_vmware;
++extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
++extern const struct hypervisor_x86 x86_hyper_xen_pv;
++extern const struct hypervisor_x86 x86_hyper_xen_hvm;
++extern const struct hypervisor_x86 x86_hyper_kvm;
++
+ static const __initconst struct hypervisor_x86 * const hypervisors[] =
+ {
+ #ifdef CONFIG_XEN_PV
+@@ -41,8 +47,8 @@ static const __initconst struct hypervis
+ #endif
+ };
+
+-const struct hypervisor_x86 *x86_hyper;
+-EXPORT_SYMBOL(x86_hyper);
++enum x86_hypervisor_type x86_hyper_type;
++EXPORT_SYMBOL(x86_hyper_type);
+
+ static inline const struct hypervisor_x86 * __init
+ detect_hypervisor_vendor(void)
+@@ -87,6 +93,6 @@ void __init init_hypervisor_platform(voi
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+- x86_hyper = h;
++ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
+ }
+--- a/arch/x86/kernel/cpu/mshyperv.c
++++ b/arch/x86/kernel/cpu/mshyperv.c
+@@ -254,9 +254,9 @@ static void __init ms_hyperv_init_platfo
+ #endif
+ }
+
+-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
++const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
++ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
+ };
+-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
+--- a/arch/x86/kernel/cpu/vmware.c
++++ b/arch/x86/kernel/cpu/vmware.c
+@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_
+ (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+ }
+
+-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
++const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
++ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+ };
+-EXPORT_SYMBOL(x86_hyper_vmware);
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -544,12 +544,12 @@ static uint32_t __init kvm_detect(void)
+ return kvm_cpuid_base();
+ }
+
+-const struct hypervisor_x86 x86_hyper_kvm __refconst = {
++const __initconst struct hypervisor_x86 x86_hyper_kvm = {
+ .name = "KVM",
+ .detect = kvm_detect,
++ .type = X86_HYPER_KVM,
+ .init.x2apic_available = kvm_para_available,
+ };
+-EXPORT_SYMBOL_GPL(x86_hyper_kvm);
+
+ static __init int activate_jump_labels(void)
+ {
+--- a/arch/x86/xen/enlighten_hvm.c
++++ b/arch/x86/xen/enlighten_hvm.c
+@@ -226,12 +226,12 @@ static uint32_t __init xen_platform_hvm(
+ return xen_cpuid_base();
+ }
+
+-const struct hypervisor_x86 x86_hyper_xen_hvm = {
++const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+ .name = "Xen HVM",
+ .detect = xen_platform_hvm,
++ .type = X86_HYPER_XEN_HVM,
+ .init.init_platform = xen_hvm_guest_init,
+ .init.x2apic_available = xen_x2apic_para_available,
+ .init.init_mem_mapping = xen_hvm_init_mem_mapping,
+ .runtime.pin_vcpu = xen_pin_vcpu,
+ };
+-EXPORT_SYMBOL(x86_hyper_xen_hvm);
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -1459,9 +1459,9 @@ static uint32_t __init xen_platform_pv(v
+ return 0;
+ }
+
+-const struct hypervisor_x86 x86_hyper_xen_pv = {
++const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
+ .name = "Xen PV",
+ .detect = xen_platform_pv,
++ .type = X86_HYPER_XEN_PV,
+ .runtime.pin_vcpu = xen_pin_vcpu,
+ };
+-EXPORT_SYMBOL(x86_hyper_xen_pv);
+--- a/drivers/hv/vmbus_drv.c
++++ b/drivers/hv/vmbus_drv.c
+@@ -1534,7 +1534,7 @@ static int __init hv_acpi_init(void)
+ {
+ int ret, t;
+
+- if (x86_hyper != &x86_hyper_ms_hyperv)
++ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return -ENODEV;
+
+ init_completion(&probe_event);
+--- a/drivers/input/mouse/vmmouse.c
++++ b/drivers/input/mouse/vmmouse.c
+@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse
+ /*
+ * Array of supported hypervisors.
+ */
+-static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = {
+- &x86_hyper_vmware,
+-#ifdef CONFIG_KVM_GUEST
+- &x86_hyper_kvm,
+-#endif
++static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = {
++ X86_HYPER_VMWARE,
++ X86_HYPER_KVM,
+ };
+
+ /**
+@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(voi
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++)
+- if (vmmouse_supported_hypervisors[i] == x86_hyper)
++ if (vmmouse_supported_hypervisors[i] == x86_hyper_type)
+ return true;
+
+ return false;
+--- a/drivers/misc/vmw_balloon.c
++++ b/drivers/misc/vmw_balloon.c
+@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void)
+ * Check if we are running on VMware's hypervisor and bail out
+ * if we are not.
+ */
+- if (x86_hyper != &x86_hyper_vmware)
++ if (x86_hyper_type != X86_HYPER_VMWARE)
+ return -ENODEV;
+
+ for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
--- /dev/null
+From f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Thu, 9 Nov 2017 14:27:35 +0100
+Subject: x86/virt, x86/platform: Merge 'struct x86_hyper' into 'struct x86_platform' and 'struct x86_init'
+
+From: Juergen Gross <jgross@suse.com>
+
+commit f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e upstream.
+
+Instead of x86_hyper being either NULL on bare metal or a pointer to a
+struct hypervisor_x86 in case of the kernel running as a guest, merge
+the struct into x86_platform and x86_init.
+
+This will remove the need for wrappers, which make it hard to find out
+what is being called. With dummy functions added for all callbacks,
+testing for a NULL function pointer can be removed, too.
+
+Suggested-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: akataria@vmware.com
+Cc: boris.ostrovsky@oracle.com
+Cc: devel@linuxdriverproject.org
+Cc: haiyangz@microsoft.com
+Cc: kvm@vger.kernel.org
+Cc: kys@microsoft.com
+Cc: pbonzini@redhat.com
+Cc: rkrcmar@redhat.com
+Cc: rusty@rustcorp.com.au
+Cc: sthemmin@microsoft.com
+Cc: virtualization@lists.linux-foundation.org
+Cc: xen-devel@lists.xenproject.org
+Link: http://lkml.kernel.org/r/20171109132739.23465-2-jgross@suse.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/hypervisor.h | 25 +++-------------
+ arch/x86/include/asm/x86_init.h | 24 ++++++++++++++++
+ arch/x86/kernel/apic/apic.c | 2 -
+ arch/x86/kernel/cpu/hypervisor.c | 56 ++++++++++++++++++--------------------
+ arch/x86/kernel/cpu/mshyperv.c | 2 -
+ arch/x86/kernel/cpu/vmware.c | 4 +-
+ arch/x86/kernel/kvm.c | 2 -
+ arch/x86/kernel/x86_init.c | 9 ++++++
+ arch/x86/mm/init.c | 2 -
+ arch/x86/xen/enlighten_hvm.c | 8 ++---
+ arch/x86/xen/enlighten_pv.c | 2 -
+ include/linux/hypervisor.h | 8 ++++-
+ 12 files changed, 82 insertions(+), 62 deletions(-)
+
+--- a/arch/x86/include/asm/hypervisor.h
++++ b/arch/x86/include/asm/hypervisor.h
+@@ -23,6 +23,7 @@
+ #ifdef CONFIG_HYPERVISOR_GUEST
+
+ #include <asm/kvm_para.h>
++#include <asm/x86_init.h>
+ #include <asm/xen/hypervisor.h>
+
+ /*
+@@ -35,17 +36,11 @@ struct hypervisor_x86 {
+ /* Detection routine */
+ uint32_t (*detect)(void);
+
+- /* Platform setup (run once per boot) */
+- void (*init_platform)(void);
++ /* init time callbacks */
++ struct x86_hyper_init init;
+
+- /* X2APIC detection (run once per boot) */
+- bool (*x2apic_available)(void);
+-
+- /* pin current vcpu to specified physical cpu (run rarely) */
+- void (*pin_vcpu)(int);
+-
+- /* called during init_mem_mapping() to setup early mappings. */
+- void (*init_mem_mapping)(void);
++ /* runtime callbacks */
++ struct x86_hyper_runtime runtime;
+ };
+
+ extern const struct hypervisor_x86 *x86_hyper;
+@@ -58,17 +53,7 @@ extern const struct hypervisor_x86 x86_h
+ extern const struct hypervisor_x86 x86_hyper_kvm;
+
+ extern void init_hypervisor_platform(void);
+-extern bool hypervisor_x2apic_available(void);
+-extern void hypervisor_pin_vcpu(int cpu);
+-
+-static inline void hypervisor_init_mem_mapping(void)
+-{
+- if (x86_hyper && x86_hyper->init_mem_mapping)
+- x86_hyper->init_mem_mapping();
+-}
+ #else
+ static inline void init_hypervisor_platform(void) { }
+-static inline bool hypervisor_x2apic_available(void) { return false; }
+-static inline void hypervisor_init_mem_mapping(void) { }
+ #endif /* CONFIG_HYPERVISOR_GUEST */
+ #endif /* _ASM_X86_HYPERVISOR_H */
+--- a/arch/x86/include/asm/x86_init.h
++++ b/arch/x86/include/asm/x86_init.h
+@@ -115,6 +115,18 @@ struct x86_init_pci {
+ };
+
+ /**
++ * struct x86_hyper_init - x86 hypervisor init functions
++ * @init_platform: platform setup
++ * @x2apic_available: X2APIC detection
++ * @init_mem_mapping: setup early mappings during init_mem_mapping()
++ */
++struct x86_hyper_init {
++ void (*init_platform)(void);
++ bool (*x2apic_available)(void);
++ void (*init_mem_mapping)(void);
++};
++
++/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+@@ -127,6 +139,7 @@ struct x86_init_ops {
+ struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
+ struct x86_init_pci pci;
++ struct x86_hyper_init hyper;
+ };
+
+ /**
+@@ -200,6 +213,15 @@ struct x86_legacy_features {
+ };
+
+ /**
++ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
++ *
++ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
++ */
++struct x86_hyper_runtime {
++ void (*pin_vcpu)(int cpu);
++};
++
++/**
+ * struct x86_platform_ops - platform specific runtime functions
+ * @calibrate_cpu: calibrate CPU
+ * @calibrate_tsc: calibrate TSC, if different from CPU
+@@ -218,6 +240,7 @@ struct x86_legacy_features {
+ * possible in x86_early_init_platform_quirks() by
+ * only using the current x86_hardware_subarch
+ * semantics.
++ * @hyper: x86 hypervisor specific runtime callbacks
+ */
+ struct x86_platform_ops {
+ unsigned long (*calibrate_cpu)(void);
+@@ -233,6 +256,7 @@ struct x86_platform_ops {
+ void (*apic_post_init)(void);
+ struct x86_legacy_features legacy;
+ void (*set_legacy_features)(void);
++ struct x86_hyper_runtime hyper;
+ };
+
+ struct pci_dev;
+--- a/arch/x86/kernel/apic/apic.c
++++ b/arch/x86/kernel/apic/apic.c
+@@ -1645,7 +1645,7 @@ static __init void try_to_enable_x2apic(
+ * under KVM
+ */
+ if (max_physical_apicid > 255 ||
+- !hypervisor_x2apic_available()) {
++ !x86_init.hyper.x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+--- a/arch/x86/kernel/cpu/hypervisor.c
++++ b/arch/x86/kernel/cpu/hypervisor.c
+@@ -44,51 +44,49 @@ static const __initconst struct hypervis
+ const struct hypervisor_x86 *x86_hyper;
+ EXPORT_SYMBOL(x86_hyper);
+
+-static inline void __init
++static inline const struct hypervisor_x86 * __init
+ detect_hypervisor_vendor(void)
+ {
+- const struct hypervisor_x86 *h, * const *p;
++ const struct hypervisor_x86 *h = NULL, * const *p;
+ uint32_t pri, max_pri = 0;
+
+ for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+- h = *p;
+- pri = h->detect();
+- if (pri != 0 && pri > max_pri) {
++ pri = (*p)->detect();
++ if (pri > max_pri) {
+ max_pri = pri;
+- x86_hyper = h;
++ h = *p;
+ }
+ }
+
+- if (max_pri)
+- pr_info("Hypervisor detected: %s\n", x86_hyper->name);
+-}
+-
+-void __init init_hypervisor_platform(void)
+-{
+-
+- detect_hypervisor_vendor();
++ if (h)
++ pr_info("Hypervisor detected: %s\n", h->name);
+
+- if (!x86_hyper)
+- return;
+-
+- if (x86_hyper->init_platform)
+- x86_hyper->init_platform();
++ return h;
+ }
+
+-bool __init hypervisor_x2apic_available(void)
++static void __init copy_array(const void *src, void *target, unsigned int size)
+ {
+- return x86_hyper &&
+- x86_hyper->x2apic_available &&
+- x86_hyper->x2apic_available();
++ unsigned int i, n = size / sizeof(void *);
++ const void * const *from = (const void * const *)src;
++ const void **to = (const void **)target;
++
++ for (i = 0; i < n; i++)
++ if (from[i])
++ to[i] = from[i];
+ }
+
+-void hypervisor_pin_vcpu(int cpu)
++void __init init_hypervisor_platform(void)
+ {
+- if (!x86_hyper)
++ const struct hypervisor_x86 *h;
++
++ h = detect_hypervisor_vendor();
++
++ if (!h)
+ return;
+
+- if (x86_hyper->pin_vcpu)
+- x86_hyper->pin_vcpu(cpu);
+- else
+- WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
++ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
++ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
++
++ x86_hyper = h;
++ x86_init.hyper.init_platform();
+ }
+--- a/arch/x86/kernel/cpu/mshyperv.c
++++ b/arch/x86/kernel/cpu/mshyperv.c
+@@ -257,6 +257,6 @@ static void __init ms_hyperv_init_platfo
+ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
+- .init_platform = ms_hyperv_init_platform,
++ .init.init_platform = ms_hyperv_init_platform,
+ };
+ EXPORT_SYMBOL(x86_hyper_ms_hyperv);
+--- a/arch/x86/kernel/cpu/vmware.c
++++ b/arch/x86/kernel/cpu/vmware.c
+@@ -208,7 +208,7 @@ static bool __init vmware_legacy_x2apic_
+ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+- .init_platform = vmware_platform_setup,
+- .x2apic_available = vmware_legacy_x2apic_available,
++ .init.init_platform = vmware_platform_setup,
++ .init.x2apic_available = vmware_legacy_x2apic_available,
+ };
+ EXPORT_SYMBOL(x86_hyper_vmware);
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -547,7 +547,7 @@ static uint32_t __init kvm_detect(void)
+ const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+ .name = "KVM",
+ .detect = kvm_detect,
+- .x2apic_available = kvm_para_available,
++ .init.x2apic_available = kvm_para_available,
+ };
+ EXPORT_SYMBOL_GPL(x86_hyper_kvm);
+
+--- a/arch/x86/kernel/x86_init.c
++++ b/arch/x86/kernel/x86_init.c
+@@ -28,6 +28,8 @@ void x86_init_noop(void) { }
+ void __init x86_init_uint_noop(unsigned int unused) { }
+ int __init iommu_init_noop(void) { return 0; }
+ void iommu_shutdown_noop(void) { }
++bool __init bool_x86_init_noop(void) { return false; }
++void x86_op_int_noop(int cpu) { }
+
+ /*
+ * The platform setup functions are preset with the default functions
+@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata
+ .init_irq = x86_default_pci_init_irq,
+ .fixup_irqs = x86_default_pci_fixup_irqs,
+ },
++
++ .hyper = {
++ .init_platform = x86_init_noop,
++ .x2apic_available = bool_x86_init_noop,
++ .init_mem_mapping = x86_init_noop,
++ },
+ };
+
+ struct x86_cpuinit_ops x86_cpuinit = {
+@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __r
+ .get_nmi_reason = default_get_nmi_reason,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
++ .hyper.pin_vcpu = x86_op_int_noop,
+ };
+
+ EXPORT_SYMBOL_GPL(x86_platform);
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -671,7 +671,7 @@ void __init init_mem_mapping(void)
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+- hypervisor_init_mem_mapping();
++ x86_init.hyper.init_mem_mapping();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+ }
+--- a/arch/x86/xen/enlighten_hvm.c
++++ b/arch/x86/xen/enlighten_hvm.c
+@@ -229,9 +229,9 @@ static uint32_t __init xen_platform_hvm(
+ const struct hypervisor_x86 x86_hyper_xen_hvm = {
+ .name = "Xen HVM",
+ .detect = xen_platform_hvm,
+- .init_platform = xen_hvm_guest_init,
+- .pin_vcpu = xen_pin_vcpu,
+- .x2apic_available = xen_x2apic_para_available,
+- .init_mem_mapping = xen_hvm_init_mem_mapping,
++ .init.init_platform = xen_hvm_guest_init,
++ .init.x2apic_available = xen_x2apic_para_available,
++ .init.init_mem_mapping = xen_hvm_init_mem_mapping,
++ .runtime.pin_vcpu = xen_pin_vcpu,
+ };
+ EXPORT_SYMBOL(x86_hyper_xen_hvm);
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -1462,6 +1462,6 @@ static uint32_t __init xen_platform_pv(v
+ const struct hypervisor_x86 x86_hyper_xen_pv = {
+ .name = "Xen PV",
+ .detect = xen_platform_pv,
+- .pin_vcpu = xen_pin_vcpu,
++ .runtime.pin_vcpu = xen_pin_vcpu,
+ };
+ EXPORT_SYMBOL(x86_hyper_xen_pv);
+--- a/include/linux/hypervisor.h
++++ b/include/linux/hypervisor.h
+@@ -7,8 +7,12 @@
+ * Juergen Gross <jgross@suse.com>
+ */
+
+-#ifdef CONFIG_HYPERVISOR_GUEST
+-#include <asm/hypervisor.h>
++#ifdef CONFIG_X86
++#include <asm/x86_init.h>
++static inline void hypervisor_pin_vcpu(int cpu)
++{
++ x86_platform.hyper.pin_vcpu(cpu);
++}
+ #else
+ static inline void hypervisor_pin_vcpu(int cpu)
+ {
--- /dev/null
+From f16b3da1dc936c0f8121741d0a1731bf242f2f56 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 2 Nov 2017 00:59:12 -0700
+Subject: x86/xen/64, x86/entry/64: Clean up SP code in cpu_initialize_context()
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit f16b3da1dc936c0f8121741d0a1731bf242f2f56 upstream.
+
+I'm removing thread_struct::sp0, and Xen's usage of it is slightly
+dubious and unnecessary. Use appropriate helpers instead.
+
+While we're at it, reorder the code slightly to make it more obvious
+what's going on.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/smp_pv.c | 17 ++++++++++++++---
+ 1 file changed, 14 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/xen/smp_pv.c
++++ b/arch/x86/xen/smp_pv.c
+@@ -14,6 +14,7 @@
+ * single-threaded.
+ */
+ #include <linux/sched.h>
++#include <linux/sched/task_stack.h>
+ #include <linux/err.h>
+ #include <linux/slab.h>
+ #include <linux/smp.h>
+@@ -294,12 +295,19 @@ cpu_initialize_context(unsigned int cpu,
+ #endif
+ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
++ /*
++ * Bring up the CPU in cpu_bringup_and_idle() with the stack
++ * pointing just below where pt_regs would be if it were a normal
++ * kernel entry.
++ */
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
++ ctxt->user_regs.cs = __KERNEL_CS;
++ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
+
+ xen_copy_trap_info(ctxt->trap_ctxt);
+
+@@ -314,8 +322,13 @@ cpu_initialize_context(unsigned int cpu,
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
+
++ /*
++ * Set SS:SP that Xen will use when entering guest kernel mode
++ * from guest user mode. Subsequent calls to load_sp0() can
++ * change this value.
++ */
+ ctxt->kernel_ss = __KERNEL_DS;
+- ctxt->kernel_sp = idle->thread.sp0;
++ ctxt->kernel_sp = task_top_of_stack(idle);
+
+ #ifdef CONFIG_X86_32
+ ctxt->event_callback_cs = __KERNEL_CS;
+@@ -327,10 +340,8 @@ cpu_initialize_context(unsigned int cpu,
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+- ctxt->user_regs.cs = __KERNEL_CS;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
+- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
+ BUG();
--- /dev/null
+From 43e4111086a70c78bedb6ad990bee97f17b27a6e Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Thu, 2 Nov 2017 00:59:07 -0700
+Subject: xen, x86/entry/64: Add xen NMI trap entry
+
+From: Juergen Gross <jgross@suse.com>
+
+commit 43e4111086a70c78bedb6ad990bee97f17b27a6e upstream.
+
+Instead of trying to execute any NMI via the bare metal's NMI trap
+handler, use a Xen-specific one for PV domains, like we do for e.g.
+debug traps. As the NMI is handled via the normal kernel stack in a
+PV domain, this is the correct thing to do.
+
+This will enable us to get rid of the very fragile and questionable
+dependencies between the bare metal NMI handler and Xen assumptions
+believed to be broken anyway.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bpetkov@suse.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/entry/entry_64.S | 2 +-
+ arch/x86/include/asm/traps.h | 2 +-
+ arch/x86/xen/enlighten_pv.c | 2 +-
+ arch/x86/xen/xen-asm_64.S | 2 +-
+ 4 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1079,6 +1079,7 @@ idtentry int3 do_int3 has_error_code
+ idtentry stack_segment do_stack_segment has_error_code=1
+
+ #ifdef CONFIG_XEN
++idtentry xennmi do_nmi has_error_code=0
+ idtentry xendebug do_debug has_error_code=0
+ idtentry xenint3 do_int3 has_error_code=0
+ #endif
+@@ -1241,7 +1242,6 @@ ENTRY(error_exit)
+ END(error_exit)
+
+ /* Runs on exception stack */
+-/* XXX: broken on Xen PV */
+ ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
+ /*
+--- a/arch/x86/include/asm/traps.h
++++ b/arch/x86/include/asm/traps.h
+@@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(v
+
+ #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+ asmlinkage void xen_divide_error(void);
++asmlinkage void xen_xennmi(void);
+ asmlinkage void xen_xendebug(void);
+ asmlinkage void xen_xenint3(void);
+-asmlinkage void xen_nmi(void);
+ asmlinkage void xen_overflow(void);
+ asmlinkage void xen_bounds(void);
+ asmlinkage void xen_invalid_op(void);
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -601,7 +601,7 @@ static struct trap_array_entry trap_arra
+ #ifdef CONFIG_X86_MCE
+ { machine_check, xen_machine_check, true },
+ #endif
+- { nmi, xen_nmi, true },
++ { nmi, xen_xennmi, true },
+ { overflow, xen_overflow, false },
+ #ifdef CONFIG_IA32_EMULATION
+ { entry_INT80_compat, xen_entry_INT80_compat, false },
+--- a/arch/x86/xen/xen-asm_64.S
++++ b/arch/x86/xen/xen-asm_64.S
+@@ -30,7 +30,7 @@ xen_pv_trap debug
+ xen_pv_trap xendebug
+ xen_pv_trap int3
+ xen_pv_trap xenint3
+-xen_pv_trap nmi
++xen_pv_trap xennmi
+ xen_pv_trap overflow
+ xen_pv_trap bounds
+ xen_pv_trap invalid_op