+++ /dev/null
-From 7ed6fb9b5a5510e4ef78ab27419184741169978a Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Wed, 21 May 2014 10:22:59 -0700
-Subject: Revert "x86-64, modify_ldt: Make support for 16-bit segments a runtime option"
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 7ed6fb9b5a5510e4ef78ab27419184741169978a upstream.
-
-This reverts commit fa81511bb0bbb2b1aace3695ce869da9762624ff in
-preparation of merging in the proper fix (espfix64).
-
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/kernel/ldt.c | 4 +---
- arch/x86/vdso/vdso32-setup.c | 8 --------
- 2 files changed, 1 insertion(+), 11 deletions(-)
-
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -20,8 +20,6 @@
- #include <asm/mmu_context.h>
- #include <asm/syscalls.h>
-
--int sysctl_ldt16 = 0;
--
- #ifdef CONFIG_SMP
- static void flush_ldt(void *current_mm)
- {
-@@ -236,7 +234,7 @@ static int write_ldt(void __user *ptr, u
- * IRET leaking the high bits of the kernel stack address.
- */
- #ifdef CONFIG_X86_64
-- if (!ldt_info.seg_32bit && !sysctl_ldt16) {
-+ if (!ldt_info.seg_32bit) {
- error = -EINVAL;
- goto out_unlock;
- }
---- a/arch/x86/vdso/vdso32-setup.c
-+++ b/arch/x86/vdso/vdso32-setup.c
-@@ -41,7 +41,6 @@ enum {
- #ifdef CONFIG_X86_64
- #define vdso_enabled sysctl_vsyscall32
- #define arch_setup_additional_pages syscall32_setup_pages
--extern int sysctl_ldt16;
- #endif
-
- /*
-@@ -380,13 +379,6 @@ static ctl_table abi_table2[] = {
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
-- },
-- {
-- .procname = "ldt16",
-- .data = &sysctl_ldt16,
-- .maxlen = sizeof(int),
-- .mode = 0644,
-- .proc_handler = proc_dointvec
- },
- {}
- };
+++ /dev/null
-revert-x86-64-modify_ldt-make-support-for-16-bit-segments-a-runtime-option.patch
-x86-64-espfix-don-t-leak-bits-31-16-of-esp-returning-to-16-bit-stack.patch
-x86-espfix-move-espfix-definitions-into-a-separate-header-file.patch
-x86-espfix-fix-broken-header-guard.patch
-x86-espfix-make-espfix64-a-kconfig-option-fix-uml.patch
-x86-espfix-make-it-possible-to-disable-16-bit-support.patch
+++ /dev/null
-From 3891a04aafd668686239349ea58f3314ea2af86b Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Tue, 29 Apr 2014 16:46:09 -0700
-Subject: x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit 3891a04aafd668686239349ea58f3314ea2af86b upstream.
-
-The IRET instruction, when returning to a 16-bit segment, only
-restores the bottom 16 bits of the user space stack pointer. This
-causes some 16-bit software to break, but it also leaks kernel state
-to user space. We have a software workaround for that ("espfix") for
-the 32-bit kernel, but it relies on a nonzero stack segment base which
-is not available in 64-bit mode.
-
-In checkin:
-
- b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels
-
-we "solved" this by forbidding 16-bit segments on 64-bit kernels, with
-the logic that 16-bit support is crippled on 64-bit kernels anyway (no
-V86 support), but it turns out that people are doing stuff like
-running old Win16 binaries under Wine and expect it to work.
-
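A minimal user-space probe makes the symptom concrete. The sketch below
is illustrative only (not part of any patch in this queue) and assumes
glibc's syscall() wrapper plus the uapi <asm/ldt.h> header:

	/* Try to install a 16-bit LDT segment.  On x86-64 kernels
	 * carrying b3b42ac2cbae this fails with EINVAL; with espfix64
	 * merged it succeeds again. */
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/ldt.h>

	int main(void)
	{
		struct user_desc desc;

		memset(&desc, 0, sizeof(desc));
		desc.entry_number = 0;
		desc.limit        = 0xffff;	/* classic 64K 16-bit limit */
		desc.seg_32bit    = 0;		/* the bit write_ldt() rejects */
		desc.contents     = MODIFY_LDT_CONTENTS_DATA;

		if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
			printf("16-bit segment rejected: %s\n", strerror(errno));
		else
			printf("16-bit LDT segment installed\n");
		return 0;
	}
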
-We work around this by creating percpu "ministacks", each of which
-is mapped 2^16 times 64K apart. When we detect that the return SS is
-on the LDT, we copy the IRET frame to the ministack and use the
-relevant alias to return to userspace. The ministacks are mapped
-readonly, so if IRET faults we promote #GP to #DF which is an IST
-vector and thus has its own stack; we then do the fixup in the #DF
-handler.
-
-(Making #GP an IST exception would make the msr_safe functions unsafe
-in NMI/MC context, and quite possibly have other effects.)
-
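The alias arithmetic is compact enough to sketch in C; the ministack
base below is hypothetical, and the real work is done by the andl/orq
pair in irq_return_ldt in the hunk that follows:

	/* Pick the ministack alias whose bits 31:16 equal those of the
	 * user RSP.  IRET to a 16-bit SS restores only bits 15:0 of the
	 * stack pointer, so the stale upper bits it leaves behind are
	 * then the user's own rather than kernel stack bits. */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t espfix_alias(uint64_t ministack, uint64_t user_rsp)
	{
		/* ministack addresses have bits 31:16 clear by construction */
		return ministack | (user_rsp & 0xffff0000);
	}

	int main(void)
	{
		uint64_t base = 0xffffff0000001040ULL;	/* hypothetical slot */
		uint64_t user_rsp = 0x0804abcdULL;
		uint64_t rsp = espfix_alias(base, user_rsp);

		printf("kernel RSP at IRET: %#llx\n", (unsigned long long)rsp);
		printf("high half left in user ESP: %#llx\n",
		       (unsigned long long)(rsp & 0xffff0000));
		return 0;
	}
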
-Special thanks to:
-
-- Andy Lutomirski, for the suggestion of using very small stack slots
-  and copying (as opposed to mapping) the IRET frame there, and for the
- suggestion to mark them readonly and let the fault promote to #DF.
-- Konrad Wilk for paravirt fixup and testing.
-- Borislav Petkov for testing help and useful comments.
-
-Reported-by: Brian Gerst <brgerst@gmail.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-Cc: Borislav Petkov <bp@alien8.de>
-Cc: Andrew Lutomirski <amluto@gmail.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Dirk Hohndel <dirk@hohndel.org>
-Cc: Arjan van de Ven <arjan.van.de.ven@intel.com>
-Cc: comex <comexk@gmail.com>
-Cc: Alexander van Heukelum <heukelum@fastmail.fm>
-Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
-Cc: <stable@vger.kernel.org> # consider after upstream merge
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- Documentation/x86/x86_64/mm.txt | 2
- arch/x86/include/asm/pgtable_64_types.h | 2
- arch/x86/include/asm/setup.h | 3
- arch/x86/kernel/Makefile | 1
- arch/x86/kernel/entry_64.S | 73 ++++++++++-
- arch/x86/kernel/espfix_64.c | 208 ++++++++++++++++++++++++++++++++
- arch/x86/kernel/ldt.c | 11 -
- arch/x86/kernel/smpboot.c | 7 +
- arch/x86/mm/dump_pagetables.c | 31 +++-
- init/main.c | 4
- 10 files changed, 316 insertions(+), 26 deletions(-)
-
---- a/Documentation/x86/x86_64/mm.txt
-+++ b/Documentation/x86/x86_64/mm.txt
-@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45
- ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
- ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
- ... unused hole ...
-+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-+... unused hole ...
- ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
- ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
- ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
---- a/arch/x86/include/asm/pgtable_64_types.h
-+++ b/arch/x86/include/asm/pgtable_64_types.h
-@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
- #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
- #define MODULES_END _AC(0xffffffffff000000, UL)
- #define MODULES_LEN (MODULES_END - MODULES_VADDR)
-+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
-
- #define EARLY_DYNAMIC_PAGE_TABLES 64
-
---- a/arch/x86/include/asm/setup.h
-+++ b/arch/x86/include/asm/setup.h
-@@ -60,6 +60,9 @@ extern void x86_ce4100_early_setup(void)
- static inline void x86_ce4100_early_setup(void) { }
- #endif
-
-+extern void init_espfix_bsp(void);
-+extern void init_espfix_ap(void);
-+
- #ifndef _SETUP
-
- /*
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -27,6 +27,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x86
- obj-y += syscall_$(BITS).o
- obj-$(CONFIG_X86_64) += vsyscall_64.o
- obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
-+obj-$(CONFIG_X86_64) += espfix_64.o
- obj-y += bootflag.o e820.o
- obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
- obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
---- a/arch/x86/kernel/entry_64.S
-+++ b/arch/x86/kernel/entry_64.S
-@@ -58,6 +58,7 @@
- #include <asm/asm.h>
- #include <asm/context_tracking.h>
- #include <asm/smap.h>
-+#include <asm/pgtable_types.h>
- #include <linux/err.h>
-
- /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-@@ -1055,8 +1056,16 @@ restore_args:
- RESTORE_ARGS 1,8,1
-
- irq_return:
-+ /*
-+ * Are we returning to a stack segment from the LDT? Note: in
-+ * 64-bit mode SS:RSP on the exception stack is always valid.
-+ */
-+ testb $4,(SS-RIP)(%rsp)
-+ jnz irq_return_ldt
-+
-+irq_return_iret:
- INTERRUPT_RETURN
-- _ASM_EXTABLE(irq_return, bad_iret)
-+ _ASM_EXTABLE(irq_return_iret, bad_iret)
-
- #ifdef CONFIG_PARAVIRT
- ENTRY(native_iret)
-@@ -1064,6 +1073,30 @@ ENTRY(native_iret)
- _ASM_EXTABLE(native_iret, bad_iret)
- #endif
-
-+irq_return_ldt:
-+ pushq_cfi %rax
-+ pushq_cfi %rdi
-+ SWAPGS
-+ movq PER_CPU_VAR(espfix_waddr),%rdi
-+ movq %rax,(0*8)(%rdi) /* RAX */
-+ movq (2*8)(%rsp),%rax /* RIP */
-+ movq %rax,(1*8)(%rdi)
-+ movq (3*8)(%rsp),%rax /* CS */
-+ movq %rax,(2*8)(%rdi)
-+ movq (4*8)(%rsp),%rax /* RFLAGS */
-+ movq %rax,(3*8)(%rdi)
-+ movq (6*8)(%rsp),%rax /* SS */
-+ movq %rax,(5*8)(%rdi)
-+ movq (5*8)(%rsp),%rax /* RSP */
-+ movq %rax,(4*8)(%rdi)
-+ andl $0xffff0000,%eax
-+ popq_cfi %rdi
-+ orq PER_CPU_VAR(espfix_stack),%rax
-+ SWAPGS
-+ movq %rax,%rsp
-+ popq_cfi %rax
-+ jmp irq_return_iret
-+
- .section .fixup,"ax"
- bad_iret:
- /*
-@@ -1127,9 +1160,41 @@ ENTRY(retint_kernel)
- call preempt_schedule_irq
- jmp exit_intr
- #endif
--
- CFI_ENDPROC
- END(common_interrupt)
-+
-+ /*
-+ * If IRET takes a fault on the espfix stack, then we
-+ * end up promoting it to a doublefault. In that case,
-+ * modify the stack to make it look like we just entered
-+ * the #GP handler from user space, similar to bad_iret.
-+ */
-+ ALIGN
-+__do_double_fault:
-+ XCPT_FRAME 1 RDI+8
-+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
-+ sarq $PGDIR_SHIFT,%rax
-+ cmpl $ESPFIX_PGD_ENTRY,%eax
-+ jne do_double_fault /* No, just deliver the fault */
-+ cmpl $__KERNEL_CS,CS(%rdi)
-+ jne do_double_fault
-+ movq RIP(%rdi),%rax
-+ cmpq $irq_return_iret,%rax
-+#ifdef CONFIG_PARAVIRT
-+ je 1f
-+ cmpq $native_iret,%rax
-+#endif
-+ jne do_double_fault /* This shouldn't happen... */
-+1:
-+ movq PER_CPU_VAR(kernel_stack),%rax
-+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
-+ movq %rax,RSP(%rdi)
-+ movq $0,(%rax) /* Missing (lost) #GP error code */
-+ movq $general_protection,RIP(%rdi)
-+ retq
-+ CFI_ENDPROC
-+END(__do_double_fault)
-+
- /*
- * End of kprobes section
- */
-@@ -1298,7 +1363,7 @@ zeroentry overflow do_overflow
- zeroentry bounds do_bounds
- zeroentry invalid_op do_invalid_op
- zeroentry device_not_available do_device_not_available
--paranoiderrorentry double_fault do_double_fault
-+paranoiderrorentry double_fault __do_double_fault
- zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
- errorentry invalid_TSS do_invalid_TSS
- errorentry segment_not_present do_segment_not_present
-@@ -1585,7 +1650,7 @@ error_sti:
- */
- error_kernelspace:
- incl %ebx
-- leaq irq_return(%rip),%rcx
-+ leaq irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- movl %ecx,%eax /* zero extend */
---- /dev/null
-+++ b/arch/x86/kernel/espfix_64.c
-@@ -0,0 +1,208 @@
-+/* ----------------------------------------------------------------------- *
-+ *
-+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms and conditions of the GNU General Public License,
-+ * version 2, as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope it will be useful, but WITHOUT
-+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-+ * more details.
-+ *
-+ * ----------------------------------------------------------------------- */
-+
-+/*
-+ * The IRET instruction, when returning to a 16-bit segment, only
-+ * restores the bottom 16 bits of the user space stack pointer. This
-+ * causes some 16-bit software to break, but it also leaks kernel state
-+ * to user space.
-+ *
-+ * We work around this by creating percpu "ministacks", each of which
-+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
-+ * on the LDT, we copy the IRET frame to the ministack and use the
-+ * relevant alias to return to userspace. The ministacks are mapped
-+ * readonly, so if IRET faults we promote #GP to #DF which is an IST
-+ * vector and thus has its own stack; we then do the fixup in the #DF
-+ * handler.
-+ *
-+ * This file sets up the ministacks and the related page tables. The
-+ * actual ministack invocation is in entry_64.S.
-+ */
-+
-+#include <linux/init.h>
-+#include <linux/init_task.h>
-+#include <linux/kernel.h>
-+#include <linux/percpu.h>
-+#include <linux/gfp.h>
-+#include <linux/random.h>
-+#include <asm/pgtable.h>
-+#include <asm/pgalloc.h>
-+#include <asm/setup.h>
-+
-+/*
-+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
-+ * it up to a cache line to avoid unnecessary sharing.
-+ */
-+#define ESPFIX_STACK_SIZE (8*8UL)
-+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
-+
-+/* There is address space for how many espfix pages? */
-+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
-+
-+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
-+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-+# error "Need more than one PGD for the ESPFIX hack"
-+#endif
-+
-+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
-+
-+/* This contains the *bottom* address of the espfix stack */
-+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
-+
-+/* Initialization mutex - should this be a spinlock? */
-+static DEFINE_MUTEX(espfix_init_mutex);
-+
-+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
-+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
-+static void *espfix_pages[ESPFIX_MAX_PAGES];
-+
-+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
-+ __aligned(PAGE_SIZE);
-+
-+static unsigned int page_random, slot_random;
-+
-+/*
-+ * This returns the bottom address of the espfix stack for a specific CPU.
-+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
-+ * we have to account for some amount of padding at the end of each page.
-+ */
-+static inline unsigned long espfix_base_addr(unsigned int cpu)
-+{
-+ unsigned long page, slot;
-+ unsigned long addr;
-+
-+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
-+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
-+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
-+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
-+ addr += ESPFIX_BASE_ADDR;
-+ return addr;
-+}
-+
-+#define PTE_STRIDE (65536/PAGE_SIZE)
-+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
-+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
-+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
-+
-+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
-+
-+static void init_espfix_random(void)
-+{
-+ unsigned long rand;
-+
-+ /*
-+ * This is run before the entropy pools are initialized,
-+ * but this is hopefully better than nothing.
-+ */
-+ if (!arch_get_random_long(&rand)) {
-+ /* The constant is an arbitrary large prime */
-+ rdtscll(rand);
-+ rand *= 0xc345c6b72fd16123UL;
-+ }
-+
-+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
-+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
-+ & (ESPFIX_PAGE_SPACE - 1);
-+}
-+
-+void __init init_espfix_bsp(void)
-+{
-+ pgd_t *pgd_p;
-+ pteval_t ptemask;
-+
-+ ptemask = __supported_pte_mask;
-+
-+ /* Install the espfix pud into the kernel page directory */
-+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
-+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-+
-+ /* Randomize the locations */
-+ init_espfix_random();
-+
-+ /* The rest is the same as for any other processor */
-+ init_espfix_ap();
-+}
-+
-+void init_espfix_ap(void)
-+{
-+ unsigned int cpu, page;
-+ unsigned long addr;
-+ pud_t pud, *pud_p;
-+ pmd_t pmd, *pmd_p;
-+ pte_t pte, *pte_p;
-+ int n;
-+ void *stack_page;
-+ pteval_t ptemask;
-+
-+ /* We only have to do this once... */
-+ if (likely(this_cpu_read(espfix_stack)))
-+ return; /* Already initialized */
-+
-+ cpu = smp_processor_id();
-+ addr = espfix_base_addr(cpu);
-+ page = cpu/ESPFIX_STACKS_PER_PAGE;
-+
-+ /* Did another CPU already set this up? */
-+ stack_page = ACCESS_ONCE(espfix_pages[page]);
-+ if (likely(stack_page))
-+ goto done;
-+
-+ mutex_lock(&espfix_init_mutex);
-+
-+ /* Did we race on the lock? */
-+ stack_page = ACCESS_ONCE(espfix_pages[page]);
-+ if (stack_page)
-+ goto unlock_done;
-+
-+ ptemask = __supported_pte_mask;
-+
-+ pud_p = &espfix_pud_page[pud_index(addr)];
-+ pud = *pud_p;
-+ if (!pud_present(pud)) {
-+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
-+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
-+ paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
-+ set_pud(&pud_p[n], pud);
-+ }
-+
-+ pmd_p = pmd_offset(&pud, addr);
-+ pmd = *pmd_p;
-+ if (!pmd_present(pmd)) {
-+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
-+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
-+ paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
-+ set_pmd(&pmd_p[n], pmd);
-+ }
-+
-+ pte_p = pte_offset_kernel(&pmd, addr);
-+ stack_page = (void *)__get_free_page(GFP_KERNEL);
-+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
-+ paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
-+ set_pte(&pte_p[n*PTE_STRIDE], pte);
-+
-+ /* Job is done for this CPU and any CPU which shares this page */
-+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
-+
-+unlock_done:
-+ mutex_unlock(&espfix_init_mutex);
-+done:
-+ this_cpu_write(espfix_stack, addr);
-+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
-+ + (addr & ~PAGE_MASK));
-+}
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, u
- }
- }
-
-- /*
-- * On x86-64 we do not support 16-bit segments due to
-- * IRET leaking the high bits of the kernel stack address.
-- */
--#ifdef CONFIG_X86_64
-- if (!ldt_info.seg_32bit) {
-- error = -EINVAL;
-- goto out_unlock;
-- }
--#endif
--
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
---- a/arch/x86/kernel/smpboot.c
-+++ b/arch/x86/kernel/smpboot.c
-@@ -265,6 +265,13 @@ notrace static void __cpuinit start_seco
- check_tsc_sync_target();
-
- /*
-+ * Enable the espfix hack for this CPU
-+ */
-+#ifdef CONFIG_X86_64
-+ init_espfix_ap();
-+#endif
-+
-+ /*
- * We need to hold vector_lock so that the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
---- a/arch/x86/mm/dump_pagetables.c
-+++ b/arch/x86/mm/dump_pagetables.c
-@@ -30,11 +30,13 @@ struct pg_state {
- unsigned long start_address;
- unsigned long current_address;
- const struct addr_marker *marker;
-+ unsigned long lines;
- };
-
- struct addr_marker {
- unsigned long start_address;
- const char *name;
-+ unsigned long max_lines;
- };
-
- /* indices for address_markers; keep sync'd w/ address_markers below */
-@@ -45,6 +47,7 @@ enum address_markers_idx {
- LOW_KERNEL_NR,
- VMALLOC_START_NR,
- VMEMMAP_START_NR,
-+ ESPFIX_START_NR,
- HIGH_KERNEL_NR,
- MODULES_VADDR_NR,
- MODULES_END_NR,
-@@ -67,6 +70,7 @@ static struct addr_marker address_marker
- { PAGE_OFFSET, "Low Kernel Mapping" },
- { VMALLOC_START, "vmalloc() Area" },
- { VMEMMAP_START, "Vmemmap" },
-+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
- { __START_KERNEL_map, "High Kernel Mapping" },
- { MODULES_VADDR, "Modules" },
- { MODULES_END, "End Modules" },
-@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m
- pgprot_t new_prot, int level)
- {
- pgprotval_t prot, cur;
-- static const char units[] = "KMGTPE";
-+ static const char units[] = "BKMGTPE";
-
- /*
- * If we have a "break" in the series, we need to flush the state that
-@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m
- st->current_prot = new_prot;
- st->level = level;
- st->marker = address_markers;
-+ st->lines = 0;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
-@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m
- /*
- * Now print the actual finished series
- */
-- seq_printf(m, "0x%0*lx-0x%0*lx ",
-- width, st->start_address,
-- width, st->current_address);
--
-- delta = (st->current_address - st->start_address) >> 10;
-- while (!(delta & 1023) && unit[1]) {
-- delta >>= 10;
-- unit++;
-+ if (!st->marker->max_lines ||
-+ st->lines < st->marker->max_lines) {
-+ seq_printf(m, "0x%0*lx-0x%0*lx ",
-+ width, st->start_address,
-+ width, st->current_address);
-+
-+ delta = (st->current_address - st->start_address) >> 10;
-+ while (!(delta & 1023) && unit[1]) {
-+ delta >>= 10;
-+ unit++;
-+ }
-+ seq_printf(m, "%9lu%c ", delta, *unit);
-+ printk_prot(m, st->current_prot, st->level);
- }
-- seq_printf(m, "%9lu%c ", delta, *unit);
-- printk_prot(m, st->current_prot, st->level);
-+ st->lines++;
-
- /*
- * We print markers for special areas of address space,
---- a/init/main.c
-+++ b/init/main.c
-@@ -606,6 +606,10 @@ asmlinkage void __init start_kernel(void
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi_enter_virtual_mode();
- #endif
-+#ifdef CONFIG_X86_64
-+ /* Should be run before the first non-init thread is created */
-+ init_espfix_bsp();
-+#endif
- thread_info_cache_init();
- cred_init();
- fork_init(totalram_pages);
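The sarq/cmpl test in __do_double_fault above reads more easily in C.
A sketch under this tree's constants (PGDIR_SHIFT is 39 here);
on_espfix_stack is an illustrative name, not a kernel symbol:

	/* An address lies in the espfix area iff its sign-extended PGD
	 * index is -2, the slot ESPFIX_BASE_ADDR is carved from.  Only
	 * then (and only if CS/RIP match the known IRET site) does the
	 * #DF handler rewrite the frame to re-enter general_protection. */
	#include <stdint.h>
	#include <stdbool.h>

	#define PGDIR_SHIFT	 39
	#define ESPFIX_PGD_ENTRY (-2L)

	static bool on_espfix_stack(uint64_t rsp)
	{
		/* arithmetic shift, matching the sarq in the asm */
		return ((int64_t)rsp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY;
	}
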
+++ /dev/null
-From 20b68535cd27183ebd3651ff313afb2b97dac941 Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Fri, 2 May 2014 11:33:51 -0700
-Subject: x86, espfix: Fix broken header guard
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit 20b68535cd27183ebd3651ff313afb2b97dac941 upstream.
-
-Header guard is #ifndef, not #ifdef...
-
-Reported-by: Fengguang Wu <fengguang.wu@intel.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/include/asm/espfix.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/arch/x86/include/asm/espfix.h
-+++ b/arch/x86/include/asm/espfix.h
-@@ -1,4 +1,4 @@
--#ifdef _ASM_X86_ESPFIX_H
-+#ifndef _ASM_X86_ESPFIX_H
- #define _ASM_X86_ESPFIX_H
-
- #ifdef CONFIG_X86_64
+++ /dev/null
-From 197725de65477bc8509b41388157c1a2283542bb Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Sun, 4 May 2014 10:00:49 -0700
-Subject: x86, espfix: Make espfix64 a Kconfig option, fix UML
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 197725de65477bc8509b41388157c1a2283542bb upstream.
-
-Make espfix64 a hidden Kconfig option. This fixes the x86-64 UML
-build, which broke because init_espfix_bsp() does not exist in UML:
-since UML uses its own Kconfig, this option does not appear in the
-UML build.
-
-This also makes it possible to make support for 16-bit segments a
-configuration option, for the people who want to minimize the size of
-the kernel.
-
-Reported-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Cc: Richard Weinberger <richard@nod.at>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/Kconfig | 4 ++++
- arch/x86/kernel/Makefile | 2 +-
- arch/x86/kernel/smpboot.c | 2 +-
- init/main.c | 2 +-
- 4 files changed, 7 insertions(+), 3 deletions(-)
-
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -956,6 +956,10 @@ config VM86
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
-
-+config X86_ESPFIX64
-+ def_bool y
-+ depends on X86_64
-+
- config TOSHIBA
- tristate "Toshiba Laptop support"
- depends on X86_32
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -27,7 +27,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x86
- obj-y += syscall_$(BITS).o
- obj-$(CONFIG_X86_64) += vsyscall_64.o
- obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
--obj-$(CONFIG_X86_64) += espfix_64.o
-+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
- obj-y += bootflag.o e820.o
- obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
- obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
---- a/arch/x86/kernel/smpboot.c
-+++ b/arch/x86/kernel/smpboot.c
-@@ -267,7 +267,7 @@ notrace static void __cpuinit start_seco
- /*
- * Enable the espfix hack for this CPU
- */
--#ifdef CONFIG_X86_64
-+#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
- #endif
-
---- a/init/main.c
-+++ b/init/main.c
-@@ -606,7 +606,7 @@ asmlinkage void __init start_kernel(void
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi_enter_virtual_mode();
- #endif
--#ifdef CONFIG_X86_64
-+#ifdef CONFIG_X86_ESPFIX64
- /* Should be run before the first non-init thread is created */
- init_espfix_bsp();
- #endif
+++ /dev/null
-From 34273f41d57ee8d854dcd2a1d754cbb546cb548f Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Sun, 4 May 2014 10:36:22 -0700
-Subject: x86, espfix: Make it possible to disable 16-bit support
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 34273f41d57ee8d854dcd2a1d754cbb546cb548f upstream.
-
-Embedded systems, which may be very memory-size-sensitive, are
-extremely unlikely to ever encounter any 16-bit software, so make it
-a CONFIG_EXPERT option to turn off support for any 16-bit software
-whatsoever.
-
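The ldt.c hunk below replaces the old #ifdef with IS_ENABLED(), so the
16-bit check is plain C that the compiler folds away. A sketch of the
pattern in kernel context (check_seg_width is a made-up name):

	#include <linux/kconfig.h>
	#include <linux/errno.h>

	/* IS_ENABLED(CONFIG_FOO) expands to a compile-time 0 or 1, so
	 * with CONFIG_X86_16BIT=y this branch disappears entirely, yet
	 * the body is still parsed and type-checked in every config. */
	static int check_seg_width(int seg_32bit)
	{
		if (!IS_ENABLED(CONFIG_X86_16BIT) && !seg_32bit)
			return -EINVAL;	/* 16-bit segments compiled out */
		return 0;
	}
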
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/Kconfig | 23 ++++++++++++++++++-----
- arch/x86/kernel/entry_32.S | 12 ++++++++++++
- arch/x86/kernel/entry_64.S | 8 ++++++++
- arch/x86/kernel/ldt.c | 5 +++++
- 4 files changed, 43 insertions(+), 5 deletions(-)
-
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -951,14 +951,27 @@ config VM86
- default y
- depends on X86_32
- ---help---
-- This option is required by programs like DOSEMU to run 16-bit legacy
-- code on X86 processors. It also may be needed by software like
-- XFree86 to initialize some video cards via BIOS. Disabling this
-- option saves about 6k.
-+ This option is required by programs like DOSEMU to run
-+ 16-bit real mode legacy code on x86 processors. It also may
-+ be needed by software like XFree86 to initialize some video
-+ cards via BIOS. Disabling this option saves about 6K.
-+
-+config X86_16BIT
-+ bool "Enable support for 16-bit segments" if EXPERT
-+ default y
-+ ---help---
-+ This option is required by programs like Wine to run 16-bit
-+ protected mode legacy code on x86 processors. Disabling
-+ this option saves about 300 bytes on i386, or around 6K text
-+	  plus 16K runtime memory on x86-64.
-+
-+config X86_ESPFIX32
-+ def_bool y
-+ depends on X86_16BIT && X86_32
-
- config X86_ESPFIX64
- def_bool y
-- depends on X86_64
-+ depends on X86_16BIT && X86_64
-
- config TOSHIBA
- tristate "Toshiba Laptop support"
---- a/arch/x86/kernel/entry_32.S
-+++ b/arch/x86/kernel/entry_32.S
-@@ -531,6 +531,7 @@ syscall_exit:
- restore_all:
- TRACE_IRQS_IRET
- restore_all_notrace:
-+#ifdef CONFIG_X86_ESPFIX32
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
-@@ -541,6 +542,7 @@ restore_all_notrace:
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-+#endif
- restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
- irq_return:
-@@ -553,6 +555,7 @@ ENTRY(iret_exc)
- .previous
- _ASM_EXTABLE(irq_return,iret_exc)
-
-+#ifdef CONFIG_X86_ESPFIX32
- CFI_RESTORE_STATE
- ldt_ss:
- #ifdef CONFIG_PARAVIRT
-@@ -596,6 +599,7 @@ ldt_ss:
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
-+#endif
- CFI_ENDPROC
- ENDPROC(system_call)
-
-@@ -708,6 +712,7 @@ END(syscall_badsys)
- * the high word of the segment base from the GDT and switches to the
- * normal stack and adjusts ESP with the matching offset.
- */
-+#ifdef CONFIG_X86_ESPFIX32
- /* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
-@@ -717,8 +722,10 @@ END(syscall_badsys)
- pushl_cfi %eax
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-+#endif
- .endm
- .macro UNWIND_ESPFIX_STACK
-+#ifdef CONFIG_X86_ESPFIX32
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
-@@ -729,6 +736,7 @@ END(syscall_badsys)
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
- 27:
-+#endif
- .endm
-
- /*
-@@ -1336,11 +1344,13 @@ END(debug)
- ENTRY(nmi)
- RING0_INT_FRAME
- ASM_CLAC
-+#ifdef CONFIG_X86_ESPFIX32
- pushl_cfi %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl_cfi %eax
- je nmi_espfix_stack
-+#endif
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl_cfi %eax
-@@ -1380,6 +1390,7 @@ nmi_debug_stack_check:
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-+#ifdef CONFIG_X86_ESPFIX32
- nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
-@@ -1401,6 +1412,7 @@ nmi_espfix_stack:
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
-+#endif
- CFI_ENDPROC
- END(nmi)
-
---- a/arch/x86/kernel/entry_64.S
-+++ b/arch/x86/kernel/entry_64.S
-@@ -1060,8 +1060,10 @@ irq_return:
- * Are we returning to a stack segment from the LDT? Note: in
- * 64-bit mode SS:RSP on the exception stack is always valid.
- */
-+#ifdef CONFIG_X86_ESPFIX64
- testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
-+#endif
-
- irq_return_iret:
- INTERRUPT_RETURN
-@@ -1073,6 +1075,7 @@ ENTRY(native_iret)
- _ASM_EXTABLE(native_iret, bad_iret)
- #endif
-
-+#ifdef CONFIG_X86_ESPFIX64
- irq_return_ldt:
- pushq_cfi %rax
- pushq_cfi %rdi
-@@ -1096,6 +1099,7 @@ irq_return_ldt:
- movq %rax,%rsp
- popq_cfi %rax
- jmp irq_return_iret
-+#endif
-
- .section .fixup,"ax"
- bad_iret:
-@@ -1169,6 +1173,7 @@ END(common_interrupt)
- * modify the stack to make it look like we just entered
- * the #GP handler from user space, similar to bad_iret.
- */
-+#ifdef CONFIG_X86_ESPFIX64
- ALIGN
- __do_double_fault:
- XCPT_FRAME 1 RDI+8
-@@ -1194,6 +1199,9 @@ __do_double_fault:
- retq
- CFI_ENDPROC
- END(__do_double_fault)
-+#else
-+# define __do_double_fault do_double_fault
-+#endif
-
- /*
- * End of kprobes section
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, u
- }
- }
-
-+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
-+ error = -EINVAL;
-+ goto out_unlock;
-+ }
-+
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+++ /dev/null
-From e1fe9ed8d2a4937510d0d60e20705035c2609aea Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Thu, 1 May 2014 14:12:23 -0700
-Subject: x86, espfix: Move espfix definitions into a separate header file
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit e1fe9ed8d2a4937510d0d60e20705035c2609aea upstream.
-
-Sparse warns that the percpu variables aren't declared before they are
-defined. Rather than hacking around it, move espfix definitions into
-a proper header file.
-
-Reported-by: Fengguang Wu <fengguang.wu@intel.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/include/asm/espfix.h | 16 ++++++++++++++++
- arch/x86/include/asm/setup.h | 5 ++---
- arch/x86/kernel/espfix_64.c | 1 +
- 3 files changed, 19 insertions(+), 3 deletions(-)
-
---- /dev/null
-+++ b/arch/x86/include/asm/espfix.h
-@@ -0,0 +1,16 @@
-+#ifdef _ASM_X86_ESPFIX_H
-+#define _ASM_X86_ESPFIX_H
-+
-+#ifdef CONFIG_X86_64
-+
-+#include <asm/percpu.h>
-+
-+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
-+
-+extern void init_espfix_bsp(void);
-+extern void init_espfix_ap(void);
-+
-+#endif /* CONFIG_X86_64 */
-+
-+#endif /* _ASM_X86_ESPFIX_H */
---- a/arch/x86/include/asm/setup.h
-+++ b/arch/x86/include/asm/setup.h
-@@ -60,11 +60,10 @@ extern void x86_ce4100_early_setup(void)
- static inline void x86_ce4100_early_setup(void) { }
- #endif
-
--extern void init_espfix_bsp(void);
--extern void init_espfix_ap(void);
--
- #ifndef _SETUP
-
-+#include <asm/espfix.h>
-+
- /*
- * This is set up by the setup-routine at boot-time
- */
---- a/arch/x86/kernel/espfix_64.c
-+++ b/arch/x86/kernel/espfix_64.c
-@@ -40,6 +40,7 @@
- #include <asm/pgtable.h>
- #include <asm/pgalloc.h>
- #include <asm/setup.h>
-+#include <asm/espfix.h>
-
- /*
- * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+++ /dev/null
-From 7ed6fb9b5a5510e4ef78ab27419184741169978a Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Wed, 21 May 2014 10:22:59 -0700
-Subject: Revert "x86-64, modify_ldt: Make support for 16-bit segments a runtime option"
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 7ed6fb9b5a5510e4ef78ab27419184741169978a upstream.
-
-This reverts commit fa81511bb0bbb2b1aace3695ce869da9762624ff in
-preparation of merging in the proper fix (espfix64).
-
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/kernel/ldt.c | 4 +---
- arch/x86/vdso/vdso32-setup.c | 8 --------
- 2 files changed, 1 insertion(+), 11 deletions(-)
-
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -20,8 +20,6 @@
- #include <asm/mmu_context.h>
- #include <asm/syscalls.h>
-
--int sysctl_ldt16 = 0;
--
- #ifdef CONFIG_SMP
- static void flush_ldt(void *current_mm)
- {
-@@ -236,7 +234,7 @@ static int write_ldt(void __user *ptr, u
- * IRET leaking the high bits of the kernel stack address.
- */
- #ifdef CONFIG_X86_64
-- if (!ldt_info.seg_32bit && !sysctl_ldt16) {
-+ if (!ldt_info.seg_32bit) {
- error = -EINVAL;
- goto out_unlock;
- }
---- a/arch/x86/vdso/vdso32-setup.c
-+++ b/arch/x86/vdso/vdso32-setup.c
-@@ -41,7 +41,6 @@ enum {
- #ifdef CONFIG_X86_64
- #define vdso_enabled sysctl_vsyscall32
- #define arch_setup_additional_pages syscall32_setup_pages
--extern int sysctl_ldt16;
- #endif
-
- /*
-@@ -380,13 +379,6 @@ static ctl_table abi_table2[] = {
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
-- },
-- {
-- .procname = "ldt16",
-- .data = &sysctl_ldt16,
-- .maxlen = sizeof(int),
-- .mode = 0644,
-- .proc_handler = proc_dointvec
- },
- {}
- };
+++ /dev/null
-revert-x86-64-modify_ldt-make-support-for-16-bit-segments-a-runtime-option.patch
-x86-64-espfix-don-t-leak-bits-31-16-of-esp-returning-to-16-bit-stack.patch
-x86-espfix-move-espfix-definitions-into-a-separate-header-file.patch
-x86-espfix-fix-broken-header-guard.patch
-x86-espfix-make-espfix64-a-kconfig-option-fix-uml.patch
-x86-espfix-make-it-possible-to-disable-16-bit-support.patch
+++ /dev/null
-From 3891a04aafd668686239349ea58f3314ea2af86b Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Tue, 29 Apr 2014 16:46:09 -0700
-Subject: x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit 3891a04aafd668686239349ea58f3314ea2af86b upstream.
-
-The IRET instruction, when returning to a 16-bit segment, only
-restores the bottom 16 bits of the user space stack pointer. This
-causes some 16-bit software to break, but it also leaks kernel state
-to user space. We have a software workaround for that ("espfix") for
-the 32-bit kernel, but it relies on a nonzero stack segment base which
-is not available in 64-bit mode.
-
-In checkin:
-
- b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels
-
-we "solved" this by forbidding 16-bit segments on 64-bit kernels, with
-the logic that 16-bit support is crippled on 64-bit kernels anyway (no
-V86 support), but it turns out that people are doing stuff like
-running old Win16 binaries under Wine and expect it to work.
-
-We work around this by creating percpu "ministacks", each of which
-is mapped 2^16 times 64K apart. When we detect that the return SS is
-on the LDT, we copy the IRET frame to the ministack and use the
-relevant alias to return to userspace. The ministacks are mapped
-readonly, so if IRET faults we promote #GP to #DF which is an IST
-vector and thus has its own stack; we then do the fixup in the #DF
-handler.
-
-(Making #GP an IST exception would make the msr_safe functions unsafe
-in NMI/MC context, and quite possibly have other effects.)
-
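The 2^16-fold aliasing sounds expensive, but the address-space budget
is small. A back-of-envelope check using the constants that
espfix_64.c defines below (plain user-space arithmetic, illustrative
only):

	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define PGDIR_SHIFT		39
	#define ESPFIX_STACK_SIZE	(8*8UL)		/* 64 bytes */
	#define ESPFIX_STACKS_PER_PAGE	((1UL << PAGE_SHIFT) / ESPFIX_STACK_SIZE)
	#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT - PAGE_SHIFT - 16))

	int main(void)
	{
		printf("stacks per page: %lu\n", ESPFIX_STACKS_PER_PAGE); /* 64 */
		printf("espfix pages:    %lu\n", ESPFIX_PAGE_SPACE);      /* 2048 */
		printf("max CPUs:        %lu\n",
		       ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE);       /* 131072 */
		return 0;
	}
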
-Special thanks to:
-
-- Andy Lutomirski, for the suggestion of using very small stack slots
-  and copying (as opposed to mapping) the IRET frame there, and for the
- suggestion to mark them readonly and let the fault promote to #DF.
-- Konrad Wilk for paravirt fixup and testing.
-- Borislav Petkov for testing help and useful comments.
-
-Reported-by: Brian Gerst <brgerst@gmail.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-Cc: Borislav Petkov <bp@alien8.de>
-Cc: Andrew Lutomirski <amluto@gmail.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Dirk Hohndel <dirk@hohndel.org>
-Cc: Arjan van de Ven <arjan.van.de.ven@intel.com>
-Cc: comex <comexk@gmail.com>
-Cc: Alexander van Heukelum <heukelum@fastmail.fm>
-Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
-Cc: <stable@vger.kernel.org> # consider after upstream merge
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- Documentation/x86/x86_64/mm.txt | 2
- arch/x86/include/asm/pgtable_64_types.h | 2
- arch/x86/include/asm/setup.h | 3
- arch/x86/kernel/Makefile | 1
- arch/x86/kernel/entry_64.S | 73 ++++++++++-
- arch/x86/kernel/espfix_64.c | 208 ++++++++++++++++++++++++++++++++
- arch/x86/kernel/ldt.c | 11 -
- arch/x86/kernel/smpboot.c | 7 +
- arch/x86/mm/dump_pagetables.c | 31 +++-
- init/main.c | 4
- 10 files changed, 316 insertions(+), 26 deletions(-)
-
---- a/Documentation/x86/x86_64/mm.txt
-+++ b/Documentation/x86/x86_64/mm.txt
-@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45
- ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
- ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
- ... unused hole ...
-+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-+... unused hole ...
- ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
- ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
-
---- a/arch/x86/include/asm/pgtable_64_types.h
-+++ b/arch/x86/include/asm/pgtable_64_types.h
-@@ -59,5 +59,7 @@ typedef struct { pteval_t pte; } pte_t;
- #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
- #define MODULES_END _AC(0xffffffffff000000, UL)
- #define MODULES_LEN (MODULES_END - MODULES_VADDR)
-+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
-
- #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
---- a/arch/x86/include/asm/setup.h
-+++ b/arch/x86/include/asm/setup.h
-@@ -59,6 +59,9 @@ extern void x86_ce4100_early_setup(void)
- static inline void x86_ce4100_early_setup(void) { }
- #endif
-
-+extern void init_espfix_bsp(void);
-+extern void init_espfix_ap(void);
-+
- #ifndef _SETUP
-
- /*
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -28,6 +28,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x86
- obj-y += syscall_$(BITS).o
- obj-$(CONFIG_X86_64) += vsyscall_64.o
- obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
-+obj-$(CONFIG_X86_64) += espfix_64.o
- obj-y += bootflag.o e820.o
- obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
- obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
---- a/arch/x86/kernel/entry_64.S
-+++ b/arch/x86/kernel/entry_64.S
-@@ -55,6 +55,7 @@
- #include <asm/paravirt.h>
- #include <asm/ftrace.h>
- #include <asm/percpu.h>
-+#include <asm/pgtable_types.h>
- #include <linux/err.h>
-
- /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-@@ -899,10 +900,18 @@ restore_args:
- RESTORE_ARGS 1,8,1
-
- irq_return:
-+ /*
-+ * Are we returning to a stack segment from the LDT? Note: in
-+ * 64-bit mode SS:RSP on the exception stack is always valid.
-+ */
-+ testb $4,(SS-RIP)(%rsp)
-+ jnz irq_return_ldt
-+
-+irq_return_iret:
- INTERRUPT_RETURN
-
- .section __ex_table, "a"
-- .quad irq_return, bad_iret
-+ .quad irq_return_iret, bad_iret
- .previous
-
- #ifdef CONFIG_PARAVIRT
-@@ -914,6 +923,30 @@ ENTRY(native_iret)
- .previous
- #endif
-
-+irq_return_ldt:
-+ pushq_cfi %rax
-+ pushq_cfi %rdi
-+ SWAPGS
-+ movq PER_CPU_VAR(espfix_waddr),%rdi
-+ movq %rax,(0*8)(%rdi) /* RAX */
-+ movq (2*8)(%rsp),%rax /* RIP */
-+ movq %rax,(1*8)(%rdi)
-+ movq (3*8)(%rsp),%rax /* CS */
-+ movq %rax,(2*8)(%rdi)
-+ movq (4*8)(%rsp),%rax /* RFLAGS */
-+ movq %rax,(3*8)(%rdi)
-+ movq (6*8)(%rsp),%rax /* SS */
-+ movq %rax,(5*8)(%rdi)
-+ movq (5*8)(%rsp),%rax /* RSP */
-+ movq %rax,(4*8)(%rdi)
-+ andl $0xffff0000,%eax
-+ popq_cfi %rdi
-+ orq PER_CPU_VAR(espfix_stack),%rax
-+ SWAPGS
-+ movq %rax,%rsp
-+ popq_cfi %rax
-+ jmp irq_return_iret
-+
- .section .fixup,"ax"
- bad_iret:
- /*
-@@ -977,9 +1010,41 @@ ENTRY(retint_kernel)
- call preempt_schedule_irq
- jmp exit_intr
- #endif
--
- CFI_ENDPROC
- END(common_interrupt)
-+
-+ /*
-+ * If IRET takes a fault on the espfix stack, then we
-+ * end up promoting it to a doublefault. In that case,
-+ * modify the stack to make it look like we just entered
-+ * the #GP handler from user space, similar to bad_iret.
-+ */
-+ ALIGN
-+__do_double_fault:
-+ XCPT_FRAME 1 RDI+8
-+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
-+ sarq $PGDIR_SHIFT,%rax
-+ cmpl $ESPFIX_PGD_ENTRY,%eax
-+ jne do_double_fault /* No, just deliver the fault */
-+ cmpl $__KERNEL_CS,CS(%rdi)
-+ jne do_double_fault
-+ movq RIP(%rdi),%rax
-+ cmpq $irq_return_iret,%rax
-+#ifdef CONFIG_PARAVIRT
-+ je 1f
-+ cmpq $native_iret,%rax
-+#endif
-+ jne do_double_fault /* This shouldn't happen... */
-+1:
-+ movq PER_CPU_VAR(kernel_stack),%rax
-+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
-+ movq %rax,RSP(%rdi)
-+ movq $0,(%rax) /* Missing (lost) #GP error code */
-+ movq $general_protection,RIP(%rdi)
-+ retq
-+ CFI_ENDPROC
-+END(__do_double_fault)
-+
- /*
- * End of kprobes section
- */
-@@ -1155,7 +1220,7 @@ zeroentry overflow do_overflow
- zeroentry bounds do_bounds
- zeroentry invalid_op do_invalid_op
- zeroentry device_not_available do_device_not_available
--paranoiderrorentry double_fault do_double_fault
-+paranoiderrorentry double_fault __do_double_fault
- zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
- errorentry invalid_TSS do_invalid_TSS
- errorentry segment_not_present do_segment_not_present
-@@ -1486,7 +1551,7 @@ error_sti:
- */
- error_kernelspace:
- incl %ebx
-- leaq irq_return(%rip),%rcx
-+ leaq irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- movl %ecx,%eax /* zero extend */
---- /dev/null
-+++ b/arch/x86/kernel/espfix_64.c
-@@ -0,0 +1,208 @@
-+/* ----------------------------------------------------------------------- *
-+ *
-+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms and conditions of the GNU General Public License,
-+ * version 2, as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope it will be useful, but WITHOUT
-+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-+ * more details.
-+ *
-+ * ----------------------------------------------------------------------- */
-+
-+/*
-+ * The IRET instruction, when returning to a 16-bit segment, only
-+ * restores the bottom 16 bits of the user space stack pointer. This
-+ * causes some 16-bit software to break, but it also leaks kernel state
-+ * to user space.
-+ *
-+ * We work around this by creating percpu "ministacks", each of which
-+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
-+ * on the LDT, we copy the IRET frame to the ministack and use the
-+ * relevant alias to return to userspace. The ministacks are mapped
-+ * readonly, so if IRET faults we promote #GP to #DF which is an IST
-+ * vector and thus has its own stack; we then do the fixup in the #DF
-+ * handler.
-+ *
-+ * This file sets up the ministacks and the related page tables. The
-+ * actual ministack invocation is in entry_64.S.
-+ */
-+
-+#include <linux/init.h>
-+#include <linux/init_task.h>
-+#include <linux/kernel.h>
-+#include <linux/percpu.h>
-+#include <linux/gfp.h>
-+#include <linux/random.h>
-+#include <asm/pgtable.h>
-+#include <asm/pgalloc.h>
-+#include <asm/setup.h>
-+
-+/*
-+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
-+ * it up to a cache line to avoid unnecessary sharing.
-+ */
-+#define ESPFIX_STACK_SIZE (8*8UL)
-+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
-+
-+/* There is address space for how many espfix pages? */
-+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
-+
-+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
-+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-+# error "Need more than one PGD for the ESPFIX hack"
-+#endif
-+
-+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
-+
-+/* This contains the *bottom* address of the espfix stack */
-+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
-+
-+/* Initialization mutex - should this be a spinlock? */
-+static DEFINE_MUTEX(espfix_init_mutex);
-+
-+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
-+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
-+static void *espfix_pages[ESPFIX_MAX_PAGES];
-+
-+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
-+ __aligned(PAGE_SIZE);
-+
-+static unsigned int page_random, slot_random;
-+
-+/*
-+ * This returns the bottom address of the espfix stack for a specific CPU.
-+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
-+ * we have to account for some amount of padding at the end of each page.
-+ */
-+static inline unsigned long espfix_base_addr(unsigned int cpu)
-+{
-+ unsigned long page, slot;
-+ unsigned long addr;
-+
-+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
-+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
-+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
-+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
-+ addr += ESPFIX_BASE_ADDR;
-+ return addr;
-+}
-+
-+#define PTE_STRIDE (65536/PAGE_SIZE)
-+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
-+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
-+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
-+
-+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
-+
-+static void init_espfix_random(void)
-+{
-+ unsigned long rand;
-+
-+ /*
-+ * This is run before the entropy pools are initialized,
-+ * but this is hopefully better than nothing.
-+ */
-+ if (!arch_get_random_long(&rand)) {
-+ /* The constant is an arbitrary large prime */
-+ rdtscll(rand);
-+ rand *= 0xc345c6b72fd16123UL;
-+ }
-+
-+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
-+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
-+ & (ESPFIX_PAGE_SPACE - 1);
-+}
-+
-+void __init init_espfix_bsp(void)
-+{
-+ pgd_t *pgd_p;
-+ pteval_t ptemask;
-+
-+ ptemask = __supported_pte_mask;
-+
-+ /* Install the espfix pud into the kernel page directory */
-+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
-+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-+
-+ /* Randomize the locations */
-+ init_espfix_random();
-+
-+ /* The rest is the same as for any other processor */
-+ init_espfix_ap();
-+}
-+
-+void init_espfix_ap(void)
-+{
-+ unsigned int cpu, page;
-+ unsigned long addr;
-+ pud_t pud, *pud_p;
-+ pmd_t pmd, *pmd_p;
-+ pte_t pte, *pte_p;
-+ int n;
-+ void *stack_page;
-+ pteval_t ptemask;
-+
-+ /* We only have to do this once... */
-+ if (likely(this_cpu_read(espfix_stack)))
-+ return; /* Already initialized */
-+
-+ cpu = smp_processor_id();
-+ addr = espfix_base_addr(cpu);
-+ page = cpu/ESPFIX_STACKS_PER_PAGE;
-+
-+ /* Did another CPU already set this up? */
-+ stack_page = ACCESS_ONCE(espfix_pages[page]);
-+ if (likely(stack_page))
-+ goto done;
-+
-+ mutex_lock(&espfix_init_mutex);
-+
-+ /* Did we race on the lock? */
-+ stack_page = ACCESS_ONCE(espfix_pages[page]);
-+ if (stack_page)
-+ goto unlock_done;
-+
-+ ptemask = __supported_pte_mask;
-+
-+ pud_p = &espfix_pud_page[pud_index(addr)];
-+ pud = *pud_p;
-+ if (!pud_present(pud)) {
-+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
-+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
-+ paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
-+ set_pud(&pud_p[n], pud);
-+ }
-+
-+ pmd_p = pmd_offset(&pud, addr);
-+ pmd = *pmd_p;
-+ if (!pmd_present(pmd)) {
-+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
-+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
-+ paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
-+ set_pmd(&pmd_p[n], pmd);
-+ }
-+
-+ pte_p = pte_offset_kernel(&pmd, addr);
-+ stack_page = (void *)__get_free_page(GFP_KERNEL);
-+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
-+ paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
-+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
-+ set_pte(&pte_p[n*PTE_STRIDE], pte);
-+
-+ /* Job is done for this CPU and any CPU which shares this page */
-+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
-+
-+unlock_done:
-+ mutex_unlock(&espfix_init_mutex);
-+done:
-+ this_cpu_write(espfix_stack, addr);
-+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
-+ + (addr & ~PAGE_MASK));
-+}
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, u
- }
- }
-
-- /*
-- * On x86-64 we do not support 16-bit segments due to
-- * IRET leaking the high bits of the kernel stack address.
-- */
--#ifdef CONFIG_X86_64
-- if (!ldt_info.seg_32bit) {
-- error = -EINVAL;
-- goto out_unlock;
-- }
--#endif
--
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
---- a/arch/x86/kernel/smpboot.c
-+++ b/arch/x86/kernel/smpboot.c
-@@ -271,6 +271,13 @@ notrace static void __cpuinit start_seco
- check_tsc_sync_target();
-
- /*
-+ * Enable the espfix hack for this CPU
-+ */
-+#ifdef CONFIG_X86_64
-+ init_espfix_ap();
-+#endif
-+
-+ /*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI recipients, and the time when the determination is made
---- a/arch/x86/mm/dump_pagetables.c
-+++ b/arch/x86/mm/dump_pagetables.c
-@@ -30,11 +30,13 @@ struct pg_state {
- unsigned long start_address;
- unsigned long current_address;
- const struct addr_marker *marker;
-+ unsigned long lines;
- };
-
- struct addr_marker {
- unsigned long start_address;
- const char *name;
-+ unsigned long max_lines;
- };
-
- /* indices for address_markers; keep sync'd w/ address_markers below */
-@@ -45,6 +47,7 @@ enum address_markers_idx {
- LOW_KERNEL_NR,
- VMALLOC_START_NR,
- VMEMMAP_START_NR,
-+ ESPFIX_START_NR,
- HIGH_KERNEL_NR,
- MODULES_VADDR_NR,
- MODULES_END_NR,
-@@ -67,6 +70,7 @@ static struct addr_marker address_marker
- { PAGE_OFFSET, "Low Kernel Mapping" },
- { VMALLOC_START, "vmalloc() Area" },
- { VMEMMAP_START, "Vmemmap" },
-+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
- { __START_KERNEL_map, "High Kernel Mapping" },
- { MODULES_VADDR, "Modules" },
- { MODULES_END, "End Modules" },
-@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m
- pgprot_t new_prot, int level)
- {
- pgprotval_t prot, cur;
-- static const char units[] = "KMGTPE";
-+ static const char units[] = "BKMGTPE";
-
- /*
- * If we have a "break" in the series, we need to flush the state that
-@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m
- st->current_prot = new_prot;
- st->level = level;
- st->marker = address_markers;
-+ st->lines = 0;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
-@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m
- /*
- * Now print the actual finished series
- */
-- seq_printf(m, "0x%0*lx-0x%0*lx ",
-- width, st->start_address,
-- width, st->current_address);
--
-- delta = (st->current_address - st->start_address) >> 10;
-- while (!(delta & 1023) && unit[1]) {
-- delta >>= 10;
-- unit++;
-+ if (!st->marker->max_lines ||
-+ st->lines < st->marker->max_lines) {
-+ seq_printf(m, "0x%0*lx-0x%0*lx ",
-+ width, st->start_address,
-+ width, st->current_address);
-+
-+ delta = (st->current_address - st->start_address) >> 10;
-+ while (!(delta & 1023) && unit[1]) {
-+ delta >>= 10;
-+ unit++;
-+ }
-+ seq_printf(m, "%9lu%c ", delta, *unit);
-+ printk_prot(m, st->current_prot, st->level);
- }
-- seq_printf(m, "%9lu%c ", delta, *unit);
-- printk_prot(m, st->current_prot, st->level);
-+ st->lines++;
-
- /*
- * We print markers for special areas of address space,
---- a/init/main.c
-+++ b/init/main.c
-@@ -606,6 +606,10 @@ asmlinkage void __init start_kernel(void
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi_enter_virtual_mode();
- #endif
-+#ifdef CONFIG_X86_64
-+ /* Should be run before the first non-init thread is created */
-+ init_espfix_bsp();
-+#endif
- thread_info_cache_init();
- cred_init();
- fork_init(totalram_pages);
+++ /dev/null
-From 20b68535cd27183ebd3651ff313afb2b97dac941 Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Fri, 2 May 2014 11:33:51 -0700
-Subject: x86, espfix: Fix broken header guard
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit 20b68535cd27183ebd3651ff313afb2b97dac941 upstream.
-
-Header guard is #ifndef, not #ifdef...
-
-Reported-by: Fengguang Wu <fengguang.wu@intel.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/include/asm/espfix.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/arch/x86/include/asm/espfix.h
-+++ b/arch/x86/include/asm/espfix.h
-@@ -1,4 +1,4 @@
--#ifdef _ASM_X86_ESPFIX_H
-+#ifndef _ASM_X86_ESPFIX_H
- #define _ASM_X86_ESPFIX_H
-
- #ifdef CONFIG_X86_64
+++ /dev/null
-From 197725de65477bc8509b41388157c1a2283542bb Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Sun, 4 May 2014 10:00:49 -0700
-Subject: x86, espfix: Make espfix64 a Kconfig option, fix UML
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 197725de65477bc8509b41388157c1a2283542bb upstream.
-
-Make espfix64 a hidden Kconfig option. This fixes the x86-64 UML
-build, which broke because init_espfix_bsp() does not exist in UML:
-since UML uses its own Kconfig, this option does not appear in the
-UML build.
-
-This also makes it possible to make support for 16-bit segments a
-configuration option, for the people who want to minimize the size of
-the kernel.
-
-Reported-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Cc: Richard Weinberger <richard@nod.at>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/Kconfig | 4 ++++
- arch/x86/kernel/Makefile | 2 +-
- arch/x86/kernel/smpboot.c | 2 +-
- init/main.c | 2 +-
- 4 files changed, 7 insertions(+), 3 deletions(-)
-
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -920,6 +920,10 @@ config VM86
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
-
-+config X86_ESPFIX64
-+ def_bool y
-+ depends on X86_64
-+
- config TOSHIBA
- tristate "Toshiba Laptop support"
- depends on X86_32
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -28,7 +28,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x86
- obj-y += syscall_$(BITS).o
- obj-$(CONFIG_X86_64) += vsyscall_64.o
- obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
--obj-$(CONFIG_X86_64) += espfix_64.o
-+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
- obj-y += bootflag.o e820.o
- obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
- obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
---- a/arch/x86/kernel/smpboot.c
-+++ b/arch/x86/kernel/smpboot.c
-@@ -273,7 +273,7 @@ notrace static void __cpuinit start_seco
- /*
- * Enable the espfix hack for this CPU
- */
--#ifdef CONFIG_X86_64
-+#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
- #endif
-
---- a/init/main.c
-+++ b/init/main.c
-@@ -606,7 +606,7 @@ asmlinkage void __init start_kernel(void
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi_enter_virtual_mode();
- #endif
--#ifdef CONFIG_X86_64
-+#ifdef CONFIG_X86_ESPFIX64
- /* Should be run before the first non-init thread is created */
- init_espfix_bsp();
- #endif
+++ /dev/null
-From 34273f41d57ee8d854dcd2a1d754cbb546cb548f Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@zytor.com>
-Date: Sun, 4 May 2014 10:36:22 -0700
-Subject: x86, espfix: Make it possible to disable 16-bit support
-
-From: "H. Peter Anvin" <hpa@zytor.com>
-
-commit 34273f41d57ee8d854dcd2a1d754cbb546cb548f upstream.
-
-Embedded systems, which may be very memory-size-sensitive, are
-extremely unlikely to ever encounter any 16-bit software, so add a
-CONFIG_EXPERT option to turn off support for 16-bit software
-entirely.
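-
-The check added to write_ldt() below relies on the IS_ENABLED() macro;
-a minimal sketch of the idiom (mirroring the ldt.c hunk, not replacing
-it): IS_ENABLED(CONFIG_FOO) expands to 1 when the option is set and 0
-otherwise, so the dead branch is discarded at compile time while still
-being parsed and type-checked, unlike an #ifdef block:
-
-	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
-		error = -EINVAL;	/* 16-bit segment, support configured out */
-		goto out_unlock;
-	}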
-
-Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/Kconfig | 23 ++++++++++++++++++-----
- arch/x86/kernel/entry_32.S | 12 ++++++++++++
- arch/x86/kernel/entry_64.S | 8 ++++++++
- arch/x86/kernel/ldt.c | 5 +++++
- 4 files changed, 43 insertions(+), 5 deletions(-)
-
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -915,14 +915,27 @@ config VM86
- default y
- depends on X86_32
- ---help---
-- This option is required by programs like DOSEMU to run 16-bit legacy
-- code on X86 processors. It also may be needed by software like
-- XFree86 to initialize some video cards via BIOS. Disabling this
-- option saves about 6k.
-+ This option is required by programs like DOSEMU to run
-+ 16-bit real mode legacy code on x86 processors. It also may
-+ be needed by software like XFree86 to initialize some video
-+ cards via BIOS. Disabling this option saves about 6K.
-+
-+config X86_16BIT
-+ bool "Enable support for 16-bit segments" if EXPERT
-+ default y
-+ ---help---
-+ This option is required by programs like Wine to run 16-bit
-+ protected mode legacy code on x86 processors. Disabling
-+ this option saves about 300 bytes on i386, or around 6K text
-+	  plus 16K runtime memory on x86-64.
-+
-+config X86_ESPFIX32
-+ def_bool y
-+ depends on X86_16BIT && X86_32
-
- config X86_ESPFIX64
- def_bool y
-- depends on X86_64
-+ depends on X86_16BIT && X86_64
-
- config TOSHIBA
- tristate "Toshiba Laptop support"
---- a/arch/x86/kernel/entry_32.S
-+++ b/arch/x86/kernel/entry_32.S
-@@ -524,6 +524,7 @@ syscall_exit:
- restore_all:
- TRACE_IRQS_IRET
- restore_all_notrace:
-+#ifdef CONFIG_X86_ESPFIX32
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
-@@ -534,6 +535,7 @@ restore_all_notrace:
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-+#endif
- restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
- irq_return:
-@@ -549,6 +551,7 @@ ENTRY(iret_exc)
- .long irq_return,iret_exc
- .previous
-
-+#ifdef CONFIG_X86_ESPFIX32
- CFI_RESTORE_STATE
- ldt_ss:
- #ifdef CONFIG_PARAVIRT
-@@ -592,6 +595,7 @@ ldt_ss:
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
-+#endif
- CFI_ENDPROC
- ENDPROC(system_call)
-
-@@ -765,6 +769,7 @@ ENDPROC(ptregs_clone)
- * the high word of the segment base from the GDT and switches to the
- * normal stack and adjusts ESP with the matching offset.
- */
-+#ifdef CONFIG_X86_ESPFIX32
- /* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
-@@ -774,8 +779,10 @@ ENDPROC(ptregs_clone)
- pushl_cfi %eax
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-+#endif
- .endm
- .macro UNWIND_ESPFIX_STACK
-+#ifdef CONFIG_X86_ESPFIX32
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
-@@ -786,6 +793,7 @@ ENDPROC(ptregs_clone)
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
- 27:
-+#endif
- .endm
-
- /*
-@@ -1317,11 +1325,13 @@ END(debug)
- */
- ENTRY(nmi)
- RING0_INT_FRAME
-+#ifdef CONFIG_X86_ESPFIX32
- pushl_cfi %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl_cfi %eax
- je nmi_espfix_stack
-+#endif
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl_cfi %eax
-@@ -1361,6 +1371,7 @@ nmi_debug_stack_check:
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-+#ifdef CONFIG_X86_ESPFIX32
- nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
-@@ -1382,6 +1393,7 @@ nmi_espfix_stack:
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
-+#endif
- CFI_ENDPROC
- END(nmi)
-
---- a/arch/x86/kernel/entry_64.S
-+++ b/arch/x86/kernel/entry_64.S
-@@ -904,8 +904,10 @@ irq_return:
- * Are we returning to a stack segment from the LDT? Note: in
- * 64-bit mode SS:RSP on the exception stack is always valid.
- */
-+#ifdef CONFIG_X86_ESPFIX64
- testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
-+#endif
-
- irq_return_iret:
- INTERRUPT_RETURN
-@@ -923,6 +925,7 @@ ENTRY(native_iret)
- .previous
- #endif
-
-+#ifdef CONFIG_X86_ESPFIX64
- irq_return_ldt:
- pushq_cfi %rax
- pushq_cfi %rdi
-@@ -946,6 +949,7 @@ irq_return_ldt:
- movq %rax,%rsp
- popq_cfi %rax
- jmp irq_return_iret
-+#endif
-
- .section .fixup,"ax"
- bad_iret:
-@@ -1019,6 +1023,7 @@ END(common_interrupt)
- * modify the stack to make it look like we just entered
- * the #GP handler from user space, similar to bad_iret.
- */
-+#ifdef CONFIG_X86_ESPFIX64
- ALIGN
- __do_double_fault:
- XCPT_FRAME 1 RDI+8
-@@ -1044,6 +1049,9 @@ __do_double_fault:
- retq
- CFI_ENDPROC
- END(__do_double_fault)
-+#else
-+# define __do_double_fault do_double_fault
-+#endif
-
- /*
- * End of kprobes section
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, u
- }
- }
-
-+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
-+ error = -EINVAL;
-+ goto out_unlock;
-+ }
-+
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+++ /dev/null
-From e1fe9ed8d2a4937510d0d60e20705035c2609aea Mon Sep 17 00:00:00 2001
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-Date: Thu, 1 May 2014 14:12:23 -0700
-Subject: x86, espfix: Move espfix definitions into a separate header file
-
-From: "H. Peter Anvin" <hpa@linux.intel.com>
-
-commit e1fe9ed8d2a4937510d0d60e20705035c2609aea upstream.
-
-Sparse warns that the percpu variables aren't declared before they are
-defined. Rather than hacking around it, move espfix definitions into
-a proper header file.
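-
-A minimal sketch of the arrangement this establishes (an illustration,
-not the literal patch): the header carries the declaration, and the
-defining translation unit includes it, so the definition is checked
-against a visible declaration and the sparse warning goes away:
-
-	/* arch/x86/include/asm/espfix.h */
-	DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-
-	/* arch/x86/kernel/espfix_64.c, which includes the header first */
-	#include <asm/espfix.h>
-	DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);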
-
-Reported-by: Fengguang Wu <fengguang.wu@intel.com>
-Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/include/asm/espfix.h | 16 ++++++++++++++++
- arch/x86/include/asm/setup.h | 5 ++---
- arch/x86/kernel/espfix_64.c | 1 +
- 3 files changed, 19 insertions(+), 3 deletions(-)
-
---- /dev/null
-+++ b/arch/x86/include/asm/espfix.h
-@@ -0,0 +1,16 @@
-+#ifdef _ASM_X86_ESPFIX_H
-+#define _ASM_X86_ESPFIX_H
-+
-+#ifdef CONFIG_X86_64
-+
-+#include <asm/percpu.h>
-+
-+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
-+
-+extern void init_espfix_bsp(void);
-+extern void init_espfix_ap(void);
-+
-+#endif /* CONFIG_X86_64 */
-+
-+#endif /* _ASM_X86_ESPFIX_H */
---- a/arch/x86/include/asm/setup.h
-+++ b/arch/x86/include/asm/setup.h
-@@ -59,11 +59,10 @@ extern void x86_ce4100_early_setup(void)
- static inline void x86_ce4100_early_setup(void) { }
- #endif
-
--extern void init_espfix_bsp(void);
--extern void init_espfix_ap(void);
--
- #ifndef _SETUP
-
-+#include <asm/espfix.h>
-+
- /*
- * This is set up by the setup-routine at boot-time
- */
---- a/arch/x86/kernel/espfix_64.c
-+++ b/arch/x86/kernel/espfix_64.c
-@@ -40,6 +40,7 @@
- #include <asm/pgtable.h>
- #include <asm/pgalloc.h>
- #include <asm/setup.h>
-+#include <asm/espfix.h>
-
- /*
- * Note: we only need 6*8 = 48 bytes for the espfix stack, but round