]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
KAISER: Kernel Address Isolation
author Richard Fellner <richard.fellner@student.tugraz.at>
Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.

More information about the patch can be found on:

        https://github.com/IAIK/KAISER

From: Richard Fellner <richard.fellner@student.tugraz.at>
From: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
Date: Thu, 4 May 2017 14:26:50 +0200
Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5

To: <linux-kernel@vger.kernel.org>
To: <kernel-hardening@lists.openwall.com>
Cc: <clementine.maurice@iaik.tugraz.at>
Cc: <moritz.lipp@iaik.tugraz.at>
Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Cc: Richard Fellner <richard.fellner@student.tugraz.at>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <kirill.shutemov@linux.intel.com>
Cc: <anders.fogh@gdata-adan.de>
After several recent works [1,2,3] KASLR on x86_64 was basically
considered dead by many researchers. We have been working on an
efficient but effective fix for this problem and found that not mapping
the kernel space when running in user mode is the solution to this
problem [4] (the corresponding paper [5] will be presented at ESSoS17).

With this RFC patch we allow anybody to configure their kernel with the
flag CONFIG_KAISER to add our defense mechanism.

If there are any questions we would love to answer them.
We also appreciate any comments!

Cheers,
Daniel (+ the KAISER team from Graz University of Technology)

[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf

[patch based also on
https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch]

Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
22 files changed:
arch/x86/entry/entry_64.S
arch/x86/entry/entry_64_compat.S
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/kaiser.h [new file with mode: 0644]
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/espfix_64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/irqinit.c
arch/x86/kernel/process.c
arch/x86/mm/Makefile
arch/x86/mm/kaiser.c [new file with mode: 0644]
arch/x86/mm/pageattr.c
arch/x86/mm/pgtable.c
include/asm-generic/vmlinux.lds.h
include/linux/percpu-defs.h
init/main.c
kernel/fork.c
security/Kconfig

index e7b0e7ff4c588e529d91e00df7544502824a1522..9467a2c4bc6086a9aad5c615caacfcea12a81ee7 100644 (file)
@@ -36,6 +36,7 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
+#include <asm/kaiser.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
         * it is too small to ever cause noticeable irq latency.
         */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * A hypervisor implementation might want to use a label
         * after the swapgs, so that it can do the swapgs
@@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath:
        movq    RIP(%rsp), %rcx
        movq    EFLAGS(%rsp), %r11
        RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
        movq    RSP(%rsp), %rsp
        USERGS_SYSRET64
 
@@ -323,10 +326,12 @@ return_from_SYSCALL_64:
 syscall_return_via_sysret:
        /* rcx and r11 are already restored (see code above) */
        RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
        movq    RSP(%rsp), %rsp
        USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -424,6 +429,7 @@ ENTRY(ret_from_fork)
        movq    %rsp, %rdi
        call    syscall_return_slowpath /* returns with IRQs disabled */
        TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_regs_and_iret
 
@@ -478,6 +484,7 @@ END(irq_entries_start)
         * tracking that we're in kernel mode.
         */
        SWAPGS
+       SWITCH_KERNEL_CR3
 
        /*
         * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -535,6 +542,7 @@ GLOBAL(retint_user)
        mov     %rsp,%rdi
        call    prepare_exit_to_usermode
        TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_regs_and_iret
 
@@ -612,6 +620,7 @@ native_irq_return_ldt:
 
        pushq   %rdi                            /* Stash user RDI */
        SWAPGS
+       SWITCH_KERNEL_CR3
        movq    PER_CPU_VAR(espfix_waddr), %rdi
        movq    %rax, (0*8)(%rdi)               /* user RAX */
        movq    (1*8)(%rsp), %rax               /* user RIP */
@@ -638,6 +647,7 @@ native_irq_return_ldt:
         * still points to an RO alias of the ESPFIX stack.
         */
        orq     PER_CPU_VAR(espfix_stack), %rax
+       SWITCH_USER_CR3
        SWAPGS
        movq    %rax, %rsp
 
@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
        testl   %edx, %edx
        js      1f                              /* negative -> in kernel */
        SWAPGS
+       SWITCH_KERNEL_CR3
        xorl    %ebx, %ebx
 1:     ret
 END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     paranoid_exit_no_swapgs
        TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3_NO_STACK
        SWAPGS_UNSAFE_STACK
        jmp     paranoid_exit_restore
 paranoid_exit_no_swapgs:
@@ -1084,6 +1096,7 @@ ENTRY(error_entry)
         * from user mode due to an IRET fault.
         */
        SWAPGS
+       SWITCH_KERNEL_CR3
 
 .Lerror_entry_from_usermode_after_swapgs:
        /*
@@ -1135,6 +1148,7 @@ ENTRY(error_entry)
         * Switch to kernel gsbase:
         */
        SWAPGS
+       SWITCH_KERNEL_CR3
 
        /*
         * Pretend that the exception came from user mode: set up pt_regs
@@ -1235,6 +1249,7 @@ ENTRY(nmi)
         */
 
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        cld
        movq    %rsp, %rdx
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1275,6 +1290,7 @@ ENTRY(nmi)
         * work, because we don't want to enable interrupts.  Fortunately,
         * do_nmi doesn't modify pt_regs.
         */
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_c_regs_and_iret
 
@@ -1486,6 +1502,7 @@ end_repeat_nmi:
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     nmi_restore
 nmi_swapgs:
+       SWITCH_USER_CR3_NO_STACK
        SWAPGS_UNSAFE_STACK
 nmi_restore:
        RESTORE_EXTRA_REGS
index e1721dafbcb13fab9230cc20d598b18ebef8306b..f0e384ee8fc69317090f556d1f98869292be90fa 100644 (file)
@@ -13,6 +13,7 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -48,6 +49,7 @@
 ENTRY(entry_SYSENTER_compat)
        /* Interrupts are off on entry. */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
        /*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
 ENTRY(entry_SYSCALL_compat)
        /* Interrupts are off on entry. */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
 
        /* Stash user ESP and switch to the kernel stack. */
        movl    %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
        xorq    %r8, %r8
        xorq    %r9, %r9
        xorq    %r10, %r10
+       SWITCH_USER_CR3
        movq    RSP-ORIG_RAX(%rsp), %rsp
        swapgs
        sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        ASM_CLAC                        /* Do this early to minimize exposure */
        SWAPGS
-
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * User tracing code (ptrace or signal handlers) might assume that
         * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)
 
        /* Go back to user mode. */
        TRACE_IRQS_ON
+       SWITCH_USER_CR3_NO_STACK
        SWAPGS
        jmp     restore_regs_and_iret
 END(entry_INT80_compat)
index b90e1053049bdd17ad36989315db8ac1b3ccc973..0817d63bce41e378743d4044662b0a6aa547379c 100644 (file)
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
 #define VECTOR_RETRIGGERED     ((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644 (file)
index 0000000..63ee830
--- /dev/null
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on the kernel virtual memory.
+ * It has a shadow pgd for every process. The shadow pgd has only a minimal set of kernel mappings,
+ * but includes the whole user memory. On a switch into kernel context, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory caches are flushed, and the user may not attack the whole kernel memory.
+ *
+ * The minimal kernel mapping holds the parts that need to be mapped in user mode, such as the
+ * entry/exit functions of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+/* Select the kernel pgd: clear bit 0x1000 of CR3. \reg is used as scratch and clobbered. */
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+/* Select the shadow (user) pgd: set bit 0x1000 of CR3. \reg is used as scratch and clobbered. */
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+/* Switch to the kernel pgd; %rax is preserved via the current stack. */
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+/* Switch to the shadow pgd; %rax is preserved via the current stack. */
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+/* Stack-free variant: %rax is parked in the per-cpu unsafe_stack_register_backup slot. */
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+/* Stack-free variant: %rax is parked in the per-cpu unsafe_stack_register_backup slot. */
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+/* KAISER disabled: same names and (empty) argument lists as above, expanding to nothing. */
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// On a kernel/user mode switch the address space may have to be switched
+// before any register has been saved, yet the switch itself needs a scratch
+// register. This per-cpu slot holds that register's value in the meantime
+// (used by the SWITCH_*_CR3_NO_STACK assembly macros); it is user-mapped so
+// that it stays accessible while the shadow page tables are active.
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is done on a global scope, so no bigger synchronization has to be done.
+ *  The pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped upon boot time.
+ *  Only the thread stacks have to be mapped on runtime.
+ *  The regions mapped here are not unmapped at all.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
index 437feb436efa666dbe13732c7d4269160fed49e3..4b479c9b064f8eb0e0e33439e676e4414ce6c696 100644 (file)
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       // Copy the same entries between the shadow pgds that belong to dst and src.
+       memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
index 1cc82ece9ac1819b92ec82aca72805c0e966af97..e6ea39fa6d0b7f66eda097de9cd59c8461f7a919 100644 (file)
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
        native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { /* set the PAGE_SIZE address bit -> slot in the shadow pgd */
+       return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+/* Inverse of native_get_shadow_pgd(): clear the PAGE_SIZE address bit to get the normal pgd slot. */
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+       return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_KAISER
+       // A pgd is page aligned, so an offset in the lower half of its page
+       // means a lower-half (user space) index. Those entries are mirrored
+       // unchanged into the shadow pgd; the store via pgdp drops _PAGE_USER.
+       if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+               native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+
+       pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
        *pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
index 8b4de22d64299e8997e8b12270e5c23112f85597..00fecbb153ac9519135d9bcdbecd73b3175ecce7 100644 (file)
 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL   (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_SOFTW1   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_DEVMAP   (_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
 
 #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
index 83db0eae99797da9fca22c59eee3fb1ed12ca373..3d4784e2f8624e5494909a8280459cea01d56ab3 100644 (file)
@@ -308,7 +308,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -335,6 +335,11 @@ union irq_stack_union {
                char gs_base[40];
                unsigned long stack_canary;
        };
+
+       struct {
+               char irq_stack_pointer[64];
+               char unused[IRQ_STACK_SIZE - 64];
+       };
 };
 
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
index 91588be529b968257db495630c5606ca78bcc5e7..3efde13eaed0c108fcf1f6ffc51e504941f3d6b5 100644 (file)
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
        /*
         * We need valid kernel segments for data and code in long mode too
@@ -1365,7 +1365,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
          [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
index 04f89caef9c4926c40092aed725db848adc2379e..9ff875a1aa24d9a05897f3434ae70caa6e66673b 100644 (file)
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
        /* Install the espfix pud into the kernel page directory */
        pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
        pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+       // add the esp stack pud to the shadow mapping here.
+       // This can be done directly, because the fixup stack has its own pud
+       set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
 
        /* Randomize the locations */
        init_espfix_random();
index b4421cc191b056727f8f8c0def78a750b319a1c4..9e849b520780e8357b1dff407e289479f1f6d7f8 100644 (file)
@@ -405,6 +405,14 @@ GLOBAL(early_recursion_flag)
        .balign PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_KAISER /* a pgd is paired with a shadow pgd one page above it, so align pgds to two pages */
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else /* no shadow pgd: plain page alignment is enough */
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
        i = 0 ;                                         \
@@ -414,7 +422,7 @@ GLOBAL(name)
        .endr
 
        __INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
        .fill   511,8,0
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
@@ -424,10 +432,10 @@ NEXT_PAGE(early_dynamic_pgts)
        .data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
-       .fill   512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+       .fill   2*512,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
index 1423ab1b0312269fd3130392a94a74db755cc2b5..f480b38a03c35b0e2ffa377d4d84aaa4f42f58c0 100644 (file)
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
        .flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
        [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
index 8e10e72bf6ee3c157e899fd483d6e5be5ffb85bc..a55b32007785dc4565247965528b6877abdccfbc 100644 (file)
@@ -41,7 +41,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
        .x86_tss = {
                .sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
index 96d2b847e09ea504fc3ac824d347651a3bc880b9..682c162333ba5cf86db160dc2fd13a2b039cbb4d 100644 (file)
@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
 obj-$(CONFIG_X86_INTEL_MPX)    += mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644 (file)
index 0000000..cf1bb92
--- /dev/null
@@ -0,0 +1,160 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * get_pa_from_mapping - look up the physical address behind a kernel virtual address
+ * @address: the virtual address, resolved via pgd_offset_k()
+ * Return: the physical address; BUGs if any level of the walk is not mapped.
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(address);
+       BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+       pud = pud_offset(pgd, address);
+       BUG_ON(pud_none(*pud));
+
+       if (pud_large(*pud)) { /* large pud mapping: frame base plus the offset inside it */
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+       }
+
+       pmd = pmd_offset(pud, address);
+       BUG_ON(pmd_none(*pmd));
+
+       if (pmd_large(*pmd)) { /* large pmd mapping: frame base plus the offset inside it */
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+       }
+
+       pte = pte_offset_kernel(pmd, address);
+       BUG_ON(pte_none(*pte));
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+/* Map the pages of [start_addr, start_addr + size) into the shadow page tables with @flags. */
+void _kaiser_copy (unsigned long start_addr, unsigned long size, unsigned long flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       void *table;
+       unsigned long address, target_address;
+       unsigned long end_addr = start_addr + size;
+
+       for (address = start_addr & PAGE_MASK; address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               pgd = native_get_shadow_pgd(pgd_offset_k(address));
+               BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+               BUG_ON(pgd_large(*pgd));
+
+               pud = pud_offset(pgd, address);
+               if (pud_none(*pud)) {
+                       table = pmd_alloc_one(0, address);
+                       BUG_ON(!table); /* never install __pa(NULL) as a page table */
+                       set_pud(pud, __pud(_PAGE_TABLE | __pa(table)));
+               }
+               BUG_ON(pud_large(*pud));
+
+               pmd = pmd_offset(pud, address);
+               if (pmd_none(*pmd)) {
+                       table = pte_alloc_one_kernel(0, address);
+                       BUG_ON(!table); /* same as above, for the pte table */
+                       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(table)));
+               }
+               BUG_ON(pmd_large(*pmd));
+
+               pte = pte_offset_kernel(pmd, address);
+               if (pte_none(*pte))
+                       set_pte(pte, __pte(flags | target_address));
+               else /* already mapped: must be the same frame (pte_page() yields a struct page *, not an address) */
+                       BUG_ON((pte_pfn(*pte) << PAGE_SHIFT) != target_address);
+       }
+}
+
+// First, give every kernel-half entry of the shadow pgd its own pud page.
+static inline void __init _kaiser_init(void)
+{
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+       int i;
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pud_t *pud = pud_alloc_one(0, 0);
+               BUG_ON(!pud); /* never install __pa(NULL) as a pud */
+               set_pgd(pgd + i, __pgd(_PAGE_TABLE | __pa(pud)));
+       }
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+/* Build the boot-time shadow mapping: shadow puds, user-mapped per-cpu data, entry text and the IDT. */
+void __init kaiser_init(void)
+{
+       int cpu;
+       spin_lock_init(&shadow_table_lock);
+       spin_lock(&shadow_table_lock);
+
+       _kaiser_init();
+
+       for_each_possible_cpu(cpu) {
+               // map this cpu's copy of the user-mapped per-cpu variables
+               _kaiser_copy(
+                               (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+                               (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+                               __PAGE_KERNEL);
+       }
+
+       // map the entry/exit text section, which is responsible for switching between user and kernel mode
+       _kaiser_copy(
+                       (unsigned long) __entry_text_start,
+                       (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+                       __PAGE_KERNEL_RX);
+
+       // map the IDT at the address found in idt_descr, read-only
+       _kaiser_copy(
+                       (unsigned long) idt_descr.address,
+                       sizeof(gate_desc) * NR_VECTORS,
+                       __PAGE_KERNEL_RO);
+
+       spin_unlock(&shadow_table_lock);
+}
+
+// Add a range to the shadow mapping; shadow_table_lock serializes this against kaiser_remove_mapping() and other additions.
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       spin_lock(&shadow_table_lock);
+       _kaiser_copy(addr, size, flags); /* NOTE(review): may allocate page tables under the spinlock - confirm this cannot sleep */
+       spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+void kaiser_remove_mapping(unsigned long start, unsigned long size) /* unmap [start, start + size) from the shadow tables */
+{
+       unsigned long addr, end = start + size;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+       spin_lock(&shadow_table_lock);
+       for (addr = start; addr < end; addr = pgd_addr_end(addr, end), pgd++)
+               unmap_pud_range(pgd, addr, pgd_addr_end(addr, end)); /* only this pgd slot's share of the range */
+       spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
index e3353c97d0862d2a20bdd060fc229af2de8324bb..c17412f92d77cb4040d66f38f820a4d86b6a39f9 100644 (file)
@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                        pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 {
        pud_t *pud = pud_offset(pgd, start);
 
index 3feec5af4e67c096b9bd663edc4a94fb587f67bb..27d218b385383e44cd2f548fd2ae67bd34485ecf 100644 (file)
@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd)
 #else
 static inline pgd_t *_pgd_alloc(void)
 {
+#ifdef CONFIG_KAISER
+       // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+       // block. Therefore, we have to allocate at least 3 pages. However, the
+       // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+       // the beginning of the page of our 8kb-aligned memory block in order to
+       // correctly free it afterwars.
+
+       unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+       if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+       {
+               *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+               return (pgd_t *) pages;
+       }
+       else
+       {
+               *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+               return (pgd_t *) (pages + PAGE_SIZE);
+       }
+#else
        return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
+#ifdef CONFIG_KAISER /* _pgd_alloc() stored the base of the 4-page allocation two pages past pgd */
+  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+       free_pages(pages, get_order(4*PAGE_SIZE));
+#else
        free_page((unsigned long)pgd);
+#endif
 }
 #endif /* CONFIG_X86_PAE */
 
index dc81e5287ebfb22af52148c9dcc7a9194d5d023e..d6ab144147760790054c6f8782ddb8438dcfb486 100644 (file)
  */
 #define PERCPU_INPUT(cacheline)                                                \
        VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
-       *(.data..percpu..first)                                         \
+       \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;        \
+       *(.data..percpu..first)           \
+       . = ALIGN(cacheline);           \
+       *(.data..percpu..user_mapped)            \
+       *(.data..percpu..user_mapped..shared_aligned)        \
+       . = ALIGN(PAGE_SIZE);           \
+       *(.data..percpu..user_mapped..page_aligned)          \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;        \
+       \
        . = ALIGN(PAGE_SIZE);                                           \
        *(.data..percpu..page_aligned)                                  \
        . = ALIGN(cacheline);                                           \
index 8f16299ca0683f426e9a1dfb21ab68b244655166..8ea945f63a05db9af98ebcd75ce272f9d3fd1ec9 100644 (file)
 
 #endif
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped" /* section-name infix used by the *_USER_MAPPED per-cpu macros */
+#else
+#define USER_MAPPED_SECTION "" /* no infix: the variables land in the regular per-cpu sections */
+#endif
+
 /*
  * Base implementations of per-CPU variable declarations and definitions, where
  * the section in which the variable is to be placed is provided by the
 #define DEFINE_PER_CPU(type, name)                                     \
        DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                 \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                  \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
        DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
        ____cacheline_aligned
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
        DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
        __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)      \
+  DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")   \
+  __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)       \
+  DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")    \
+  __aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
index 25bac88bc66e8d9a2e1d96b6891f061b552f0609..2c009f77e6553dd3548b4b2ec33125f32f96f7ec 100644 (file)
@@ -86,6 +86,9 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#endif
 
 static int kernel_init(void *);
 
@@ -473,6 +476,9 @@ static void __init mm_init(void)
        pgtable_init();
        vmalloc_init();
        ioremap_huge_init();
+#ifdef CONFIG_KAISER
+       kaiser_init();
+#endif
 }
 
 asmlinkage __visible void __init start_kernel(void)
index 9321b1ad3335ade560477040df46b745c78a028d..4014be1dd2b6b25f7fa6e4ce4d5b811652b93c0e 100644 (file)
@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 #endif
 }
 
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
 static inline void free_thread_stack(struct task_struct *tsk)
 {
+#ifdef CONFIG_KAISER
+       kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+#endif
 #ifdef CONFIG_VMAP_STACK
        if (task_stack_vm_area(tsk)) {
                unsigned long flags;
@@ -468,6 +472,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
 }
 
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
@@ -495,6 +500,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         * functions again.
         */
        tsk->stack = stack;
+#ifdef CONFIG_KAISER
+       kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
 #ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = stack_vm_area;
 #endif
index 118f4549404ef2ed0241e86faceb03f2d3646d79..f515ac302257d2bfe9242e41f08edafdbf87d8c9 100644 (file)
@@ -30,6 +30,13 @@ config SECURITY
          model will be used.
 
          If you are unsure how to answer this question, answer N.
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       depends on X86_64
+       depends on !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation in order to close
+         hardware side channels on kernel address information.
 
 config SECURITYFS
        bool "Enable the securityfs filesystem"