KAISER: Kernel Address Isolation

author Richard Fellner <richard.fellner@student.tugraz.at>

Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
author Richard Fellner <richard.fellner@student.tugraz.at>
Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S

index e7b0e7ff4c588e529d91e00df7544502824a1522..9467a2c4bc6086a9aad5c615caacfcea12a81ee7 100644 (file)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
  #include <asm/smap.h>
  #include <asm/pgtable_types.h>
  #include <asm/export.h>
+#include <asm/kaiser.h>
  #include <linux/err.h>
  
  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
          * it is too small to ever cause noticeable irq latency.
          */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         /*
          * A hypervisor implementation might want to use a label
          * after the swapgs, so that it can do the swapgs
@@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath:
         movq    RIP(%rsp), %rcx
         movq    EFLAGS(%rsp), %r11
         RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         USERGS_SYSRET64
  
@@ -323,10 +326,12 @@ return_from_SYSCALL_64:
  syscall_return_via_sysret:
         /* rcx and r11 are already restored (see code above) */
         RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         USERGS_SYSRET64
  
  opportunistic_sysret_failed:
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
  END(entry_SYSCALL_64)
@@ -424,6 +429,7 @@ ENTRY(ret_from_fork)
         movq    %rsp, %rdi
         call    syscall_return_slowpath /* returns with IRQs disabled */
         TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_regs_and_iret
  
@@ -478,6 +484,7 @@ END(irq_entries_start)
          * tracking that we're in kernel mode.
          */
         SWAPGS
+       SWITCH_KERNEL_CR3
  
         /*
          * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -535,6 +542,7 @@ GLOBAL(retint_user)
         mov     %rsp,%rdi
         call    prepare_exit_to_usermode
         TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_regs_and_iret
  
@@ -612,6 +620,7 @@ native_irq_return_ldt:
  
         pushq   %rdi                            /* Stash user RDI */
         SWAPGS
+       SWITCH_KERNEL_CR3
         movq    PER_CPU_VAR(espfix_waddr), %rdi
         movq    %rax, (0*8)(%rdi)               /* user RAX */
         movq    (1*8)(%rsp), %rax               /* user RIP */
@@ -638,6 +647,7 @@ native_irq_return_ldt:
          * still points to an RO alias of the ESPFIX stack.
          */
         orq     PER_CPU_VAR(espfix_stack), %rax
+       SWITCH_USER_CR3
         SWAPGS
         movq    %rax, %rsp
  
@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
         testl   %edx, %edx
         js      1f                              /* negative -> in kernel */
         SWAPGS
+       SWITCH_KERNEL_CR3
         xorl    %ebx, %ebx
  1:     ret
  END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     paranoid_exit_no_swapgs
         TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3_NO_STACK
         SWAPGS_UNSAFE_STACK
         jmp     paranoid_exit_restore
  paranoid_exit_no_swapgs:
@@ -1084,6 +1096,7 @@ ENTRY(error_entry)
          * from user mode due to an IRET fault.
          */
         SWAPGS
+       SWITCH_KERNEL_CR3
  
  .Lerror_entry_from_usermode_after_swapgs:
         /*
@@ -1135,6 +1148,7 @@ ENTRY(error_entry)
          * Switch to kernel gsbase:
          */
         SWAPGS
+       SWITCH_KERNEL_CR3
  
         /*
          * Pretend that the exception came from user mode: set up pt_regs
@@ -1235,6 +1249,7 @@ ENTRY(nmi)
          */
  
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         cld
         movq    %rsp, %rdx
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1275,6 +1290,7 @@ ENTRY(nmi)
          * work, because we don't want to enable interrupts.  Fortunately,
          * do_nmi doesn't modify pt_regs.
          */
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
  
@@ -1486,6 +1502,7 @@ end_repeat_nmi:
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     nmi_restore
  nmi_swapgs:
+       SWITCH_USER_CR3_NO_STACK
         SWAPGS_UNSAFE_STACK
  nmi_restore:
         RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S

index e1721dafbcb13fab9230cc20d598b18ebef8306b..f0e384ee8fc69317090f556d1f98869292be90fa 100644 (file)
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
  #include <asm/irqflags.h>
  #include <asm/asm.h>
  #include <asm/smap.h>
+#include <asm/kaiser.h>
  #include <linux/linkage.h>
  #include <linux/err.h>
  
@@ -48,6 +49,7 @@
  ENTRY(entry_SYSENTER_compat)
         /* Interrupts are off on entry. */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
         /*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
  ENTRY(entry_SYSCALL_compat)
         /* Interrupts are off on entry. */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
  
         /* Stash user ESP and switch to the kernel stack. */
         movl    %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
         xorq    %r8, %r8
         xorq    %r9, %r9
         xorq    %r10, %r10
+       SWITCH_USER_CR3
         movq    RSP-ORIG_RAX(%rsp), %rsp
         swapgs
         sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
         PARAVIRT_ADJUST_EXCEPTION_FRAME
         ASM_CLAC                        /* Do this early to minimize exposure */
         SWAPGS
-
+       SWITCH_KERNEL_CR3_NO_STACK
         /*
          * User tracing code (ptrace or signal handlers) might assume that
          * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)
  
         /* Go back to user mode. */
         TRACE_IRQS_ON
+       SWITCH_USER_CR3_NO_STACK
         SWAPGS
         jmp     restore_regs_and_iret
  END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index b90e1053049bdd17ad36989315db8ac1b3ccc973..0817d63bce41e378743d4044662b0a6aa547379c 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
  #define VECTOR_RETRIGGERED     ((void *)~0UL)
  
  typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
  
  #endif /* !ASSEMBLY_ */
  
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h

new file mode 100644 (file)

index 0000000..63ee830
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side channel attacks on the kernel virtual memory.
+ * It has a shadow pgd for every process. The shadow pgd has only a minimal set of kernel mappings,
+ * but includes the whole user memory. On a context switch into the kernel, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory caches are freed, and the user may not attack the whole kernel memory.
+ *
+ * The minimal kernel mapping holds the parts that need to be mapped in user mode, such as the entry/exit
+ * functions of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+/*
+ * _SWITCH_TO_KERNEL_CR3 reg - select the normal (kernel) page tables.
+ *
+ * Reads CR3 into \reg, clears bit 12 (0x1000) and writes it back.
+ * 0x1000 corresponds to the PAGE_SIZE offset (4 KiB pages) that
+ * native_get_shadow_pgd() ORs into a pgd pointer, so clearing it picks the
+ * first page of the pgd pair.  Clobbers \reg and the arithmetic flags.
+ */
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+/*
+ * _SWITCH_TO_USER_CR3 reg - select the shadow (user) page tables.
+ *
+ * Counterpart of _SWITCH_TO_KERNEL_CR3: sets bit 12 (0x1000) in CR3.
+ * Clobbers \reg and the arithmetic flags.
+ */
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+/*
+ * SWITCH_KERNEL_CR3 - switch to the kernel page tables, preserving %rax.
+ *
+ * %rax is saved on and restored from the current stack, so %rsp must point
+ * to a usable stack that is mapped both before and after the switch.
+ */
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+/*
+ * SWITCH_USER_CR3 - switch to the shadow page tables, preserving %rax.
+ *
+ * Same stack requirement as SWITCH_KERNEL_CR3.
+ */
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+/*
+ * SWITCH_KERNEL_CR3_NO_STACK - as SWITCH_KERNEL_CR3, but without using the stack.
+ *
+ * %rax is parked in the per-CPU variable unsafe_stack_register_backup
+ * instead of being pushed.  The variable is accessed through PER_CPU_VAR(),
+ * so every user in this patch places the macro after SWAPGS on entry.
+ */
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+/*
+ * SWITCH_USER_CR3_NO_STACK - as SWITCH_USER_CR3, but without using the stack.
+ *
+ * Uses the same per-CPU backup slot; every user in this patch places the
+ * macro before the SWAPGS that returns to the user GS base.
+ */
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+/*
+ * CONFIG_KAISER disabled: the CR3 switch macros expand to nothing.
+ * The stubs take no arguments, exactly like the real macros above, so an
+ * invocation assembles (or fails to assemble) identically in both
+ * configurations.
+ */
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// Upon kernel/user mode switch, it may happen that
+// the address space has to be switched before the registers have been stored.
+// To change the address space, another register is needed.
+// A register therefore has to be stored/restored.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is done on a global scope, so no bigger synchronization has to be done.
+ *  The pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped at boot time.
+ *  Only the thread stacks have to be mapped at runtime.
+ *  The regions mapped here are never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 437feb436efa666dbe13732c7d4269160fed49e3..4b479c9b064f8eb0e0e33439e676e4414ce6c696 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
  {
         memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       /*
+        * Clone the same entries of the shadow pgds as well.  The shadow pgd
+        * of each table is located by native_get_shadow_pgd(), which sets
+        * the PAGE_SIZE bit in the pgd address.
+        */
+       memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
  }
  
  #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h

index 1cc82ece9ac1819b92ec82aca72805c0e966af97..e6ea39fa6d0b7f66eda097de9cd59c8461f7a919 100644 (file)
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
         native_set_pud(pud, native_make_pud(0));
  }
  
+#ifdef CONFIG_KAISER
+/*
+ * The normal and the shadow pgd differ only in the PAGE_SIZE bit of their
+ * address.  Setting that bit in a pointer into either table yields the
+ * matching entry of the shadow pgd.
+ */
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       unsigned long entry_addr = (unsigned long)(void *)pgdp;
+
+       entry_addr |= (unsigned long)PAGE_SIZE;
+       return (pgd_t *)(void *)entry_addr;
+}
+
+/*
+ * Inverse of native_get_shadow_pgd(): clearing the PAGE_SIZE bit yields the
+ * matching entry of the normal (kernel) pgd.
+ */
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       unsigned long entry_addr = (unsigned long)(void *)pgdp;
+
+       entry_addr &= ~(unsigned long)PAGE_SIZE;
+       return (pgd_t *)(void *)entry_addr;
+}
+#endif /* CONFIG_KAISER */
+
  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
+#ifdef CONFIG_KAISER
+       /*
+        * A pgd is page aligned, so the offset of pgdp within its page tells
+        * which entry is written.  Entries in the lower half of the table
+        * are mirrored unmodified into the shadow pgd; entries in the upper
+        * half are not propagated here.
+        *
+        * When pgdp already points into the upper half of a shadow pgd (as
+        * in _kaiser_init()), the branch is skipped and only the store
+        * below runs.
+        */
+       if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+               native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+
+       /* The entry addressed by pgdp itself is stored with _PAGE_USER cleared. */
+       pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
         *pgdp = pgd;
+#endif
  }
  
  static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h

index 8b4de22d64299e8997e8b12270e5c23112f85597..00fecbb153ac9519135d9bcdbecd73b3175ecce7 100644 (file)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -45,7 +45,11 @@
  #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
  #define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
  #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL   (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
  #define _PAGE_SOFTW1   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
  #define _PAGE_SOFTW2   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
  #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -119,7 +123,11 @@
  #define _PAGE_DEVMAP   (_AT(pteval_t, 0))
  #endif
  
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
  
  #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                          _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 83db0eae99797da9fca22c59eee3fb1ed12ca373..3d4784e2f8624e5494909a8280459cea01d56ab3 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -308,7 +308,7 @@ struct tss_struct {
  
  } ____cacheline_aligned;
  
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
  
  #ifdef CONFIG_X86_32
  DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -335,6 +335,11 @@ union irq_stack_union {
                 char gs_base[40];
                 unsigned long stack_canary;
         };
+
+       struct {
+               char irq_stack_pointer[64];
+               char unused[IRQ_STACK_SIZE - 64];
+       };
  };
  
  DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 91588be529b968257db495630c5606ca78bcc5e7..3efde13eaed0c108fcf1f6ffc51e504941f3d6b5 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
  
  static const struct cpu_dev *this_cpu = &default_cpu;
  
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
  #ifdef CONFIG_X86_64
         /*
          * We need valid kernel segments for data and code in long mode too
@@ -1365,7 +1365,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
           [DEBUG_STACK - 1]                     = DEBUG_STKSZ
  };
  
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
         [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
  
  /* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c

index 04f89caef9c4926c40092aed725db848adc2379e..9ff875a1aa24d9a05897f3434ae70caa6e66673b 100644 (file)
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
  #include <asm/pgalloc.h>
  #include <asm/setup.h>
  #include <asm/espfix.h>
+#include <asm/kaiser.h>
  
  /*
   * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
         /* Install the espfix pud into the kernel page directory */
         pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
         pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+       // add the esp stack pud to the shadow mapping here.
+       // This can be done directly, because the fixup stack has its own pud
+       set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
  
         /* Randomize the locations */
         init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index b4421cc191b056727f8f8c0def78a750b319a1c4..9e849b520780e8357b1dff407e289479f1f6d7f8 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -405,6 +405,14 @@ GLOBAL(early_recursion_flag)
         .balign PAGE_SIZE; \
  GLOBAL(name)
  
+/*
+ * NEXT_PGD_PAGE(name) - start a new top-level page table named 'name'.
+ *
+ * With CONFIG_KAISER the table is aligned to 2 * PAGE_SIZE, because
+ * native_get_shadow_pgd() finds the shadow pgd by setting the PAGE_SIZE bit
+ * of the pgd address.  Without CONFIG_KAISER this is plain NEXT_PAGE().
+ */
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
  /* Automate the creation of 1 to 1 mapping pmd entries */
  #define PMDS(START, PERM, COUNT)                       \
         i = 0 ;                                         \
@@ -414,7 +422,7 @@ GLOBAL(name)
         .endr
  
         __INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
         .fill   511,8,0
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
  
@@ -424,10 +432,10 @@ NEXT_PAGE(early_dynamic_pgts)
         .data
  
  #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
-       .fill   512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+       .fill   2*512,8,0
  #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
         .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c

index 1423ab1b0312269fd3130392a94a74db755cc2b5..f480b38a03c35b0e2ffa377d4d84aaa4f42f58c0 100644 (file)
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
         .flags = IRQF_NO_THREAD,
  };
  
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
         [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
  };
  
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 8e10e72bf6ee3c157e899fd483d6e5be5ffb85bc..a55b32007785dc4565247965528b6877abdccfbc 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -41,7 +41,7 @@
   * section. Since TSS's are completely CPU-local, we want them
   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
   */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
         .x86_tss = {
                 .sp0 = TOP_OF_INIT_STACK,
  #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 96d2b847e09ea504fc3ac824d347651a3bc880b9..682c162333ba5cf86db160dc2fd13a2b039cbb4d 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
  obj-$(CONFIG_X86_INTEL_MPX)    += mpx.o
  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
  obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c

new file mode 100644 (file)

index 0000000..cf1bb92
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,160 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * get_pa_from_mapping - translate a kernel virtual address to a physical address
+ * @address: the virtual address, looked up in the kernel page tables
+ *
+ * Walks the kernel page tables starting at pgd_offset_k(@address) and
+ * handles large pud and pmd mappings.  An entry that is not populated at
+ * any level (or a large pgd) triggers BUG().
+ *
+ * Return: the physical address, i.e. the frame address of the mapping
+ * combined with the offset of @address inside that mapping.
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(address);
+       BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+       pud = pud_offset(pgd, address);
+       BUG_ON(pud_none(*pud));
+
+       /* Large pud mapping: frame address plus the offset within it. */
+       if (pud_large(*pud)) {
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+       }
+
+       pmd = pmd_offset(pud, address);
+       BUG_ON(pmd_none(*pmd));
+
+       /* Large pmd mapping: frame address plus the offset within it. */
+       if (pmd_large(*pmd)) {
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+       }
+
+       pte = pte_offset_kernel(pmd, address);
+       BUG_ON(pte_none(*pte));
+
+       /* Regular page: frame address plus the offset within the page. */
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+/*
+ * _kaiser_copy - mirror a kernel virtual range into the shadow page tables
+ * @start_addr: first virtual address of the range
+ * @size:       length of the range in bytes
+ * @flags:      pte flags for the newly created shadow mappings
+ *
+ * For every page touched by [start_addr, start_addr + size) the physical
+ * address is looked up in the kernel page tables and a pte with @flags is
+ * installed at the same virtual address in the shadow page tables.  Missing
+ * pmd and pte tables are allocated on demand; the shadow pgd entry must
+ * already be populated.  An existing pte must map the same physical page.
+ *
+ * The callers in this file take shadow_table_lock around the call.
+ */
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+                                       unsigned long flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned long address;
+       unsigned long end_addr = start_addr + size;
+       unsigned long target_address;
+
+       /* PAGE_ALIGN(x - (PAGE_SIZE - 1)) rounds x down to its page boundary. */
+       for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+                       address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+               /* address is page aligned, so this is the frame's physical address. */
+               target_address = get_pa_from_mapping(address);
+
+               pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+               BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+               BUG_ON(pgd_large(*pgd));
+
+               /*
+                * NOTE(review): the results of pmd_alloc_one() and
+                * pte_alloc_one_kernel() below are not checked for NULL.
+                */
+               pud = pud_offset(pgd, address);
+               if (pud_none(*pud)) {
+                       set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+               }
+               BUG_ON(pud_large(*pud));
+
+               pmd = pmd_offset(pud, address);
+               if (pmd_none(*pmd)) {
+                       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+               }
+               BUG_ON(pmd_large(*pmd));
+
+               pte = pte_offset_kernel(pmd, address);
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       /*
+                        * Compare the frame address the existing pte maps.
+                        * pte_page() yields the struct page descriptor, so
+                        * __pa(pte_page()) would be the address of that
+                        * descriptor rather than of the mapped frame.
+                        */
+                       BUG_ON((pte_pfn(*pte) << PAGE_SHIFT) != target_address);
+               }
+       }
+}
+
+/*
+ * Populate the upper half of the shadow pgd (entries PTRS_PER_PGD / 2 and
+ * above) with a freshly allocated pud each, so that _kaiser_copy() later
+ * finds every shadow pgd entry it touches already present.
+ *
+ * NOTE(review): the result of pud_alloc_one() is not checked for NULL.
+ */
+static inline void __init _kaiser_init(void)
+{
+       pgd_t *pgd;
+       int i = 0;
+
+       /* Shadow counterpart of the kernel pgd entry for address 0. */
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+       }
+}
+
+/* Bounds of the user-mapped per-CPU data, defined by the linker script. */
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+/* Taken around every _kaiser_copy() and unmap_pud_range() call in this file. */
+spinlock_t shadow_table_lock;
+
+/*
+ * kaiser_init - build the boot-time part of the shadow mapping
+ *
+ * Initializes shadow_table_lock, populates the upper half of the shadow pgd
+ * and then maps, in this order:
+ *   - the user-mapped per-CPU area of every possible CPU (__PAGE_KERNEL),
+ *   - the entry text section (__PAGE_KERNEL_RX),
+ *   - NR_VECTORS gate descriptors at idt_descr.address (__PAGE_KERNEL_RO).
+ */
+void __init kaiser_init(void)
+{
+       int cpu;
+       spin_lock_init(&shadow_table_lock);
+
+       spin_lock(&shadow_table_lock);
+
+       _kaiser_init();
+
+       for_each_possible_cpu(cpu) {
+               // map the per cpu user variables
+               _kaiser_copy(
+                               (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+                               (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+                               __PAGE_KERNEL);
+       }
+
+       // map the entry/exit text section, which is responsible for switching between user and kernel mode
+       _kaiser_copy(
+                       (unsigned long) __entry_text_start,
+                       (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+                       __PAGE_KERNEL_RX);
+
+       // the fixed map address of the idt_table
+       _kaiser_copy(
+                       (unsigned long) idt_descr.address,
+                       sizeof(gate_desc) * NR_VECTORS,
+                       __PAGE_KERNEL_RO);
+
+       spin_unlock(&shadow_table_lock);
+}
+
+/*
+ * kaiser_add_mapping - add a virtual range to the shadow mapping
+ * @addr:  start address of the range
+ * @size:  size of the range in bytes
+ * @flags: pte flags for the new shadow mappings
+ *
+ * Wrapper around _kaiser_copy() that serializes shadow page table updates
+ * with shadow_table_lock.
+ */
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       spin_lock(&shadow_table_lock);
+       _kaiser_copy(addr, size, flags);
+       spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+/*
+ * kaiser_remove_mapping - remove a virtual range from the shadow mapping
+ * @start: start address of the range
+ * @size:  size of the range in bytes
+ *
+ * Calls unmap_pud_range() on every shadow pgd entry from the one covering
+ * @start up to and including the one covering @start + @size, holding
+ * shadow_table_lock for the duration.
+ *
+ * NOTE(review): each iteration passes the full [start, start + size) range
+ * rather than the part belonging to that pgd entry, and the entry covering
+ * the exclusive end address is visited too - confirm that unmap_pud_range()
+ * tolerates both.
+ */
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       /* The first shadow pgd entry is looked up before the lock is taken. */
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+       spin_lock(&shadow_table_lock);
+       do {
+               unmap_pud_range(pgd, start, start + size);
+       } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+       spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index e3353c97d0862d2a20bdd060fc229af2de8324bb..c17412f92d77cb4040d66f38f820a4d86b6a39f9 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                         pud_clear(pud);
  }
  
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
  {
         pud_t *pud = pud_offset(pgd, start);
  
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c

index 3feec5af4e67c096b9bd663edc4a94fb587f67bb..27d218b385383e44cd2f548fd2ae67bd34485ecf 100644 (file)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd)
  #else
  static inline pgd_t *_pgd_alloc(void)
  {
+#ifdef CONFIG_KAISER
+       // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+       // block. Therefore, we have to allocate at least 3 pages. However, the
+       // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+       // the beginning of the page of our 8kb-aligned memory block in order to
+       // correctly free it afterwars.
+
+       unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+       if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+       {
+               *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+               return (pgd_t *) pages;
+       }
+       else
+       {
+               *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+               return (pgd_t *) (pages + PAGE_SIZE);
+       }
+#else
         return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
  }
  
  static inline void _pgd_free(pgd_t *pgd)
  {
+#ifdef CONFIG_KAISER
+       /*
+        * _pgd_alloc() stored the base address of the 4-page allocation
+        * 2 * PAGE_SIZE behind the pgd it returned; read it back and free
+        * the whole block.
+        */
+  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+       free_pages(pages, get_order(4*PAGE_SIZE));
+#else
         free_page((unsigned long)pgd);
+#endif
  }
  #endif /* CONFIG_X86_PAE */
  
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index dc81e5287ebfb22af52148c9dcc7a9194d5d023e..d6ab144147760790054c6f8782ddb8438dcfb486 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -778,7 +778,16 @@
   */
  #define PERCPU_INPUT(cacheline)                                                \
         VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
-       *(.data..percpu..first)                                         \
+       \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;        \
+       *(.data..percpu..first)           \
+       . = ALIGN(cacheline);           \
+       *(.data..percpu..user_mapped)            \
+       *(.data..percpu..user_mapped..shared_aligned)        \
+       . = ALIGN(PAGE_SIZE);           \
+       *(.data..percpu..user_mapped..page_aligned)          \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;        \
+       \
         . = ALIGN(PAGE_SIZE);                                           \
         *(.data..percpu..page_aligned)                                  \
         . = ALIGN(cacheline);                                           \
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h

index 8f16299ca0683f426e9a1dfb21ab68b244655166..8ea945f63a05db9af98ebcd75ce272f9d3fd1ec9 100644 (file)
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
  
  #endif
  
+/*
+ * Section-name infix used by the *_USER_MAPPED per-CPU macros.  With
+ * CONFIG_KAISER the variables land in "..user_mapped" per-CPU sections;
+ * without it the infix is empty and the macros use the same sections as
+ * their plain counterparts.
+ */
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
  /*
   * Base implementations of per-CPU variable declarations and definitions, where
   * the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
  #define DEFINE_PER_CPU(type, name)                                     \
         DEFINE_PER_CPU_SECTION(type, name, "")
  
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
  /*
   * Declaration/definition used for per-CPU variables that must come first in
   * the set of variables.
@@ -144,6 +156,14 @@
         DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
         ____cacheline_aligned_in_smp
  
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                 \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                  \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
  #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
         DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
         ____cacheline_aligned
@@ -162,6 +182,16 @@
  #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
         DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
         __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)      \
+  DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")   \
+  __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)       \
+  DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")    \
+  __aligned(PAGE_SIZE)
  
  /*
   * Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c

index 25bac88bc66e8d9a2e1d96b6891f061b552f0609..2c009f77e6553dd3548b4b2ec33125f32f96f7ec 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -86,6 +86,9 @@
  #include <asm/setup.h>
  #include <asm/sections.h>
  #include <asm/cacheflush.h>
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#endif
  
  static int kernel_init(void *);
  
@@ -473,6 +476,9 @@ static void __init mm_init(void)
         pgtable_init();
         vmalloc_init();
         ioremap_huge_init();
+#ifdef CONFIG_KAISER
+       kaiser_init();
+#endif
  }
  
  asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c

index 9321b1ad3335ade560477040df46b745c78a028d..4014be1dd2b6b25f7fa6e4ce4d5b811652b93c0e 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
  #endif
  }
  
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
  static inline void free_thread_stack(struct task_struct *tsk)
  {
+#ifdef CONFIG_KAISER
+       kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+#endif
  #ifdef CONFIG_VMAP_STACK
         if (task_stack_vm_area(tsk)) {
                 unsigned long flags;
@@ -468,6 +472,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
  }
  
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  {
         struct task_struct *tsk;
@@ -495,6 +500,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
          * functions again.
          */
         tsk->stack = stack;
+#ifdef CONFIG_KAISER
+       kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
  #ifdef CONFIG_VMAP_STACK
         tsk->stack_vm_area = stack_vm_area;
  #endif
diff --git a/security/Kconfig b/security/Kconfig

index 118f4549404ef2ed0241e86faceb03f2d3646d79..f515ac302257d2bfe9242e41f08edafdbf87d8c9 100644 (file)
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,6 +30,13 @@ config SECURITY
           model will be used.
  
           If you are unsure how to answer this question, answer N.
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       depends on X86_64
+       depends on !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation in order to close
+         hardware side channels on kernel address information.
  
  config SECURITYFS
         bool "Enable the securityfs filesystem"
author	Richard Fellner <richard.fellner@student.tugraz.at>
	Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
arch/x86/entry/entry_64.S		patch \| blob \| blame \| history
arch/x86/entry/entry_64_compat.S		patch \| blob \| blame \| history
arch/x86/include/asm/hw_irq.h		patch \| blob \| blame \| history
arch/x86/include/asm/kaiser.h	[new file with mode: 0644]	patch \| blob
arch/x86/include/asm/pgtable.h		patch \| blob \| blame \| history
arch/x86/include/asm/pgtable_64.h		patch \| blob \| blame \| history
arch/x86/include/asm/pgtable_types.h		patch \| blob \| blame \| history
arch/x86/include/asm/processor.h		patch \| blob \| blame \| history
arch/x86/kernel/cpu/common.c		patch \| blob \| blame \| history
arch/x86/kernel/espfix_64.c		patch \| blob \| blame \| history
arch/x86/kernel/head_64.S		patch \| blob \| blame \| history
arch/x86/kernel/irqinit.c		patch \| blob \| blame \| history
arch/x86/kernel/process.c		patch \| blob \| blame \| history
arch/x86/mm/Makefile		patch \| blob \| blame \| history
arch/x86/mm/kaiser.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/pageattr.c		patch \| blob \| blame \| history
arch/x86/mm/pgtable.c		patch \| blob \| blame \| history
include/asm-generic/vmlinux.lds.h		patch \| blob \| blame \| history
include/linux/percpu-defs.h		patch \| blob \| blame \| history
init/main.c		patch \| blob \| blame \| history
kernel/fork.c		patch \| blob \| blame \| history
security/Kconfig		patch \| blob \| blame \| history