From: Greg Kroah-Hartman Date: Thu, 13 Dec 2018 19:13:12 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v4.19.10~22 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=28bdf407baf6d8e960837d4b7af85b73c8c219cb;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: alsa-pcm-remove-sndrv_pcm_ioctl1_info-internal-command.patch bpf-prevent-memory-disambiguation-attack.patch bpf-support-8-byte-metafield-access.patch bpf-verifier-add-spi-variable-to-check_stack_write.patch bpf-verifier-pass-instruction-index-to-check_mem_access-and-check_xadd.patch hugetlbfs-check-for-pgoff-value-overflow.patch hugetlbfs-fix-bug-in-pgoff-overflow-checking.patch hugetlbfs-fix-offset-overflow-in-hugetlbfs-mmap.patch kvm-nvmx-eliminate-vmcs02-pool.patch kvm-nvmx-fix-msr-bitmaps-to-prevent-l2-from-accessing-l0-x2apic.patch kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch kvm-svm-implement-virt_spec_ctrl-support-for-ssbd.patch kvm-svm-move-spec-control-call-after-restore-of-gs.patch kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch kvm-vmx-emulate-msr_ia32_arch_capabilities.patch kvm-vmx-introduce-alloc_loaded_vmcs.patch kvm-vmx-make-msr-bitmaps-per-vcpu.patch kvm-x86-add-ibpb-support.patch kvm-x86-remove-indirect-msr-op-calls-from-spec_ctrl.patch mm-hugetlb.c-don-t-call-region_abort-if-region_chg-fails.patch posix-timers-sanitize-overrun-handling.patch sr-pass-down-correctly-sized-scsi-sense-buffer.patch swiotlb-clean-up-reporting.patch wil6210-missing-length-check-in-wmi_set_ie.patch x86-bugs-kvm-extend-speculation-control-for-virt_spec_ctrl.patch x86-bugs-kvm-support-the-combination-of-guest-and-host-ibrs.patch x86-fix-smap-in-32-bit-environments.patch x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch x86-kvm-vmx-expose-spec_ctrl-bit-2-to-the-guest.patch x86-reorganize-smap-handling-in-user-space-accesses.patch x86-speculation-use-synthetic-bits-for-ibrs-ibpb-stibp.patch x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch --- diff --git a/queue-4.4/alsa-pcm-remove-sndrv_pcm_ioctl1_info-internal-command.patch b/queue-4.4/alsa-pcm-remove-sndrv_pcm_ioctl1_info-internal-command.patch new file mode 100644 index 00000000000..98ec044af8e --- /dev/null +++ b/queue-4.4/alsa-pcm-remove-sndrv_pcm_ioctl1_info-internal-command.patch @@ -0,0 +1,71 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Takashi Sakamoto +Date: Wed, 14 Jun 2017 19:30:03 +0900 +Subject: ALSA: pcm: remove SNDRV_PCM_IOCTL1_INFO internal command + +From: Takashi Sakamoto + +commit e11f0f90a626f93899687b1cc909ee37dd6c5809 upstream. + +Drivers can implement 'struct snd_pcm_ops.ioctl' to handle some requests +from ALSA PCM core. These requests are internal purpose in kernel land. +Usually common set of operations are used for it. + +SNDRV_PCM_IOCTL1_INFO is one of the requests. According to code comment, +it has been obsoleted in the old days. + +We can see old releases in ftp.alsa-project.org. The command was firstly +introduced in v0.5.0 release as SND_PCM_IOCTL1_INFO, to allow drivers to +fill data of 'struct snd_pcm_channel_info' type. In v0.9.0 release, +this was obsoleted by the other commands for ioctl(2) such as +SNDRV_PCM_IOCTL_CHANNEL_INFO. + +This commit removes the long-abandoned command, bye. 
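For reference, drivers reach these internal requests through the .ioctl member of struct snd_pcm_ops, and most simply delegate to the generic handler that this patch touches. A minimal sketch, assuming a hypothetical driver (the foo_* names are illustrative only):

	static struct snd_pcm_ops foo_pcm_ops = {
		.open      = foo_pcm_open,
		.close     = foo_pcm_close,
		.ioctl     = snd_pcm_lib_ioctl,	/* generic handler patched below */
		.hw_params = foo_pcm_hw_params,
		.hw_free   = foo_pcm_hw_free,
		.prepare   = foo_pcm_prepare,
		.trigger   = foo_pcm_trigger,
		.pointer   = foo_pcm_pointer,
	};

With that wiring, dropping the SNDRV_PCM_IOCTL1_INFO case from snd_pcm_lib_ioctl() is enough to retire the command for such drivers.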
+ +Signed-off-by: Takashi Sakamoto +Signed-off-by: Takashi Iwai +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/sound/pcm.h | 2 +- + sound/core/pcm_lib.c | 2 -- + sound/core/pcm_native.c | 6 +----- + 3 files changed, 2 insertions(+), 8 deletions(-) + +--- a/include/sound/pcm.h ++++ b/include/sound/pcm.h +@@ -100,7 +100,7 @@ struct snd_pcm_ops { + #endif + + #define SNDRV_PCM_IOCTL1_RESET 0 +-#define SNDRV_PCM_IOCTL1_INFO 1 ++/* 1 is absent slot. */ + #define SNDRV_PCM_IOCTL1_CHANNEL_INFO 2 + #define SNDRV_PCM_IOCTL1_GSTATE 3 + #define SNDRV_PCM_IOCTL1_FIFO_SIZE 4 +--- a/sound/core/pcm_lib.c ++++ b/sound/core/pcm_lib.c +@@ -1849,8 +1849,6 @@ int snd_pcm_lib_ioctl(struct snd_pcm_sub + unsigned int cmd, void *arg) + { + switch (cmd) { +- case SNDRV_PCM_IOCTL1_INFO: +- return 0; + case SNDRV_PCM_IOCTL1_RESET: + return snd_pcm_lib_ioctl_reset(substream, arg); + case SNDRV_PCM_IOCTL1_CHANNEL_INFO: +--- a/sound/core/pcm_native.c ++++ b/sound/core/pcm_native.c +@@ -214,11 +214,7 @@ int snd_pcm_info(struct snd_pcm_substrea + info->subdevices_avail = pstr->substream_count - pstr->substream_opened; + strlcpy(info->subname, substream->name, sizeof(info->subname)); + runtime = substream->runtime; +- /* AB: FIXME!!! This is definitely nonsense */ +- if (runtime) { +- info->sync = runtime->sync; +- substream->ops->ioctl(substream, SNDRV_PCM_IOCTL1_INFO, info); +- } ++ + return 0; + } + diff --git a/queue-4.4/bpf-prevent-memory-disambiguation-attack.patch b/queue-4.4/bpf-prevent-memory-disambiguation-attack.patch new file mode 100644 index 00000000000..4207b96b423 --- /dev/null +++ b/queue-4.4/bpf-prevent-memory-disambiguation-attack.patch @@ -0,0 +1,145 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Alexei Starovoitov +Date: Tue, 15 May 2018 09:27:05 -0700 +Subject: bpf: Prevent memory disambiguation attack + +From: Alexei Starovoitov + +commit af86ca4e3088fe5eacf2f7e58c01fa68ca067672 upstream. + +Detect code patterns where malicious 'speculative store bypass' can be used +and sanitize such patterns. 
+ + 39: (bf) r3 = r10 + 40: (07) r3 += -216 + 41: (79) r8 = *(u64 *)(r7 +0) // slow read + 42: (7a) *(u64 *)(r10 -72) = 0 // verifier inserts this instruction + 43: (7b) *(u64 *)(r8 +0) = r3 // this store becomes slow due to r8 + 44: (79) r1 = *(u64 *)(r6 +0) // cpu speculatively executes this load + 45: (71) r2 = *(u8 *)(r1 +0) // speculatively arbitrary 'load byte' + // is now sanitized + +Above code after x86 JIT becomes: + e5: mov %rbp,%rdx + e8: add $0xffffffffffffff28,%rdx + ef: mov 0x0(%r13),%r14 + f3: movq $0x0,-0x48(%rbp) + fb: mov %rdx,0x0(%r14) + ff: mov 0x0(%rbx),%rdi +103: movzbq 0x0(%rdi),%rsi + +Signed-off-by: Alexei Starovoitov +Signed-off-by: Thomas Gleixner +[bwh: Backported to 4.4: + - Add verifier_env parameter to check_stack_write() + - Look up stack slot_types with state->stack_slot_type[] rather than + state->stack[].slot_type[] + - Drop bpf_verifier_env argument to verbose() + - Adjust filename, context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 59 insertions(+), 4 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -191,6 +191,7 @@ struct bpf_insn_aux_data { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; ++ int sanitize_stack_off; /* stack slot to be cleared */ + bool seen; /* this insn was processed by the verifier */ + }; + +@@ -569,8 +570,9 @@ static bool is_spillable_regtype(enum bp + /* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +-static int check_stack_write(struct verifier_state *state, int off, int size, +- int value_regno) ++static int check_stack_write(struct verifier_env *env, ++ struct verifier_state *state, int off, ++ int size, int value_regno, int insn_idx) + { + int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, +@@ -589,8 +591,32 @@ static int check_stack_write(struct veri + /* save register state */ + state->spilled_regs[spi] = state->regs[value_regno]; + +- for (i = 0; i < BPF_REG_SIZE; i++) ++ for (i = 0; i < BPF_REG_SIZE; i++) { ++ if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC && ++ !env->allow_ptr_leaks) { ++ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; ++ int soff = (-spi - 1) * BPF_REG_SIZE; ++ ++ /* detected reuse of integer stack slot with a pointer ++ * which means either llvm is reusing stack slot or ++ * an attacker is trying to exploit CVE-2018-3639 ++ * (speculative store bypass) ++ * Have to sanitize that slot with preemptive ++ * store of zero. 
++ */ ++ if (*poff && *poff != soff) { ++ /* disallow programs where single insn stores ++ * into two different stack slots, since verifier ++ * cannot sanitize them ++ */ ++ verbose("insn %d cannot access two stack slots fp%d and fp%d", ++ insn_idx, *poff, soff); ++ return -EINVAL; ++ } ++ *poff = soff; ++ } + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; ++ } + } else { + /* regular write of data into stack */ + state->spilled_regs[spi] = (struct reg_state) {}; +@@ -746,7 +772,8 @@ static int check_mem_access(struct verif + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } +- err = check_stack_write(state, off, size, value_regno); ++ err = check_stack_write(env, state, off, size, ++ value_regno, insn_idx); + } else { + err = check_stack_read(state, off, size, value_regno); + } +@@ -2228,6 +2255,34 @@ static int convert_ctx_accesses(struct v + else + continue; + ++ if (type == BPF_WRITE && ++ env->insn_aux_data[i + delta].sanitize_stack_off) { ++ struct bpf_insn patch[] = { ++ /* Sanitize suspicious stack slot with zero. ++ * There are no memory dependencies for this store, ++ * since it's only using frame pointer and immediate ++ * constant of zero ++ */ ++ BPF_ST_MEM(BPF_DW, BPF_REG_FP, ++ env->insn_aux_data[i + delta].sanitize_stack_off, ++ 0), ++ /* the original STX instruction will immediately ++ * overwrite the same stack slot with appropriate value ++ */ ++ *insn, ++ }; ++ ++ cnt = ARRAY_SIZE(patch); ++ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); ++ if (!new_prog) ++ return -ENOMEM; ++ ++ delta += cnt - 1; ++ env->prog = new_prog; ++ insn = new_prog->insnsi + i + delta; ++ continue; ++ } ++ + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + continue; + diff --git a/queue-4.4/bpf-support-8-byte-metafield-access.patch b/queue-4.4/bpf-support-8-byte-metafield-access.patch new file mode 100644 index 00000000000..6ac46735b76 --- /dev/null +++ b/queue-4.4/bpf-support-8-byte-metafield-access.patch @@ -0,0 +1,51 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Alexei Starovoitov +Date: Thu, 1 Sep 2016 18:37:21 -0700 +Subject: bpf: support 8-byte metafield access + +From: Alexei Starovoitov + +commit cedaf52693f02372010548c63b2e63228b959099 upstream. + +The verifier supported only 4-byte metafields in +struct __sk_buff and struct xdp_md. The metafields in upcoming +struct bpf_perf_event are 8-byte to match register width in struct pt_regs. +Teach verifier to recognize 8-byte metafield access. +The patch doesn't affect safety of sockets and xdp programs. +They check for 4-byte only ctx access before these conditions are hit. + +Signed-off-by: Alexei Starovoitov +Acked-by: Daniel Borkmann +Signed-off-by: David S. 
Miller +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1844,7 +1844,8 @@ static int do_check(struct verifier_env + if (err) + return err; + +- if (BPF_SIZE(insn->code) != BPF_W) { ++ if (BPF_SIZE(insn->code) != BPF_W && ++ BPF_SIZE(insn->code) != BPF_DW) { + insn_idx++; + continue; + } +@@ -2220,9 +2221,11 @@ static int convert_ctx_accesses(struct v + for (i = 0; i < insn_cnt; i++, insn++) { + u32 cnt; + +- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) ++ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || ++ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + type = BPF_READ; +- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) ++ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + type = BPF_WRITE; + else + continue; diff --git a/queue-4.4/bpf-verifier-add-spi-variable-to-check_stack_write.patch b/queue-4.4/bpf-verifier-add-spi-variable-to-check_stack_write.patch new file mode 100644 index 00000000000..6c665079e4d --- /dev/null +++ b/queue-4.4/bpf-verifier-add-spi-variable-to-check_stack_write.patch @@ -0,0 +1,47 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Ben Hutchings +Date: Wed, 5 Dec 2018 22:45:15 +0000 +Subject: bpf/verifier: Add spi variable to check_stack_write() + +From: Ben Hutchings + +Extracted from commit dc503a8ad984 "bpf/verifier: track liveness for +pruning". + +Cc: Daniel Borkmann +Cc: Alexei Starovoitov +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -572,7 +572,7 @@ static bool is_spillable_regtype(enum bp + static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) + { +- int i; ++ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ +@@ -587,15 +587,13 @@ static int check_stack_write(struct veri + } + + /* save register state */ +- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = +- state->regs[value_regno]; ++ state->spilled_regs[spi] = state->regs[value_regno]; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { + /* regular write of data into stack */ +- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = +- (struct reg_state) {}; ++ state->spilled_regs[spi] = (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; diff --git a/queue-4.4/bpf-verifier-pass-instruction-index-to-check_mem_access-and-check_xadd.patch b/queue-4.4/bpf-verifier-pass-instruction-index-to-check_mem_access-and-check_xadd.patch new file mode 100644 index 00000000000..3452b76f876 --- /dev/null +++ b/queue-4.4/bpf-verifier-pass-instruction-index-to-check_mem_access-and-check_xadd.patch @@ -0,0 +1,90 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Ben Hutchings +Date: Wed, 5 Dec 2018 22:41:36 +0000 +Subject: bpf/verifier: Pass instruction index to check_mem_access() and check_xadd() + +From: Ben Hutchings + +Extracted from commit 31fd85816dbe "bpf: permits narrower load from +bpf program context fields". 
+ +Cc: Daniel Borkmann +Cc: Alexei Starovoitov +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -694,7 +694,7 @@ static bool is_ctx_reg(struct verifier_e + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +-static int check_mem_access(struct verifier_env *env, u32 regno, int off, ++static int check_mem_access(struct verifier_env *env, int insn_idx, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) + { +@@ -758,7 +758,7 @@ static int check_mem_access(struct verif + return err; + } + +-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) ++static int check_xadd(struct verifier_env *env, int insn_idx, struct bpf_insn *insn) + { + struct reg_state *regs = env->cur_state.regs; + int err; +@@ -791,13 +791,13 @@ static int check_xadd(struct verifier_en + } + + /* check whether atomic_add can read the memory */ +- err = check_mem_access(env, insn->dst_reg, insn->off, ++ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ +- return check_mem_access(env, insn->dst_reg, insn->off, ++ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); + } + +@@ -1836,7 +1836,7 @@ static int do_check(struct verifier_env + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ +- err = check_mem_access(env, insn->src_reg, insn->off, ++ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) +@@ -1875,7 +1875,7 @@ static int do_check(struct verifier_env + enum bpf_reg_type *prev_dst_type, dst_reg_type; + + if (BPF_MODE(insn->code) == BPF_XADD) { +- err = check_xadd(env, insn); ++ err = check_xadd(env, insn_idx, insn); + if (err) + return err; + insn_idx++; +@@ -1894,7 +1894,7 @@ static int do_check(struct verifier_env + dst_reg_type = regs[insn->dst_reg].type; + + /* check that memory (dst_reg + off) is writeable */ +- err = check_mem_access(env, insn->dst_reg, insn->off, ++ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) +@@ -1929,7 +1929,7 @@ static int do_check(struct verifier_env + } + + /* check that memory (dst_reg + off) is writeable */ +- err = check_mem_access(env, insn->dst_reg, insn->off, ++ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) diff --git a/queue-4.4/hugetlbfs-check-for-pgoff-value-overflow.patch b/queue-4.4/hugetlbfs-check-for-pgoff-value-overflow.patch new file mode 100644 index 00000000000..9770230a4d7 --- /dev/null +++ b/queue-4.4/hugetlbfs-check-for-pgoff-value-overflow.patch @@ -0,0 +1,114 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Mike Kravetz +Date: Thu, 22 Mar 2018 16:17:13 -0700 +Subject: hugetlbfs: check for pgoff value overflow + +From: Mike Kravetz + +commit 63489f8e821144000e0bdca7e65a8d1cc23a7ee7 upstream. + +A vma with vm_pgoff large enough to overflow a loff_t type when +converted to a byte offset can be passed via the remap_file_pages system +call. 
The hugetlbfs mmap routine uses the byte offset to calculate +reservations and file size. + +A sequence such as: + + mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0); + remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0); + +will result in the following when task exits/file closed, + + kernel BUG at mm/hugetlb.c:749! + Call Trace: + hugetlbfs_evict_inode+0x2f/0x40 + evict+0xcb/0x190 + __dentry_kill+0xcb/0x150 + __fput+0x164/0x1e0 + task_work_run+0x84/0xa0 + exit_to_usermode_loop+0x7d/0x80 + do_syscall_64+0x18b/0x190 + entry_SYSCALL_64_after_hwframe+0x3d/0xa2 + +The overflowed pgoff value causes hugetlbfs to try to set up a mapping +with a negative range (end < start) that leaves invalid state which +causes the BUG. + +The previous overflow fix to this code was incomplete and did not take +the remap_file_pages system call into account. + +[mike.kravetz@oracle.com: v3] + Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com +[akpm@linux-foundation.org: include mmdebug.h] +[akpm@linux-foundation.org: fix -ve left shift count on sh] +Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com +Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap") +Signed-off-by: Mike Kravetz +Reported-by: Nic Losby +Acked-by: Michal Hocko +Cc: "Kirill A . Shutemov" +Cc: Yisheng Xie +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[bwh: Backported to 4.4: Use a conditional WARN() instead of VM_WARN()] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/hugetlbfs/inode.c | 17 ++++++++++++++--- + mm/hugetlb.c | 8 ++++++++ + 2 files changed, 22 insertions(+), 3 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -118,6 +118,16 @@ static void huge_pagevec_release(struct + pagevec_reinit(pvec); + } + ++/* ++ * Mask used when checking the page offset value passed in via system ++ * calls. This value will be converted to a loff_t which is signed. ++ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the ++ * value. The extra bit (- 1 in the shift value) is to take the sign ++ * bit into account. ++ */ ++#define PGOFF_LOFFT_MAX \ ++ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) ++ + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) + { + struct inode *inode = file_inode(file); +@@ -137,12 +147,13 @@ static int hugetlbfs_file_mmap(struct fi + vma->vm_ops = &hugetlb_vm_ops; + + /* +- * Offset passed to mmap (before page shift) could have been +- * negative when represented as a (l)off_t. ++ * page based offset in vm_pgoff could be sufficiently large to ++ * overflow a (l)off_t when converted to byte offset. + */ +- if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) ++ if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + return -EINVAL; + ++ /* must be huge page aligned */ + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + return -EINVAL; + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -4053,6 +4053,14 @@ int hugetlb_reserve_pages(struct inode * + struct resv_map *resv_map; + long gbl_reserve; + ++ /* This should never happen */ ++ if (from > to) { ++#ifdef CONFIG_DEBUG_VM ++ WARN(1, "%s called with a negative range\n", __func__); ++#endif ++ return -EINVAL; ++ } ++ + /* + * Only apply hugepage reservation if asked. 
At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page diff --git a/queue-4.4/hugetlbfs-fix-bug-in-pgoff-overflow-checking.patch b/queue-4.4/hugetlbfs-fix-bug-in-pgoff-overflow-checking.patch new file mode 100644 index 00000000000..8e5987b4a0d --- /dev/null +++ b/queue-4.4/hugetlbfs-fix-bug-in-pgoff-overflow-checking.patch @@ -0,0 +1,56 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Mike Kravetz +Date: Thu, 5 Apr 2018 16:18:21 -0700 +Subject: hugetlbfs: fix bug in pgoff overflow checking + +From: Mike Kravetz + +commit 5df63c2a149ae65a9ec239e7c2af44efa6f79beb upstream. + +This is a fix for a regression in 32 bit kernels caused by an invalid +check for pgoff overflow in hugetlbfs mmap setup. The check incorrectly +specified that the size of a loff_t was the same as the size of a long. +The regression prevents mapping hugetlbfs files at offsets greater than +4GB on 32 bit kernels. + +On 32 bit kernels conversion from a page based unsigned long can not +overflow a loff_t byte offset. Therefore, skip this check if +sizeof(unsigned long) != sizeof(loff_t). + +Link: http://lkml.kernel.org/r/20180330145402.5053-1-mike.kravetz@oracle.com +Fixes: 63489f8e8211 ("hugetlbfs: check for pgoff value overflow") +Reported-by: Dan Rue +Signed-off-by: Mike Kravetz +Tested-by: Anders Roxell +Cc: Michal Hocko +Cc: Yisheng Xie +Cc: "Kirill A . Shutemov" +Cc: Nic Losby +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/hugetlbfs/inode.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -148,10 +148,14 @@ static int hugetlbfs_file_mmap(struct fi + + /* + * page based offset in vm_pgoff could be sufficiently large to +- * overflow a (l)off_t when converted to byte offset. ++ * overflow a loff_t when converted to byte offset. This can ++ * only happen on architectures where sizeof(loff_t) == ++ * sizeof(unsigned long). So, only check in those instances. + */ +- if (vma->vm_pgoff & PGOFF_LOFFT_MAX) +- return -EINVAL; ++ if (sizeof(unsigned long) == sizeof(loff_t)) { ++ if (vma->vm_pgoff & PGOFF_LOFFT_MAX) ++ return -EINVAL; ++ } + + /* must be huge page aligned */ + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/queue-4.4/hugetlbfs-fix-offset-overflow-in-hugetlbfs-mmap.patch b/queue-4.4/hugetlbfs-fix-offset-overflow-in-hugetlbfs-mmap.patch new file mode 100644 index 00000000000..135475a797b --- /dev/null +++ b/queue-4.4/hugetlbfs-fix-offset-overflow-in-hugetlbfs-mmap.patch @@ -0,0 +1,100 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Mike Kravetz +Date: Thu, 13 Apr 2017 14:56:32 -0700 +Subject: hugetlbfs: fix offset overflow in hugetlbfs mmap + +From: Mike Kravetz + +commit 045c7a3f53d9403b62d396b6d051c4be5044cdb4 upstream. + +If mmap() maps a file, it can be passed an offset into the file at which +the mapping is to start. Offset could be a negative value when +represented as a loff_t. The offset plus length will be used to update +the file size (i_size) which is also a loff_t. + +Validate the value of offset and offset + length to make sure they do +not overflow and appear as negative. + +Found by syzcaller with commit ff8c0c53c475 ("mm/hugetlb.c: don't call +region_abort if region_chg fails") applied. Prior to this commit, the +overflow would still occur but we would luckily return ENOMEM. 
+ +To reproduce: + + mmap(0, 0x2000, 0, 0x40021, 0xffffffffffffffffULL, 0x8000000000000000ULL); + +Resulted in, + + kernel BUG at mm/hugetlb.c:742! + Call Trace: + hugetlbfs_evict_inode+0x80/0xa0 + evict+0x24a/0x620 + iput+0x48f/0x8c0 + dentry_unlink_inode+0x31f/0x4d0 + __dentry_kill+0x292/0x5e0 + dput+0x730/0x830 + __fput+0x438/0x720 + ____fput+0x1a/0x20 + task_work_run+0xfe/0x180 + exit_to_usermode_loop+0x133/0x150 + syscall_return_slowpath+0x184/0x1c0 + entry_SYSCALL_64_fastpath+0xab/0xad + +Fixes: ff8c0c53c475 ("mm/hugetlb.c: don't call region_abort if region_chg fails") +Link: http://lkml.kernel.org/r/1491951118-30678-1-git-send-email-mike.kravetz@oracle.com +Reported-by: Vegard Nossum +Signed-off-by: Mike Kravetz +Acked-by: Hillf Danton +Cc: Dmitry Vyukov +Cc: Michal Hocko +Cc: "Kirill A . Shutemov" +Cc: Andrey Ryabinin +Cc: Naoya Horiguchi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/hugetlbfs/inode.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -136,17 +136,26 @@ static int hugetlbfs_file_mmap(struct fi + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma->vm_ops = &hugetlb_vm_ops; + ++ /* ++ * Offset passed to mmap (before page shift) could have been ++ * negative when represented as a (l)off_t. ++ */ ++ if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) ++ return -EINVAL; ++ + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + return -EINVAL; + + vma_len = (loff_t)(vma->vm_end - vma->vm_start); ++ len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); ++ /* check for overflow */ ++ if (len < vma_len) ++ return -EINVAL; + + mutex_lock(&inode->i_mutex); + file_accessed(file); + + ret = -ENOMEM; +- len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); +- + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, +@@ -155,7 +164,7 @@ static int hugetlbfs_file_mmap(struct fi + + ret = 0; + if (vma->vm_flags & VM_WRITE && inode->i_size < len) +- inode->i_size = len; ++ i_size_write(inode, len); + out: + mutex_unlock(&inode->i_mutex); + diff --git a/queue-4.4/kvm-nvmx-eliminate-vmcs02-pool.patch b/queue-4.4/kvm-nvmx-eliminate-vmcs02-pool.patch new file mode 100644 index 00000000000..81732c3c10b --- /dev/null +++ b/queue-4.4/kvm-nvmx-eliminate-vmcs02-pool.patch @@ -0,0 +1,293 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Jim Mattson +Date: Mon, 27 Nov 2017 17:22:25 -0600 +Subject: KVM: nVMX: Eliminate vmcs02 pool + +From: Jim Mattson + +commit de3a0021a60635de96aa92713c1a31a96747d72c upstream. + +The potential performance advantages of a vmcs02 pool have never been +realized. To simplify the code, eliminate the pool. Instead, a single +vmcs02 is allocated per VCPU when the VCPU enters VMX operation. 
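The resulting lifetime is simple to state: the vmcs02 now lives exactly as long as the vCPU stays in VMX operation. Roughly, using the field and function names from the hunks below:

	/* on VMXON (handle_vmon) */
	vmx->nested.vmcs02.vmcs = alloc_vmcs();
	if (!vmx->nested.vmcs02.vmcs)
		return -ENOMEM;		/* the real code jumps to an error label */
	loaded_vmcs_init(&vmx->nested.vmcs02);

	/* on VMXOFF / vCPU teardown (free_nested) */
	free_loaded_vmcs(&vmx->nested.vmcs02);

and nested_vmx_run() simply switches vmx->loaded_vmcs to &vmx->nested.vmcs02 instead of fetching an entry from the pool.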
+ +Signed-off-by: Jim Mattson +Signed-off-by: Mark Kanda +Reviewed-by: Ameya More +Reviewed-by: David Hildenbrand +Reviewed-by: Paolo Bonzini +Signed-off-by: Radim Krčmář +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: + - No loaded_vmcs::shadow_vmcs field to initialise + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 144 ++++++++--------------------------------------------- + 1 file changed, 22 insertions(+), 122 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -172,7 +172,6 @@ module_param(ple_window_max, int, S_IRUG + extern const ulong vmx_return; + + #define NR_AUTOLOAD_MSRS 8 +-#define VMCS02_POOL_SIZE 1 + + struct vmcs { + u32 revision_id; +@@ -205,7 +204,7 @@ struct shared_msr_entry { + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. +- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the ++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). +@@ -384,13 +383,6 @@ struct __packed vmcs12 { + */ + #define VMCS12_SIZE 0x1000 + +-/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +-struct vmcs02_list { +- struct list_head list; +- gpa_t vmptr; +- struct loaded_vmcs vmcs02; +-}; +- + /* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. +@@ -412,16 +404,16 @@ struct nested_vmx { + */ + bool sync_shadow_vmcs; + +- /* vmcs02_list cache of VMCSs recently used to run L2 guests */ +- struct list_head vmcs02_pool; +- int vmcs02_num; + u64 vmcs01_tsc_offset; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; ++ ++ struct loaded_vmcs vmcs02; ++ + /* +- * Guest pages referred to in vmcs02 with host-physical pointers, so +- * we must keep them pinned while L2 runs. ++ * Guest pages referred to in the vmcs02 with host-physical ++ * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; +@@ -6435,93 +6427,6 @@ static int handle_monitor(struct kvm_vcp + } + + /* +- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. +- * We could reuse a single VMCS for all the L2 guests, but we also want the +- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this +- * allows keeping them loaded on the processor, and in the future will allow +- * optimizations where prepare_vmcs02 doesn't need to set all the fields on +- * every entry if they never change. +- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE +- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. +- * +- * The following functions allocate and free a vmcs02 in this pool. +- */ +- +-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. 
*/ +-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmx->nested.current_vmptr) { +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { +- /* Recycle the least recently used VMCS. */ +- item = list_entry(vmx->nested.vmcs02_pool.prev, +- struct vmcs02_list, list); +- item->vmptr = vmx->nested.current_vmptr; +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- /* Create a new VMCS */ +- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); +- if (!item) +- return NULL; +- item->vmcs02.vmcs = alloc_vmcs(); +- if (!item->vmcs02.vmcs) { +- kfree(item); +- return NULL; +- } +- loaded_vmcs_init(&item->vmcs02); +- item->vmptr = vmx->nested.current_vmptr; +- list_add(&(item->list), &(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num++; +- return &item->vmcs02; +-} +- +-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ +-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmptr) { +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- return; +- } +-} +- +-/* +- * Free all VMCSs saved for this vcpu, except the one pointed by +- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs +- * must be &vmx->vmcs01. +- */ +-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item, *n; +- +- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { +- /* +- * Something will leak if the above WARN triggers. Better than +- * a use-after-free. +- */ +- if (vmx->loaded_vmcs == &item->vmcs02) +- continue; +- +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- } +-} +- +-/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". 
+@@ -6833,6 +6738,11 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + } + ++ vmx->nested.vmcs02.vmcs = alloc_vmcs(); ++ if (!vmx->nested.vmcs02.vmcs) ++ goto out_vmcs02; ++ loaded_vmcs_init(&vmx->nested.vmcs02); ++ + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); +@@ -6851,9 +6761,6 @@ static int handle_vmon(struct kvm_vcpu * + vmx->nested.current_shadow_vmcs = shadow_vmcs; + } + +- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num = 0; +- + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; +@@ -6870,6 +6777,9 @@ out_shadow_vmcs: + free_page((unsigned long)vmx->nested.msr_bitmap); + + out_msr_bitmap: ++ free_loaded_vmcs(&vmx->nested.vmcs02); ++ ++out_vmcs02: + return -ENOMEM; + } + +@@ -6946,7 +6856,7 @@ static void free_nested(struct vcpu_vmx + } + if (enable_shadow_vmcs) + free_vmcs(vmx->nested.current_shadow_vmcs); +- /* Unpin physical memory we referred to in current vmcs02 */ ++ /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; +@@ -6962,7 +6872,7 @@ static void free_nested(struct vcpu_vmx + vmx->nested.pi_desc = NULL; + } + +- nested_free_all_saved_vmcss(vmx); ++ free_loaded_vmcs(&vmx->nested.vmcs02); + } + + /* Emulate the VMXOFF instruction */ +@@ -6996,8 +6906,6 @@ static int handle_vmclear(struct kvm_vcp + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); + +- nested_free_vmcs02(vmx, vmptr); +- + skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); + return 1; +@@ -7784,10 +7692,11 @@ static bool nested_vmx_exit_handled(stru + + /* + * The host physical addresses of some pages of guest memory +- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU +- * may write to these pages via their host physical address while +- * L2 is running, bypassing any address-translation-based dirty +- * tracking (e.g. EPT write protection). ++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC ++ * Page). The CPU may write to these pages via their host ++ * physical address while L2 is running, bypassing any ++ * address-translation-based dirty tracking (e.g. EPT write ++ * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. +@@ -9889,7 +9798,6 @@ static int nested_vmx_run(struct kvm_vcp + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; +- struct loaded_vmcs *vmcs02; + bool ia32e; + u32 msr_entry_idx; + +@@ -10029,10 +9937,6 @@ static int nested_vmx_run(struct kvm_vcp + * the nested entry. 
+ */ + +- vmcs02 = nested_get_current_vmcs02(vmx); +- if (!vmcs02) +- return -ENOMEM; +- + enter_guest_mode(vcpu); + + vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); +@@ -10041,7 +9945,7 @@ static int nested_vmx_run(struct kvm_vcp + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + + cpu = get_cpu(); +- vmx->loaded_vmcs = vmcs02; ++ vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; +@@ -10553,10 +10457,6 @@ static void nested_vmx_vmexit(struct kvm + vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vmx_segment_cache_clear(vmx); + +- /* if no vmcs02 cache requested, remove the one we used */ +- if (VMCS02_POOL_SIZE == 0) +- nested_free_vmcs02(vmx, vmx->nested.current_vmptr); +- + load_vmcs12_host_state(vcpu, vmcs12); + + /* Update TSC_OFFSET if TSC was changed while L2 ran */ diff --git a/queue-4.4/kvm-nvmx-fix-msr-bitmaps-to-prevent-l2-from-accessing-l0-x2apic.patch b/queue-4.4/kvm-nvmx-fix-msr-bitmaps-to-prevent-l2-from-accessing-l0-x2apic.patch new file mode 100644 index 00000000000..9b49ea68352 --- /dev/null +++ b/queue-4.4/kvm-nvmx-fix-msr-bitmaps-to-prevent-l2-from-accessing-l0-x2apic.patch @@ -0,0 +1,253 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: "Radim Krčmář" +Date: Mon, 8 Aug 2016 20:16:22 +0200 +Subject: KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC + +From: "Radim Krčmář" + +commit d048c098218e91ed0e10dfa1f0f80e2567fe4ef7 upstream. + +msr bitmap can be used to avoid a VM exit (interception) on guest MSR +accesses. In some configurations of VMX controls, the guest can even +directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED +APIC ACCESSES. + +L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. +To do so, L1 would first trick KVM to disable all possible interceptions +by enabling APICv features and then would turn those features off; +nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would +not intercept previously enabled MSRs even though they were not safe +with the new configuration. + +Correctly re-enabling interceptions is not enough as a second bug would +still allow L1+L2 to access host's MSRs: msr bitmap was shared for all +VMCSs, so L1 could trigger a race to get the desired combination of msr +bitmap and VMX controls. + +This fix allocates a msr bitmap for every L1 VCPU, allows only safe +x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would +have to intercept everything anyway. 
+ +Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") +Reported-by: Jim Mattson +Suggested-by: Wincy Van +Reviewed-by: Wanpeng Li +Signed-off-by: Radim Krčmář +[bwh: Backported to 4.4: + - handle_vmon() doesn't allocate a cached vmcs12 + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 96 ++++++++++++++++++++--------------------------------- + 1 file changed, 38 insertions(+), 58 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -431,6 +431,8 @@ struct nested_vmx { + u16 posted_intr_nv; + u64 msr_ia32_feature_control; + ++ unsigned long *msr_bitmap; ++ + struct hrtimer preemption_timer; + bool preemption_timer_expired; + +@@ -912,7 +914,6 @@ static unsigned long *vmx_msr_bitmap_leg + static unsigned long *vmx_msr_bitmap_longmode; + static unsigned long *vmx_msr_bitmap_legacy_x2apic; + static unsigned long *vmx_msr_bitmap_longmode_x2apic; +-static unsigned long *vmx_msr_bitmap_nested; + static unsigned long *vmx_vmread_bitmap; + static unsigned long *vmx_vmwrite_bitmap; + +@@ -2358,7 +2359,7 @@ static void vmx_set_msr_bitmap(struct kv + unsigned long *msr_bitmap; + + if (is_guest_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_nested; ++ msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; + else if (vcpu->arch.apic_base & X2APIC_ENABLE) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; +@@ -6192,13 +6193,6 @@ static __init int hardware_setup(void) + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + +- if (nested) { +- vmx_msr_bitmap_nested = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_nested) +- goto out5; +- } +- + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out6; +@@ -6216,8 +6210,6 @@ static __init int hardware_setup(void) + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); +- if (nested) +- memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); + + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; +@@ -6354,9 +6346,6 @@ out8: + out7: + free_page((unsigned long)vmx_vmread_bitmap); + out6: +- if (nested) +- free_page((unsigned long)vmx_msr_bitmap_nested); +-out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + out4: + free_page((unsigned long)vmx_msr_bitmap_longmode); +@@ -6382,8 +6371,6 @@ static __exit void hardware_unsetup(void + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); +- if (nested) +- free_page((unsigned long)vmx_msr_bitmap_nested); + + free_kvm_area(); + } +@@ -6825,10 +6812,17 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + } + ++ if (cpu_has_vmx_msr_bitmap()) { ++ vmx->nested.msr_bitmap = ++ (unsigned long *)__get_free_page(GFP_KERNEL); ++ if (!vmx->nested.msr_bitmap) ++ goto out_msr_bitmap; ++ } ++ + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) +- return -ENOMEM; ++ goto out_shadow_vmcs; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ +@@ -6850,6 +6844,12 @@ static int handle_vmon(struct kvm_vcpu * + skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); + return 1; ++ ++out_shadow_vmcs: ++ free_page((unsigned long)vmx->nested.msr_bitmap); ++ ++out_msr_bitmap: ++ return -ENOMEM; + } + + /* +@@ -6919,6 +6919,10 @@ static void free_nested(struct vcpu_vmx + vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); + 
nested_release_vmcs12(vmx); ++ if (vmx->nested.msr_bitmap) { ++ free_page((unsigned long)vmx->nested.msr_bitmap); ++ vmx->nested.msr_bitmap = NULL; ++ } + if (enable_shadow_vmcs) + free_vmcs(vmx->nested.current_shadow_vmcs); + /* Unpin physical memory we referred to in current vmcs02 */ +@@ -9248,8 +9252,10 @@ static inline bool nested_vmx_merge_msr_ + { + int msr; + struct page *page; +- unsigned long *msr_bitmap; ++ unsigned long *msr_bitmap_l1; ++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + ++ /* This shortcut is ok because we support only x2APIC MSRs so far. */ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + +@@ -9258,58 +9264,32 @@ static inline bool nested_vmx_merge_msr_ + WARN_ON(1); + return false; + } +- msr_bitmap = (unsigned long *)kmap(page); ++ msr_bitmap_l1 = (unsigned long *)kmap(page); ++ ++ memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( +- msr_bitmap, +- vmx_msr_bitmap_nested, ++ msr_bitmap_l1, msr_bitmap_l0, + msr, MSR_TYPE_R); +- /* TPR is allowed */ +- nested_vmx_disable_intercept_for_msr(msr_bitmap, +- vmx_msr_bitmap_nested, ++ ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); ++ + if (nested_cpu_has_vid(vmcs12)) { +- /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( +- msr_bitmap, +- vmx_msr_bitmap_nested, ++ msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( +- msr_bitmap, +- vmx_msr_bitmap_nested, ++ msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } +- } else { +- /* +- * Enable reading intercept of all the x2apic +- * MSRs. We should not rely on vmcs12 to do any +- * optimizations here, it may have been modified +- * by L1. +- */ +- for (msr = 0x800; msr <= 0x8ff; msr++) +- __vmx_enable_intercept_for_msr( +- vmx_msr_bitmap_nested, +- msr, +- MSR_TYPE_R); +- +- __vmx_enable_intercept_for_msr( +- vmx_msr_bitmap_nested, +- APIC_BASE_MSR + (APIC_TASKPRI >> 4), +- MSR_TYPE_W); +- __vmx_enable_intercept_for_msr( +- vmx_msr_bitmap_nested, +- APIC_BASE_MSR + (APIC_EOI >> 4), +- MSR_TYPE_W); +- __vmx_enable_intercept_for_msr( +- vmx_msr_bitmap_nested, +- APIC_BASE_MSR + (APIC_SELF_IPI >> 4), +- MSR_TYPE_W); + } + kunmap(page); + nested_release_page_clean(page); +@@ -9729,10 +9709,10 @@ static void prepare_vmcs02(struct kvm_vc + } + + if (cpu_has_vmx_msr_bitmap() && +- exec_control & CPU_BASED_USE_MSR_BITMAPS) { +- nested_vmx_merge_msr_bitmap(vcpu, vmcs12); +- /* MSR_BITMAP will be set by following vmx_set_efer. */ +- } else ++ exec_control & CPU_BASED_USE_MSR_BITMAPS && ++ nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) ++ ; /* MSR_BITMAP will be set by following vmx_set_efer. 
*/ ++ else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + + /* diff --git a/queue-4.4/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch b/queue-4.4/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch new file mode 100644 index 00000000000..516dee75eee --- /dev/null +++ b/queue-4.4/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch @@ -0,0 +1,113 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: David Matlack +Date: Tue, 1 Aug 2017 14:00:40 -0700 +Subject: KVM: nVMX: mark vmcs12 pages dirty on L2 exit + +From: David Matlack + +commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 upstream. + +The host physical addresses of L1's Virtual APIC Page and Posted +Interrupt descriptor are loaded into the VMCS02. The CPU may write +to these pages via their host physical address while L2 is running, +bypassing address-translation-based dirty tracking (e.g. EPT write +protection). Mark them dirty on every exit from L2 to prevent them +from getting out of sync with dirty tracking. + +Also mark the virtual APIC page and the posted interrupt descriptor +dirty when KVM is virtualizing posted interrupt processing. + +Signed-off-by: David Matlack +Reviewed-by: Paolo Bonzini +Signed-off-by: Radim Krčmář +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 43 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4527,6 +4527,28 @@ static int vmx_cpu_uses_apicv(struct kvm + return enable_apicv && lapic_in_kernel(vcpu); + } + ++static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) ++{ ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); ++ gfn_t gfn; ++ ++ /* ++ * Don't need to mark the APIC access page dirty; it is never ++ * written to by the CPU during APIC virtualization. ++ */ ++ ++ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { ++ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++ ++ if (nested_cpu_has_posted_intr(vmcs12)) { ++ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++} ++ ++ + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); +@@ -4534,18 +4556,15 @@ static void vmx_complete_nested_posted_i + void *vapic_page; + u16 status; + +- if (vmx->nested.pi_desc && +- vmx->nested.pi_pending) { +- vmx->nested.pi_pending = false; +- if (!pi_test_and_clear_on(vmx->nested.pi_desc)) +- return; +- +- max_irr = find_last_bit( +- (unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) ++ return; + +- if (max_irr == 256) +- return; ++ vmx->nested.pi_pending = false; ++ if (!pi_test_and_clear_on(vmx->nested.pi_desc)) ++ return; + ++ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (max_irr != 256) { + vapic_page = kmap(vmx->nested.virtual_apic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); +@@ -4557,6 +4576,8 @@ static void vmx_complete_nested_posted_i + vmcs_write16(GUEST_INTR_STATUS, status); + } + } ++ ++ nested_mark_vmcs12_pages_dirty(vcpu); + } + + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +@@ -7761,6 +7782,18 @@ static bool nested_vmx_exit_handled(stru + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + ++ /* ++ * The host physical addresses of some pages of guest memory ++ * are loaded into VMCS02 (e.g. 
L1's Virtual APIC Page). The CPU ++ * may write to these pages via their host physical address while ++ * L2 is running, bypassing any address-translation-based dirty ++ * tracking (e.g. EPT write protection). ++ * ++ * Mark them dirty on every exit from L2 to prevent them from ++ * getting out of sync with dirty tracking. ++ */ ++ nested_mark_vmcs12_pages_dirty(vcpu); ++ + if (vmx->nested.nested_run_pending) + return false; + diff --git a/queue-4.4/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.4/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch new file mode 100644 index 00000000000..50b350116b4 --- /dev/null +++ b/queue-4.4/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch @@ -0,0 +1,191 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: KarimAllah Ahmed +Date: Sat, 3 Feb 2018 15:56:23 +0100 +Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL + +From: KarimAllah Ahmed + +commit b2ac58f90540e39324e7a29a7ad471407ae0bf48 upstream. + +[ Based on a patch from Paolo Bonzini ] + +... basically doing exactly what we do for VMX: + +- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID) +- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest + actually used it. + +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Darren Kenny +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Tim Chen +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 88 insertions(+) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -147,6 +147,8 @@ struct vcpu_svm { + u64 gs_base; + } host; + ++ u64 spec_ctrl; ++ + u32 *msrpm; + + ulong nmi_iret_rip; +@@ -182,6 +184,7 @@ static const struct svm_direct_access_ms + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_SPEC_CTRL, .always = false }, + { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, +@@ -764,6 +767,25 @@ static bool valid_msr_intercept(u32 inde + return false; + } + ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) ++{ ++ u8 bit_write; ++ unsigned long tmp; ++ u32 offset; ++ u32 *msrpm; ++ ++ msrpm = is_guest_mode(vcpu) ? 
to_svm(vcpu)->nested.msrpm: ++ to_svm(vcpu)->msrpm; ++ ++ offset = svm_msrpm_offset(msr); ++ bit_write = 2 * (msr & 0x0f) + 1; ++ tmp = msrpm[offset]; ++ ++ BUG_ON(offset == MSR_INVALID); ++ ++ return !!test_bit(bit_write, &tmp); ++} ++ + static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) + { +@@ -1122,6 +1144,8 @@ static void svm_vcpu_reset(struct kvm_vc + u32 dummy; + u32 eax = 1; + ++ svm->spec_ctrl = 0; ++ + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; +@@ -3063,6 +3087,13 @@ static int svm_get_msr(struct kvm_vcpu * + case MSR_VM_CR: + msr_info->data = svm->nested.vm_cr_msr; + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = svm->spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3137,6 +3168,33 @@ static int svm_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ svm->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_svm_vmrun_msrpm. ++ * We update the L1 MSR bit as well since it will end up ++ * touching the MSR anyway now. ++ */ ++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -3839,6 +3897,15 @@ static void svm_vcpu_run(struct kvm_vcpu + + local_irq_enable(); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ + asm volatile ( + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +@@ -3931,6 +3998,27 @@ static void svm_vcpu_run(struct kvm_vcpu + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. 
++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + diff --git a/queue-4.4/kvm-svm-implement-virt_spec_ctrl-support-for-ssbd.patch b/queue-4.4/kvm-svm-implement-virt_spec_ctrl-support-for-ssbd.patch new file mode 100644 index 00000000000..05e436d9e60 --- /dev/null +++ b/queue-4.4/kvm-svm-implement-virt_spec_ctrl-support-for-ssbd.patch @@ -0,0 +1,228 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Tom Lendacky +Date: Thu, 10 May 2018 22:06:39 +0200 +Subject: KVM: SVM: Implement VIRT_SPEC_CTRL support for SSBD + +From: Tom Lendacky + +commit bc226f07dcd3c9ef0b7f6236fe356ea4a9cb4769 upstream. + +Expose the new virtualized architectural mechanism, VIRT_SSBD, for using +speculative store bypass disable (SSBD) under SVM. This will allow guests +to use SSBD on hardware that uses non-architectural mechanisms for enabling +SSBD. + +[ tglx: Folded the migration fixup from Paolo Bonzini ] + +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kernel/cpu/common.c | 3 ++- + arch/x86/kvm/cpuid.c | 11 +++++++++-- + arch/x86/kvm/cpuid.h | 9 +++++++++ + arch/x86/kvm/svm.c | 21 +++++++++++++++++++-- + arch/x86/kvm/vmx.c | 18 +++++++++++++++--- + arch/x86/kvm/x86.c | 13 ++++--------- + 7 files changed, 59 insertions(+), 18 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -765,7 +765,7 @@ struct kvm_x86_ops { + int (*hardware_setup)(void); /* __init */ + void (*hardware_unsetup)(void); /* __exit */ + bool (*cpu_has_accelerated_tpr)(void); +- bool (*cpu_has_high_real_mode_segbase)(void); ++ bool (*has_emulated_msr)(int index); + void (*cpuid_update)(struct kvm_vcpu *vcpu); + + /* Create, but do not attach this VCPU */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -693,7 +693,8 @@ static void init_speculation_control(str + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + +- if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD)) ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || ++ cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -343,7 +343,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(AMD_IBPB) | F(AMD_IBRS); ++ F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = +@@ -595,13 +595,20 @@ static inline int __do_cpuid_ent(struct + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->edx = 0; +- /* IBRS and IBPB aren't necessarily present in hardware cpuid */ ++ /* ++ * IBRS, IBPB and VIRT_SSBD aren't necessarily present in ++ * hardware cpuid ++ */ + if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + entry->ebx |= F(AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + entry->ebx |= F(AMD_IBRS); ++ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) ++ entry->ebx |= F(VIRT_SSBD); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); ++ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) ++ entry->ebx |= 
F(VIRT_SSBD); + break; + } + case 0x80000019: +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -189,6 +189,15 @@ static inline bool guest_cpuid_has_arch_ + return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); + } + ++static inline bool guest_cpuid_has_virt_ssbd(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ return best && (best->ebx & bit(X86_FEATURE_VIRT_SSBD)); ++} ++ ++ + + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3102,6 +3102,13 @@ static int svm_get_msr(struct kvm_vcpu * + + msr_info->data = svm->spec_ctrl; + break; ++ case MSR_AMD64_VIRT_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_virt_ssbd(vcpu)) ++ return 1; ++ ++ msr_info->data = svm->virt_spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3219,6 +3226,16 @@ static int svm_set_msr(struct kvm_vcpu * + break; + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + break; ++ case MSR_AMD64_VIRT_SPEC_CTRL: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_virt_ssbd(vcpu)) ++ return 1; ++ ++ if (data & ~SPEC_CTRL_SSBD) ++ return 1; ++ ++ svm->virt_spec_ctrl = data; ++ break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +@@ -4137,7 +4154,7 @@ static bool svm_cpu_has_accelerated_tpr( + return false; + } + +-static bool svm_has_high_real_mode_segbase(void) ++static bool svm_has_emulated_msr(int index) + { + return true; + } +@@ -4421,7 +4438,7 @@ static struct kvm_x86_ops svm_x86_ops = + .hardware_enable = svm_hardware_enable, + .hardware_disable = svm_hardware_disable, + .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, +- .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, ++ .has_emulated_msr = svm_has_emulated_msr, + + .vcpu_create = svm_create_vcpu, + .vcpu_free = svm_free_vcpu, +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8458,9 +8458,21 @@ static void vmx_handle_external_intr(str + local_irq_enable(); + } + +-static bool vmx_has_high_real_mode_segbase(void) ++static bool vmx_has_emulated_msr(int index) + { +- return enable_unrestricted_guest || emulate_invalid_guest_state; ++ switch (index) { ++ case MSR_IA32_SMBASE: ++ /* ++ * We cannot do SMM unless we can run the guest in big ++ * real mode. ++ */ ++ return enable_unrestricted_guest || emulate_invalid_guest_state; ++ case MSR_AMD64_VIRT_SPEC_CTRL: ++ /* This is AMD only. */ ++ return false; ++ default: ++ return true; ++ } + } + + static bool vmx_mpx_supported(void) +@@ -10952,7 +10964,7 @@ static struct kvm_x86_ops vmx_x86_ops = + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = report_flexpriority, +- .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, ++ .has_emulated_msr = vmx_has_emulated_msr, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -985,6 +985,7 @@ static u32 emulated_msrs[] = { + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_SMBASE, ++ MSR_AMD64_VIRT_SPEC_CTRL, + }; + + static unsigned num_emulated_msrs; +@@ -2584,7 +2585,7 @@ int kvm_vm_ioctl_check_extension(struct + * fringe case that is not enabled except via specific settings + * of the module parameters. 
+ */ +- r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); ++ r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); + break; + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; +@@ -4073,14 +4074,8 @@ static void kvm_init_msr_list(void) + num_msrs_to_save = j; + + for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { +- switch (emulated_msrs[i]) { +- case MSR_IA32_SMBASE: +- if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) +- continue; +- break; +- default: +- break; +- } ++ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) ++ continue; + + if (j < i) + emulated_msrs[j] = emulated_msrs[i]; diff --git a/queue-4.4/kvm-svm-move-spec-control-call-after-restore-of-gs.patch b/queue-4.4/kvm-svm-move-spec-control-call-after-restore-of-gs.patch new file mode 100644 index 00000000000..3645e791111 --- /dev/null +++ b/queue-4.4/kvm-svm-move-spec-control-call-after-restore-of-gs.patch @@ -0,0 +1,69 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Thomas Gleixner +Date: Fri, 11 May 2018 15:21:01 +0200 +Subject: KVM: SVM: Move spec control call after restore of GS + +From: Thomas Gleixner + +commit 15e6c22fd8e5a42c5ed6d487b7c9fe44c2517765 upstream. + +svm_vcpu_run() invokes x86_spec_ctrl_restore_host() after VMEXIT, but +before the host GS is restored. x86_spec_ctrl_restore_host() uses 'current' +to determine the host SSBD state of the thread. 'current' is GS based, but +host GS is not yet restored and the access causes a triple fault. + +Move the call after the host GS restore. + +Fixes: 885f82bfbc6f x86/process: Allow runtime control of Speculative Store Bypass +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Konrad Rzeszutek Wilk +Acked-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3998,6 +3998,18 @@ static void svm_vcpu_run(struct kvm_vcpu + #endif + ); + ++ /* Eliminate branch target predictions from guest mode */ ++ vmexit_fill_RSB(); ++ ++#ifdef CONFIG_X86_64 ++ wrmsrl(MSR_GS_BASE, svm->host.gs_base); ++#else ++ loadsegment(fs, svm->host.fs); ++#ifndef CONFIG_X86_32_LAZY_GS ++ loadsegment(gs, svm->host.gs); ++#endif ++#endif ++ + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and +@@ -4018,18 +4030,6 @@ static void svm_vcpu_run(struct kvm_vcpu + + x86_spec_ctrl_restore_host(svm->spec_ctrl); + +- /* Eliminate branch target predictions from guest mode */ +- vmexit_fill_RSB(); +- +-#ifdef CONFIG_X86_64 +- wrmsrl(MSR_GS_BASE, svm->host.gs_base); +-#else +- loadsegment(fs, svm->host.fs); +-#ifndef CONFIG_X86_32_LAZY_GS +- loadsegment(gs, svm->host.gs); +-#endif +-#endif +- + reload_tss(vcpu); + + local_irq_disable(); diff --git a/queue-4.4/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.4/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch new file mode 100644 index 00000000000..a49020847d8 --- /dev/null +++ b/queue-4.4/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch @@ -0,0 +1,298 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: KarimAllah Ahmed +Date: Thu, 1 Feb 2018 22:59:45 +0100 +Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL + +From: KarimAllah Ahmed + +commit d28b387fb74da95d69d2615732f50cceb38e9a4d upstream. 
+ +[ Based on a patch from Ashok Raj ] + +Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for +guests that will only mitigate Spectre V2 through IBRS+IBPB and will not +be using a retpoline+IBPB based approach. + +To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for +guests that do not actually use the MSR, only start saving and restoring +when a non-zero is written to it. + +No attempt is made to handle STIBP here, intentionally. Filtering STIBP +may be added in a future patch, which may require trapping all writes +if we don't want to pass it through directly to the guest. + +[dwmw2: Clean up CPUID bits, save/restore manually, handle reset] + +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Darren Kenny +Reviewed-by: Konrad Rzeszutek Wilk +Reviewed-by: Jim Mattson +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Tim Chen +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 8 ++- + arch/x86/kvm/cpuid.h | 11 +++++ + arch/x86/kvm/vmx.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++- + arch/x86/kvm/x86.c | 2 + 4 files changed, 118 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -343,7 +343,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(IBPB); ++ F(IBPB) | F(IBRS); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = +@@ -364,7 +364,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- F(ARCH_CAPABILITIES); ++ F(SPEC_CTRL) | F(ARCH_CAPABILITIES); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +@@ -595,9 +595,11 @@ static inline int __do_cpuid_ent(struct + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->edx = 0; +- /* IBPB isn't necessarily present in hardware cpuid */ ++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */ + if (boot_cpu_has(X86_FEATURE_IBPB)) + entry->ebx |= F(IBPB); ++ if (boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->ebx |= F(IBRS); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -170,6 +170,17 @@ static inline bool guest_cpuid_has_ibpb( + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBRS))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ + static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) + { + struct kvm_cpuid_entry2 *best; +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -546,6 +546,7 @@ struct vcpu_vmx { + #endif + + u64 arch_capabilities; ++ u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; +@@ -1693,6 +1694,29 @@ 
static void update_exception_bitmap(stru + } + + /* ++ * Check if MSR is intercepted for currently loaded MSR bitmap. ++ */ ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ ++/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ + static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +@@ -2834,6 +2858,13 @@ static int vmx_get_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = to_vmx(vcpu)->spec_ctrl; ++ break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has_arch_capabilities(vcpu)) +@@ -2939,6 +2970,36 @@ static int vmx_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ vmx->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. We update the vmcs01 here for L1 as well ++ * since it will end up touching the MSR anyway now. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_RW); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -5045,6 +5106,7 @@ static void vmx_vcpu_reset(struct kvm_vc + u64 cr0; + + vmx->rmode.vm86_active = 0; ++ vmx->spec_ctrl = 0; + + vmx->soft_vnmi_blocked = 0; + +@@ -8589,6 +8651,15 @@ static void __noclone vmx_vcpu_run(struc + atomic_switch_perf_msrs(vmx); + debugctlmsr = get_debugctlmsr(); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ + vmx->__launched = vmx->loaded_vmcs->launched; + asm( + /* Store host registers */ +@@ -8707,6 +8778,27 @@ static void __noclone vmx_vcpu_run(struc + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. 
++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +@@ -9242,7 +9334,7 @@ static inline bool nested_vmx_merge_msr_ + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* +- * pred_cmd is trying to verify two things: ++ * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap +@@ -9255,9 +9347,10 @@ static inline bool nested_vmx_merge_msr_ + * the MSR. + */ + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); ++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && +- !pred_cmd) ++ !pred_cmd && !spec_ctrl) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9293,6 +9386,12 @@ static inline bool nested_vmx_merge_msr_ + } + } + ++ if (spec_ctrl) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_R | MSR_TYPE_W); ++ + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -961,7 +961,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, +- MSR_IA32_ARCH_CAPABILITIES ++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; diff --git a/queue-4.4/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch b/queue-4.4/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch new file mode 100644 index 00000000000..c6ce7997d75 --- /dev/null +++ b/queue-4.4/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch @@ -0,0 +1,151 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: KarimAllah Ahmed +Date: Thu, 1 Feb 2018 22:59:44 +0100 +Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES + +From: KarimAllah Ahmed + +commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd upstream. + +Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO +(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the +contents will come directly from the hardware, but user-space can still +override it. 
+ +[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional] + +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Paolo Bonzini +Reviewed-by: Darren Kenny +Reviewed-by: Jim Mattson +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Dan Williams +Cc: Tim Chen +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 11 +++++++++-- + arch/x86/kvm/cpuid.h | 8 ++++++++ + arch/x86/kvm/vmx.c | 15 +++++++++++++++ + arch/x86/kvm/x86.c | 1 + + 4 files changed, 33 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -362,6 +362,10 @@ static inline int __do_cpuid_ent(struct + const u32 kvm_supported_word10_x86_features = + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + ++ /* cpuid 7.0.edx*/ ++ const u32 kvm_cpuid_7_0_edx_x86_features = ++ F(ARCH_CAPABILITIES); ++ + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + +@@ -439,11 +443,14 @@ static inline int __do_cpuid_ent(struct + cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); +- } else ++ entry->edx &= kvm_cpuid_7_0_edx_x86_features; ++ cpuid_mask(&entry->edx, CPUID_7_EDX); ++ } else { + entry->ebx = 0; ++ entry->edx = 0; ++ } + entry->eax = 0; + entry->ecx = 0; +- entry->edx = 0; + break; + } + case 9: +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -170,6 +170,14 @@ static inline bool guest_cpuid_has_ibpb( + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); ++} ++ + + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -545,6 +545,8 @@ struct vcpu_vmx { + u64 msr_guest_kernel_gs_base; + #endif + ++ u64 arch_capabilities; ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -2832,6 +2834,12 @@ static int vmx_get_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_arch_capabilities(vcpu)) ++ return 1; ++ msr_info->data = to_vmx(vcpu)->arch_capabilities; ++ break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; +@@ -2958,6 +2966,11 @@ static int vmx_set_msr(struct kvm_vcpu * + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated) ++ return 1; ++ vmx->arch_capabilities = data; ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -5002,6 +5015,8 @@ static int vmx_vcpu_setup(struct vcpu_vm + ++vmx->nmsrs; + } + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + + 
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -961,6 +961,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, ++ MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; diff --git a/queue-4.4/kvm-vmx-introduce-alloc_loaded_vmcs.patch b/queue-4.4/kvm-vmx-introduce-alloc_loaded_vmcs.patch new file mode 100644 index 00000000000..e71b9722cc0 --- /dev/null +++ b/queue-4.4/kvm-vmx-introduce-alloc_loaded_vmcs.patch @@ -0,0 +1,102 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Paolo Bonzini +Date: Thu, 11 Jan 2018 12:16:15 +0100 +Subject: KVM: VMX: introduce alloc_loaded_vmcs + +From: Paolo Bonzini + +commit f21f165ef922c2146cc5bdc620f542953c41714b upstream. + +Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also +allocate an MSR bitmap there. + +Signed-off-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: + - No loaded_vmcs::shadow_vmcs field to initialise + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++++------------- + 1 file changed, 22 insertions(+), 13 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -3345,11 +3345,6 @@ static struct vmcs *alloc_vmcs_cpu(int c + return vmcs; + } + +-static struct vmcs *alloc_vmcs(void) +-{ +- return alloc_vmcs_cpu(raw_smp_processor_id()); +-} +- + static void free_vmcs(struct vmcs *vmcs) + { + free_pages((unsigned long)vmcs, vmcs_config.order); +@@ -3367,6 +3362,21 @@ static void free_loaded_vmcs(struct load + loaded_vmcs->vmcs = NULL; + } + ++static struct vmcs *alloc_vmcs(void) ++{ ++ return alloc_vmcs_cpu(raw_smp_processor_id()); ++} ++ ++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) ++{ ++ loaded_vmcs->vmcs = alloc_vmcs(); ++ if (!loaded_vmcs->vmcs) ++ return -ENOMEM; ++ ++ loaded_vmcs_init(loaded_vmcs); ++ return 0; ++} ++ + static void free_kvm_area(void) + { + int cpu; +@@ -6699,6 +6709,7 @@ static int handle_vmon(struct kvm_vcpu * + struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; ++ int r; + + /* The Intel VMX Instruction Reference lists a bunch of bits that + * are prerequisite to running VMXON, most notably cr4.VMXE must be +@@ -6738,10 +6749,9 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + } + +- vmx->nested.vmcs02.vmcs = alloc_vmcs(); +- if (!vmx->nested.vmcs02.vmcs) ++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02); ++ if (r < 0) + goto out_vmcs02; +- loaded_vmcs_init(&vmx->nested.vmcs02); + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = +@@ -8802,16 +8812,15 @@ static struct kvm_vcpu *vmx_create_vcpu( + if (!vmx->guest_msrs) + goto free_pml; + +- vmx->loaded_vmcs = &vmx->vmcs01; +- vmx->loaded_vmcs->vmcs = alloc_vmcs(); +- if (!vmx->loaded_vmcs->vmcs) +- goto free_msrs; + if (!vmm_exclusive) + kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); +- loaded_vmcs_init(vmx->loaded_vmcs); ++ err = alloc_loaded_vmcs(&vmx->vmcs01); + if (!vmm_exclusive) + kvm_cpu_vmxoff(); ++ if (err < 0) ++ goto free_msrs; + ++ vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; diff --git a/queue-4.4/kvm-vmx-make-msr-bitmaps-per-vcpu.patch b/queue-4.4/kvm-vmx-make-msr-bitmaps-per-vcpu.patch new file mode 
100644 index 00000000000..547ac46a16e --- /dev/null +++ b/queue-4.4/kvm-vmx-make-msr-bitmaps-per-vcpu.patch @@ -0,0 +1,512 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Paolo Bonzini +Date: Tue, 16 Jan 2018 16:51:18 +0100 +Subject: KVM: VMX: make MSR bitmaps per-VCPU + +From: Paolo Bonzini + +commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 upstream. + +Place the MSR bitmap in struct loaded_vmcs, and update it in place +every time the x2apic or APICv state can change. This is rare and +the loop can handle 64 MSRs per iteration, in a similar fashion as +nested_vmx_prepare_msr_bitmap. + +This prepares for choosing, on a per-VM basis, whether to intercept +the SPEC_CTRL and PRED_CMD MSRs. + +Suggested-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: + - APICv support looked different + - We still need to intercept the APIC_ID MSR + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 254 +++++++++++++++++++++++------------------------------ + 1 file changed, 112 insertions(+), 142 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -109,6 +109,14 @@ static u64 __read_mostly host_xss; + static bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + ++#define MSR_TYPE_R 1 ++#define MSR_TYPE_W 2 ++#define MSR_TYPE_RW 3 ++ ++#define MSR_BITMAP_MODE_X2APIC 1 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2 ++#define MSR_BITMAP_MODE_LM 4 ++ + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +@@ -188,6 +196,7 @@ struct loaded_vmcs { + struct vmcs *vmcs; + int cpu; + int launched; ++ unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + }; + +@@ -423,8 +432,6 @@ struct nested_vmx { + u16 posted_intr_nv; + u64 msr_ia32_feature_control; + +- unsigned long *msr_bitmap; +- + struct hrtimer preemption_timer; + bool preemption_timer_expired; + +@@ -525,6 +532,7 @@ struct vcpu_vmx { + unsigned long host_rsp; + u8 fail; + bool nmi_known_unmasked; ++ u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +@@ -883,6 +891,7 @@ static void vmx_sync_pir_to_irr_dummy(st + static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -902,10 +911,6 @@ static DEFINE_PER_CPU(spinlock_t, blocke + + static unsigned long *vmx_io_bitmap_a; + static unsigned long *vmx_io_bitmap_b; +-static unsigned long *vmx_msr_bitmap_legacy; +-static unsigned long *vmx_msr_bitmap_longmode; +-static unsigned long *vmx_msr_bitmap_legacy_x2apic; +-static unsigned long *vmx_msr_bitmap_longmode_x2apic; + static unsigned long *vmx_vmread_bitmap; + static unsigned long *vmx_vmwrite_bitmap; + +@@ -2346,27 +2351,6 @@ static void move_msr_up(struct vcpu_vmx + vmx->guest_msrs[from] = tmp; + } + +-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +-{ +- unsigned long *msr_bitmap; +- +- if (is_guest_mode(vcpu)) +- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; +- else if (vcpu->arch.apic_base & X2APIC_ENABLE) { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic; +- } else { +- if 
(is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode; +- else +- msr_bitmap = vmx_msr_bitmap_legacy; +- } +- +- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +-} +- + /* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy +@@ -2407,7 +2391,7 @@ static void setup_msrs(struct vcpu_vmx * + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(&vmx->vcpu); ++ vmx_update_msr_bitmap(&vmx->vcpu); + } + + /* +@@ -3360,6 +3344,8 @@ static void free_loaded_vmcs(struct load + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; ++ if (loaded_vmcs->msr_bitmap) ++ free_page((unsigned long)loaded_vmcs->msr_bitmap); + } + + static struct vmcs *alloc_vmcs(void) +@@ -3374,7 +3360,18 @@ static int alloc_loaded_vmcs(struct load + return -ENOMEM; + + loaded_vmcs_init(loaded_vmcs); ++ ++ if (cpu_has_vmx_msr_bitmap()) { ++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); ++ if (!loaded_vmcs->msr_bitmap) ++ goto out_vmcs; ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); ++ } + return 0; ++ ++out_vmcs: ++ free_loaded_vmcs(loaded_vmcs); ++ return -ENOMEM; + } + + static void free_kvm_area(void) +@@ -4373,10 +4370,8 @@ static void free_vpid(int vpid) + spin_unlock(&vmx_vpid_lock); + } + +-#define MSR_TYPE_R 1 +-#define MSR_TYPE_W 2 +-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4410,8 +4405,8 @@ static void __vmx_disable_intercept_for_ + } + } + +-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4491,37 +4486,76 @@ static void nested_vmx_disable_intercept + } + } + +-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) ++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type, bool value) + { +- if (!longmode_only) +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +- msr, MSR_TYPE_R | MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +- msr, MSR_TYPE_R | MSR_TYPE_W); ++ if (value) ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type); ++ else ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + } + +-static void vmx_enable_intercept_msr_read_x2apic(u32 msr) ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) + { +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); ++ u8 mode = 0; ++ ++ if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { ++ mode |= MSR_BITMAP_MODE_X2APIC; ++ if (enable_apicv) ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV; ++ } ++ ++ if (is_long_mode(vcpu)) ++ mode |= MSR_BITMAP_MODE_LM; ++ ++ return mode; + } + +-static void vmx_disable_intercept_msr_read_x2apic(u32 msr) ++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) ++ ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, ++ u8 mode) + { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); ++ int msr; ++ ++ for 
(msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { ++ unsigned word = msr / BITS_PER_LONG; ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0; ++ } ++ ++ if (mode & MSR_BITMAP_MODE_X2APIC) { ++ /* ++ * TPR reads and writes can be virtualized even if virtual interrupt ++ * delivery is not in use. ++ */ ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_ID), MSR_TYPE_R); ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); ++ } ++ } + } + +-static void vmx_disable_intercept_msr_write_x2apic(u32 msr) ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) + { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_W); ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; ++ u8 mode = vmx_msr_bitmap_mode(vcpu); ++ u8 changed = mode ^ vmx->msr_bitmap_mode; ++ ++ if (!changed) ++ return; ++ ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, ++ !(mode & MSR_BITMAP_MODE_LM)); ++ ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); ++ ++ vmx->msr_bitmap_mode = mode; + } + + static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +@@ -4842,7 +4876,7 @@ static int vmx_vcpu_setup(struct vcpu_vm + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } + if (cpu_has_vmx_msr_bitmap()) +- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + +@@ -6183,7 +6217,7 @@ static void wakeup_handler(void) + + static __init int hardware_setup(void) + { +- int r = -ENOMEM, i, msr; ++ int r = -ENOMEM, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + +@@ -6198,31 +6232,13 @@ static __init int hardware_setup(void) + if (!vmx_io_bitmap_b) + goto out; + +- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy) +- goto out1; +- +- vmx_msr_bitmap_legacy_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy_x2apic) +- goto out2; +- +- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode) +- goto out3; +- +- vmx_msr_bitmap_longmode_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode_x2apic) +- goto out4; +- + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) +- goto out6; ++ goto out1; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) +- goto out7; ++ goto out2; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); +@@ -6231,12 +6247,9 @@ static __init int hardware_setup(void) + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + +- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); +- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); +- + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; +- goto out8; ++ goto out3; + } + + if (boot_cpu_has(X86_FEATURE_NX)) +@@ -6302,38 
+6315,8 @@ static __init int hardware_setup(void) + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + +- vmx_disable_intercept_for_msr(MSR_FS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_GS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); +- +- memcpy(vmx_msr_bitmap_legacy_x2apic, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + +- if (enable_apicv) { +- for (msr = 0x800; msr <= 0x8ff; msr++) +- vmx_disable_intercept_msr_read_x2apic(msr); +- +- /* According SDM, in x2apic mode, the whole id reg is used. +- * But in KVM, it only use the highest eight bits. Need to +- * intercept it */ +- vmx_enable_intercept_msr_read_x2apic(0x802); +- /* TMCCT */ +- vmx_enable_intercept_msr_read_x2apic(0x839); +- /* TPR */ +- vmx_disable_intercept_msr_write_x2apic(0x808); +- /* EOI */ +- vmx_disable_intercept_msr_write_x2apic(0x80b); +- /* SELF-IPI */ +- vmx_disable_intercept_msr_write_x2apic(0x83f); +- } +- + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, +@@ -6364,18 +6347,10 @@ static __init int hardware_setup(void) + + return alloc_kvm_area(); + +-out8: +- free_page((unsigned long)vmx_vmwrite_bitmap); +-out7: +- free_page((unsigned long)vmx_vmread_bitmap); +-out6: +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +-out4: +- free_page((unsigned long)vmx_msr_bitmap_longmode); + out3: +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); ++ free_page((unsigned long)vmx_vmwrite_bitmap); + out2: +- free_page((unsigned long)vmx_msr_bitmap_legacy); ++ free_page((unsigned long)vmx_vmread_bitmap); + out1: + free_page((unsigned long)vmx_io_bitmap_b); + out: +@@ -6386,10 +6361,6 @@ out: + + static __exit void hardware_unsetup(void) + { +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_legacy); +- free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); +@@ -6753,13 +6724,6 @@ static int handle_vmon(struct kvm_vcpu * + if (r < 0) + goto out_vmcs02; + +- if (cpu_has_vmx_msr_bitmap()) { +- vmx->nested.msr_bitmap = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx->nested.msr_bitmap) +- goto out_msr_bitmap; +- } +- + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) +@@ -6784,9 +6748,6 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + + out_shadow_vmcs: +- free_page((unsigned long)vmx->nested.msr_bitmap); +- +-out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + + out_vmcs02: +@@ -6860,10 +6821,6 @@ static void free_nested(struct vcpu_vmx + vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); + nested_release_vmcs12(vmx); +- if (vmx->nested.msr_bitmap) { +- free_page((unsigned long)vmx->nested.msr_bitmap); +- vmx->nested.msr_bitmap = NULL; +- } + if (enable_shadow_vmcs) + free_vmcs(vmx->nested.current_shadow_vmcs); + /* Unpin physical memory we referred to in the vmcs02 */ +@@ -8200,7 +8157,7 @@ static void vmx_set_virtual_x2apic_mode( + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 
sec_exec_control); + +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +@@ -8780,6 +8737,7 @@ static struct kvm_vcpu *vmx_create_vcpu( + { + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); ++ unsigned long *msr_bitmap; + int cpu; + + if (!vmx) +@@ -8820,6 +8778,15 @@ static struct kvm_vcpu *vmx_create_vcpu( + if (err < 0) + goto free_msrs; + ++ msr_bitmap = vmx->vmcs01.msr_bitmap; ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); ++ vmx->msr_bitmap_mode = 0; ++ + vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); +@@ -9204,7 +9171,7 @@ static inline bool nested_vmx_merge_msr_ + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; +- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; ++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + + /* This shortcut is ok because we support only x2APIC MSRs so far. */ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) +@@ -9715,6 +9682,9 @@ static void prepare_vmcs02(struct kvm_vc + else + vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + ++ if (cpu_has_vmx_msr_bitmap()) ++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); ++ + if (enable_vpid) { + /* + * There is no direct mapping between vpid02 and vpid12, the +@@ -10415,7 +10385,7 @@ static void load_vmcs12_host_state(struc + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) diff --git a/queue-4.4/kvm-x86-add-ibpb-support.patch b/queue-4.4/kvm-x86-add-ibpb-support.patch new file mode 100644 index 00000000000..330b42af712 --- /dev/null +++ b/queue-4.4/kvm-x86-add-ibpb-support.patch @@ -0,0 +1,346 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Ashok Raj +Date: Thu, 1 Feb 2018 22:59:43 +0100 +Subject: KVM/x86: Add IBPB support + +From: Ashok Raj + +commit 15d45071523d89b3fb7372e2135fbd72f6af9506 upstream. + +The Indirect Branch Predictor Barrier (IBPB) is an indirect branch +control mechanism. It keeps earlier branches from influencing +later ones. + +Unlike IBRS and STIBP, IBPB does not define a new mode of operation. +It's a command that ensures predicted branch targets aren't used after +the barrier. Although IBRS and IBPB are enumerated by the same CPUID +enumeration, IBPB is very different. + +IBPB helps mitigate against three potential attacks: + +* Mitigate guests from being attacked by other guests. + - This is addressed by issing IBPB when we do a guest switch. + +* Mitigate attacks from guest/ring3->host/ring3. + These would require a IBPB during context switch in host, or after + VMEXIT. The host process has two ways to mitigate + - Either it can be compiled with retpoline + - If its going through context switch, and has set !dumpable then + there is a IBPB in that path. 
+ (Tim's patch: https://patchwork.kernel.org/patch/10192871) + - The case where after a VMEXIT you return back to Qemu might make + Qemu attackable from guest when Qemu isn't compiled with retpoline. + There are issues reported when doing IBPB on every VMEXIT that resulted + in some tsc calibration woes in guest. + +* Mitigate guest/ring0->host/ring0 attacks. + When host kernel is using retpoline it is safe against these attacks. + If host kernel isn't using retpoline we might need to do a IBPB flush on + every VMEXIT. + +Even when using retpoline for indirect calls, in certain conditions 'ret' +can use the BTB on Skylake-era CPUs. There are other mitigations +available like RSB stuffing/clearing. + +* IBPB is issued only for SVM during svm_free_vcpu(). + VMX has a vmclear and SVM doesn't. Follow discussion here: + https://lkml.org/lkml/2018/1/15/146 + +Please refer to the following spec for more details on the enumeration +and control. + +Refer here to get documentation about mitigations. + +https://software.intel.com/en-us/side-channel-security-support + +[peterz: rebase and changelog rewrite] +[karahmed: - rebase + - vmx: expose PRED_CMD if guest has it in CPUID + - svm: only pass through IBPB if guest has it in CPUID + - vmx: support !cpu_has_vmx_msr_bitmap()] + - vmx: support nested] +[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS) + PRED_CMD is a write-only MSR] + +Signed-off-by: Ashok Raj +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: David Woodhouse +Signed-off-by: KarimAllah Ahmed +Signed-off-by: Thomas Gleixner +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: kvm@vger.kernel.org +Cc: Asit Mallick +Cc: Linus Torvalds +Cc: Andy Lutomirski +Cc: Dave Hansen +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Jun Nakajima +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Tim Chen +Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com +Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 11 ++++++- + arch/x86/kvm/cpuid.h | 12 +++++++ + arch/x86/kvm/svm.c | 28 ++++++++++++++++++ + arch/x86/kvm/vmx.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 127 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -341,6 +341,10 @@ static inline int __do_cpuid_ent(struct + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + ++ /* cpuid 0x80000008.ebx */ ++ const u32 kvm_cpuid_8000_0008_ebx_x86_features = ++ F(IBPB); ++ + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | +@@ -583,7 +587,12 @@ static inline int __do_cpuid_ent(struct + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); +- entry->ebx = entry->edx = 0; ++ entry->edx = 0; ++ /* IBPB isn't necessarily present in hardware cpuid */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) ++ entry->ebx |= F(IBPB); ++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; + } + case 0x80000019: +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -159,6 +159,18 @@ static inline bool guest_cpuid_has_rdtsc + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); + } + ++static 
inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBPB))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ ++ + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -182,6 +182,7 @@ static const struct svm_direct_access_ms + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, +@@ -411,6 +412,7 @@ struct svm_cpu_data { + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; ++ struct vmcb *current_vmcb; + }; + + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +@@ -1210,11 +1212,17 @@ static void svm_free_vcpu(struct kvm_vcp + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); ++ /* ++ * The vmcb page can be recycled, causing a false negative in ++ * svm_vcpu_load(). So do a full IBPB now. ++ */ ++ indirect_branch_prediction_barrier(); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { +@@ -1239,6 +1247,10 @@ static void svm_vcpu_load(struct kvm_vcp + wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); + } + } ++ if (sd->current_vmcb != svm->vmcb) { ++ sd->current_vmcb = svm->vmcb; ++ indirect_branch_prediction_barrier(); ++ } + } + + static void svm_vcpu_put(struct kvm_vcpu *vcpu) +@@ -3125,6 +3137,22 @@ static int svm_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ if (is_guest_mode(vcpu)) ++ break; ++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); ++ break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -544,6 +544,7 @@ struct vcpu_vmx { + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; + #endif ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -892,6 +893,8 @@ static void copy_vmcs12_to_shadow(struct + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); + static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -1687,6 +1690,29 @@ static void update_exception_bitmap(stru + vmcs_write32(EXCEPTION_BITMAP, eb); + } + ++/* ++ * Check if MSR is intercepted for L01 MSR bitmap. 
++ */ ++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) + { +@@ -2072,6 +2098,7 @@ static void vmx_vcpu_load(struct kvm_vcp + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); ++ indirect_branch_prediction_barrier(); + } + + if (vmx->loaded_vmcs->cpu != cpu) { +@@ -2904,6 +2931,33 @@ static int vmx_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -9172,9 +9226,23 @@ static inline bool nested_vmx_merge_msr_ + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; ++ /* ++ * pred_cmd is trying to verify two things: ++ * ++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This ++ * ensures that we do not accidentally generate an L02 MSR bitmap ++ * from the L12 MSR bitmap that is too permissive. ++ * 2. That L1 or L2s have actually used the MSR. This avoids ++ * unnecessarily merging of the bitmap if the MSR is unused. This ++ * works properly because we only update the L01 MSR bitmap lazily. ++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only ++ * updated to reflect this when L1 (or its L2s) actually write to ++ * the MSR. ++ */ ++ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + +- /* This shortcut is ok because we support only x2APIC MSRs so far. 
*/ +- if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) ++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && ++ !pred_cmd) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9209,6 +9277,13 @@ static inline bool nested_vmx_merge_msr_ + MSR_TYPE_W); + } + } ++ ++ if (pred_cmd) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ + kunmap(page); + nested_release_page_clean(page); + diff --git a/queue-4.4/kvm-x86-remove-indirect-msr-op-calls-from-spec_ctrl.patch b/queue-4.4/kvm-x86-remove-indirect-msr-op-calls-from-spec_ctrl.patch new file mode 100644 index 00000000000..71887be8e1a --- /dev/null +++ b/queue-4.4/kvm-x86-remove-indirect-msr-op-calls-from-spec_ctrl.patch @@ -0,0 +1,100 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Paolo Bonzini +Date: Thu, 22 Feb 2018 16:43:17 +0100 +Subject: KVM/x86: Remove indirect MSR op calls from SPEC_CTRL + +From: Paolo Bonzini + +commit ecb586bd29c99fb4de599dec388658e74388daad upstream. + +Having a paravirt indirect call in the IBRS restore path is not a +good idea, since we are trying to protect from speculative execution +of bogus indirect branch targets. It is also slower, so use +native_wrmsrl() on the vmentry path too. + +Signed-off-by: Paolo Bonzini +Reviewed-by: Jim Mattson +Cc: David Woodhouse +Cc: KarimAllah Ahmed +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Radim Krčmář +Cc: Thomas Gleixner +Cc: kvm@vger.kernel.org +Cc: stable@vger.kernel.org +Fixes: d28b387fb74da95d69d2615732f50cceb38e9a4d +Link: http://lkml.kernel.org/r/20180222154318.20361-2-pbonzini@redhat.com +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 7 ++++--- + arch/x86/kvm/vmx.c | 7 ++++--- + 2 files changed, 8 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -3904,7 +3905,7 @@ static void svm_vcpu_run(struct kvm_vcpu + * being speculatively taken. + */ + if (svm->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + + asm volatile ( + "push %%" _ASM_BP "; \n\t" +@@ -4014,10 +4015,10 @@ static void svm_vcpu_run(struct kvm_vcpu + * save it. + */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) +- rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (svm->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -48,6 +48,7 @@ + #include + #include + #include ++#include + #include + + #include "trace.h" +@@ -8658,7 +8659,7 @@ static void __noclone vmx_vcpu_run(struc + * being speculatively taken. + */ + if (vmx->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + vmx->__launched = vmx->loaded_vmcs->launched; + asm( +@@ -8794,10 +8795,10 @@ static void __noclone vmx_vcpu_run(struc + * save it. 
+ */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) +- rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (vmx->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); diff --git a/queue-4.4/mm-hugetlb.c-don-t-call-region_abort-if-region_chg-fails.patch b/queue-4.4/mm-hugetlb.c-don-t-call-region_abort-if-region_chg-fails.patch new file mode 100644 index 00000000000..ea766dd2253 --- /dev/null +++ b/queue-4.4/mm-hugetlb.c-don-t-call-region_abort-if-region_chg-fails.patch @@ -0,0 +1,62 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Mike Kravetz +Date: Fri, 31 Mar 2017 15:12:07 -0700 +Subject: mm/hugetlb.c: don't call region_abort if region_chg fails + +From: Mike Kravetz + +commit ff8c0c53c47530ffea82c22a0a6df6332b56c957 upstream. + +Changes to hugetlbfs reservation maps is a two step process. The first +step is a call to region_chg to determine what needs to be changed, and +prepare that change. This should be followed by a call to call to +region_add to commit the change, or region_abort to abort the change. + +The error path in hugetlb_reserve_pages called region_abort after a +failed call to region_chg. As a result, the adds_in_progress counter in +the reservation map is off by 1. This is caught by a VM_BUG_ON in +resv_map_release when the reservation map is freed. + +syzkaller fuzzer (when using an injected kmalloc failure) found this +bug, that resulted in the following: + + kernel BUG at mm/hugetlb.c:742! + Call Trace: + hugetlbfs_evict_inode+0x7b/0xa0 fs/hugetlbfs/inode.c:493 + evict+0x481/0x920 fs/inode.c:553 + iput_final fs/inode.c:1515 [inline] + iput+0x62b/0xa20 fs/inode.c:1542 + hugetlb_file_setup+0x593/0x9f0 fs/hugetlbfs/inode.c:1306 + newseg+0x422/0xd30 ipc/shm.c:575 + ipcget_new ipc/util.c:285 [inline] + ipcget+0x21e/0x580 ipc/util.c:639 + SYSC_shmget ipc/shm.c:673 [inline] + SyS_shmget+0x158/0x230 ipc/shm.c:657 + entry_SYSCALL_64_fastpath+0x1f/0xc2 + RIP: resv_map_release+0x265/0x330 mm/hugetlb.c:742 + +Link: http://lkml.kernel.org/r/1490821682-23228-1-git-send-email-mike.kravetz@oracle.com +Signed-off-by: Mike Kravetz +Reported-by: Dmitry Vyukov +Acked-by: Hillf Danton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -4142,7 +4142,9 @@ int hugetlb_reserve_pages(struct inode * + return 0; + out_err: + if (!vma || vma->vm_flags & VM_MAYSHARE) +- region_abort(resv_map, from, to); ++ /* Don't call region_abort if region_chg failed */ ++ if (chg >= 0) ++ region_abort(resv_map, from, to); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); + return ret; diff --git a/queue-4.4/posix-timers-sanitize-overrun-handling.patch b/queue-4.4/posix-timers-sanitize-overrun-handling.patch new file mode 100644 index 00000000000..1126dc91303 --- /dev/null +++ b/queue-4.4/posix-timers-sanitize-overrun-handling.patch @@ -0,0 +1,147 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Thomas Gleixner +Date: Thu, 1 Nov 2018 13:02:38 -0700 +Subject: posix-timers: Sanitize overrun handling + +From: Thomas Gleixner + +commit 78c9c4dfbf8c04883941445a195276bb4bb92c76 upstream. 
+ +The posix timer overrun handling is broken because the forwarding functions +can return a huge number of overruns which does not fit in an int. As a +consequence timer_getoverrun(2) and siginfo::si_overrun can turn into +random number generators. + +The k_clock::timer_forward() callbacks return a 64 bit value now. Make +k_itimer::ti_overrun[_last] 64bit as well, so the kernel internal +accounting is correct. 3Remove the temporary (int) casts. + +Add a helper function which clamps the overrun value returned to user space +via timer_getoverrun(2) or siginfo::si_overrun limited to a positive value +between 0 and INT_MAX. INT_MAX is an indicator for user space that the +overrun value has been clamped. + +Reported-by: Team OWL337 +Signed-off-by: Thomas Gleixner +Acked-by: John Stultz +Cc: Peter Zijlstra +Cc: Michael Kerrisk +Link: https://lkml.kernel.org/r/20180626132705.018623573@linutronix.de +[florian: Make patch apply to v4.9.135] +Signed-off-by: Florian Fainelli +Reviewed-by: Thomas Gleixner +Signed-off-by: Sasha Levin +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/posix-timers.h | 4 ++-- + kernel/time/posix-cpu-timers.c | 2 +- + kernel/time/posix-timers.c | 29 +++++++++++++++++++---------- + 3 files changed, 22 insertions(+), 13 deletions(-) + +--- a/include/linux/posix-timers.h ++++ b/include/linux/posix-timers.h +@@ -65,8 +65,8 @@ struct k_itimer { + spinlock_t it_lock; + clockid_t it_clock; /* which timer type */ + timer_t it_id; /* timer id */ +- int it_overrun; /* overrun on pending signal */ +- int it_overrun_last; /* overrun on last delivered signal */ ++ s64 it_overrun; /* overrun on pending signal */ ++ s64 it_overrun_last; /* overrun on last delivered signal */ + int it_requeue_pending; /* waiting to requeue this timer */ + #define REQUEUE_PENDING 1 + int it_sigev_notify; /* notify word of sigevent struct */ +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -103,7 +103,7 @@ static void bump_cpu_timer(struct k_itim + continue; + + timer->it.cpu.expires += incr; +- timer->it_overrun += 1 << i; ++ timer->it_overrun += 1LL << i; + delta -= incr; + } + } +--- a/kernel/time/posix-timers.c ++++ b/kernel/time/posix-timers.c +@@ -355,6 +355,17 @@ static __init int init_posix_timers(void + + __initcall(init_posix_timers); + ++/* ++ * The siginfo si_overrun field and the return value of timer_getoverrun(2) ++ * are of type int. Clamp the overrun value to INT_MAX ++ */ ++static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) ++{ ++ s64 sum = timr->it_overrun_last + (s64)baseval; ++ ++ return sum > (s64)INT_MAX ? 
INT_MAX : (int)sum; ++} ++ + static void schedule_next_timer(struct k_itimer *timr) + { + struct hrtimer *timer = &timr->it.real.timer; +@@ -362,12 +373,11 @@ static void schedule_next_timer(struct k + if (timr->it.real.interval.tv64 == 0) + return; + +- timr->it_overrun += (unsigned int) hrtimer_forward(timer, +- timer->base->get_time(), +- timr->it.real.interval); ++ timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), ++ timr->it.real.interval); + + timr->it_overrun_last = timr->it_overrun; +- timr->it_overrun = -1; ++ timr->it_overrun = -1LL; + ++timr->it_requeue_pending; + hrtimer_restart(timer); + } +@@ -396,7 +406,7 @@ void do_schedule_next_timer(struct sigin + else + schedule_next_timer(timr); + +- info->si_overrun += timr->it_overrun_last; ++ info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); + } + + if (timr) +@@ -491,8 +501,7 @@ static enum hrtimer_restart posix_timer_ + now = ktime_add(now, kj); + } + #endif +- timr->it_overrun += (unsigned int) +- hrtimer_forward(timer, now, ++ timr->it_overrun += hrtimer_forward(timer, now, + timr->it.real.interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; +@@ -633,7 +642,7 @@ SYSCALL_DEFINE3(timer_create, const cloc + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; +- new_timer->it_overrun = -1; ++ new_timer->it_overrun = -1LL; + + if (timer_event_spec) { + if (copy_from_user(&event, timer_event_spec, sizeof (event))) { +@@ -762,7 +771,7 @@ common_timer_get(struct k_itimer *timr, + */ + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + timr->it_sigev_notify == SIGEV_NONE)) +- timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); ++ timr->it_overrun += hrtimer_forward(timer, now, iv); + + remaining = __hrtimer_expires_remaining_adjusted(timer, now); + /* Return 0 only, when the timer is expired and not pending */ +@@ -824,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_ + if (!timr) + return -EINVAL; + +- overrun = timr->it_overrun_last; ++ overrun = timer_overrun_to_int(timr, 0); + unlock_timer(timr, flags); + + return overrun; diff --git a/queue-4.4/series b/queue-4.4/series index 66de3e3be48..f00a30a74ea 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -37,3 +37,37 @@ hfsplus-do-not-free-node-before-using.patch debugobjects-avoid-recursive-calls-with-kmemleak.patch ocfs2-fix-potential-use-after-free.patch pstore-convert-console-write-to-use-write_buf.patch +alsa-pcm-remove-sndrv_pcm_ioctl1_info-internal-command.patch +kvm-nvmx-fix-msr-bitmaps-to-prevent-l2-from-accessing-l0-x2apic.patch +kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch +kvm-nvmx-eliminate-vmcs02-pool.patch +kvm-vmx-introduce-alloc_loaded_vmcs.patch +kvm-vmx-make-msr-bitmaps-per-vcpu.patch +kvm-x86-add-ibpb-support.patch +kvm-vmx-emulate-msr_ia32_arch_capabilities.patch +kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch +kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch +kvm-x86-remove-indirect-msr-op-calls-from-spec_ctrl.patch +x86-reorganize-smap-handling-in-user-space-accesses.patch +x86-fix-smap-in-32-bit-environments.patch +x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch +x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch +x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch +x86-bugs-kvm-support-the-combination-of-guest-and-host-ibrs.patch +x86-kvm-vmx-expose-spec_ctrl-bit-2-to-the-guest.patch +kvm-svm-move-spec-control-call-after-restore-of-gs.patch 
+x86-bugs-kvm-extend-speculation-control-for-virt_spec_ctrl.patch +x86-speculation-use-synthetic-bits-for-ibrs-ibpb-stibp.patch +kvm-svm-implement-virt_spec_ctrl-support-for-ssbd.patch +bpf-support-8-byte-metafield-access.patch +bpf-verifier-add-spi-variable-to-check_stack_write.patch +bpf-verifier-pass-instruction-index-to-check_mem_access-and-check_xadd.patch +bpf-prevent-memory-disambiguation-attack.patch +wil6210-missing-length-check-in-wmi_set_ie.patch +posix-timers-sanitize-overrun-handling.patch +mm-hugetlb.c-don-t-call-region_abort-if-region_chg-fails.patch +hugetlbfs-fix-offset-overflow-in-hugetlbfs-mmap.patch +hugetlbfs-check-for-pgoff-value-overflow.patch +hugetlbfs-fix-bug-in-pgoff-overflow-checking.patch +swiotlb-clean-up-reporting.patch +sr-pass-down-correctly-sized-scsi-sense-buffer.patch diff --git a/queue-4.4/sr-pass-down-correctly-sized-scsi-sense-buffer.patch b/queue-4.4/sr-pass-down-correctly-sized-scsi-sense-buffer.patch new file mode 100644 index 00000000000..2b58bbcd815 --- /dev/null +++ b/queue-4.4/sr-pass-down-correctly-sized-scsi-sense-buffer.patch @@ -0,0 +1,84 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Jens Axboe +Date: Mon, 21 May 2018 12:21:14 -0600 +Subject: sr: pass down correctly sized SCSI sense buffer + +From: Jens Axboe + +commit f7068114d45ec55996b9040e98111afa56e010fe upstream. + +We're casting the CDROM layer request_sense to the SCSI sense +buffer, but the former is 64 bytes and the latter is 96 bytes. +As we generally allocate these on the stack, we end up blowing +up the stack. + +Fix this by wrapping the scsi_execute() call with a properly +sized sense buffer, and copying back the bits for the CDROM +layer. + +Reported-by: Piotr Gabriel Kosinski +Reported-by: Daniel Shapira +Tested-by: Kees Cook +Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request") +Signed-off-by: Jens Axboe +[bwh: Despite what the "Fixes" field says, a buffer overrun was already + possible if the sense data was really > 64 bytes long. + Backported to 4.4: + - We always need to allocate a sense buffer in order to call + scsi_normalize_sense() + - Remove the existing conditional heap-allocation of the sense buffer] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/sr_ioctl.c | 21 +++++++-------------- + 1 file changed, 7 insertions(+), 14 deletions(-) + +--- a/drivers/scsi/sr_ioctl.c ++++ b/drivers/scsi/sr_ioctl.c +@@ -187,30 +187,25 @@ int sr_do_ioctl(Scsi_CD *cd, struct pack + struct scsi_device *SDev; + struct scsi_sense_hdr sshdr; + int result, err = 0, retries = 0; +- struct request_sense *sense = cgc->sense; ++ unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE]; + + SDev = cd->device; + +- if (!sense) { +- sense = kmalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); +- if (!sense) { +- err = -ENOMEM; +- goto out; +- } +- } +- + retry: + if (!scsi_block_when_processing_errors(SDev)) { + err = -ENODEV; + goto out; + } + +- memset(sense, 0, sizeof(*sense)); ++ memset(sense_buffer, 0, sizeof(sense_buffer)); + result = scsi_execute(SDev, cgc->cmd, cgc->data_direction, +- cgc->buffer, cgc->buflen, (char *)sense, ++ cgc->buffer, cgc->buflen, sense_buffer, + cgc->timeout, IOCTL_RETRIES, 0, NULL); + +- scsi_normalize_sense((char *)sense, sizeof(*sense), &sshdr); ++ scsi_normalize_sense(sense_buffer, sizeof(sense_buffer), &sshdr); ++ ++ if (cgc->sense) ++ memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense)); + + /* Minimal error checking. Ignore cases we know about, and report the rest. 
*/ + if (driver_byte(result) != 0) { +@@ -261,8 +256,6 @@ int sr_do_ioctl(Scsi_CD *cd, struct pack + + /* Wake up a process waiting for device */ + out: +- if (!cgc->sense) +- kfree(sense); + cgc->stat = err; + return err; + } diff --git a/queue-4.4/swiotlb-clean-up-reporting.patch b/queue-4.4/swiotlb-clean-up-reporting.patch new file mode 100644 index 00000000000..1c45fea238a --- /dev/null +++ b/queue-4.4/swiotlb-clean-up-reporting.patch @@ -0,0 +1,88 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Kees Cook +Date: Tue, 10 Jul 2018 16:22:22 -0700 +Subject: swiotlb: clean up reporting + +From: Kees Cook + +commit 7d63fb3af87aa67aa7d24466e792f9d7c57d8e79 upstream. + +This removes needless use of '%p', and refactors the printk calls to +use pr_*() helpers instead. + +Signed-off-by: Kees Cook +Reviewed-by: Konrad Rzeszutek Wilk +Signed-off-by: Christoph Hellwig +[bwh: Backported to 4.4: + - Adjust filename + - Remove "swiotlb: " prefix from an additional log message] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + lib/swiotlb.c | 20 +++++++++----------- + 1 file changed, 9 insertions(+), 11 deletions(-) + +--- a/lib/swiotlb.c ++++ b/lib/swiotlb.c +@@ -17,6 +17,8 @@ + * 08/12/11 beckyb Add highmem support + */ + ++#define pr_fmt(fmt) "software IO TLB: " fmt ++ + #include + #include + #include +@@ -143,20 +145,16 @@ static bool no_iotlb_memory; + void swiotlb_print_info(void) + { + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- unsigned char *vstart, *vend; + + if (no_iotlb_memory) { +- pr_warn("software IO TLB: No low mem\n"); ++ pr_warn("No low mem\n"); + return; + } + +- vstart = phys_to_virt(io_tlb_start); +- vend = phys_to_virt(io_tlb_end); +- +- printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", ++ pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", + (unsigned long long)io_tlb_start, + (unsigned long long)io_tlb_end, +- bytes >> 20, vstart, vend - 1); ++ bytes >> 20); + } + + int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +@@ -230,7 +228,7 @@ swiotlb_init(int verbose) + if (io_tlb_start) + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); +- pr_warn("Cannot allocate SWIOTLB buffer"); ++ pr_warn("Cannot allocate buffer"); + no_iotlb_memory = true; + } + +@@ -272,8 +270,8 @@ swiotlb_late_init_with_default_size(size + return -ENOMEM; + } + if (order != get_order(bytes)) { +- printk(KERN_WARNING "Warning: only able to allocate %ld MB " +- "for software IO TLB\n", (PAGE_SIZE << order) >> 20); ++ pr_warn("only able to allocate %ld MB\n", ++ (PAGE_SIZE << order) >> 20); + io_tlb_nslabs = SLABS_PER_PAGE << order; + } + rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); +@@ -680,7 +678,7 @@ swiotlb_alloc_coherent(struct device *hw + return ret; + + err_warn: +- pr_warn("swiotlb: coherent allocation failed for device %s size=%zu\n", ++ pr_warn("coherent allocation failed for device %s size=%zu\n", + dev_name(hwdev), size); + dump_stack(); + diff --git a/queue-4.4/wil6210-missing-length-check-in-wmi_set_ie.patch b/queue-4.4/wil6210-missing-length-check-in-wmi_set_ie.patch new file mode 100644 index 00000000000..980e8a76e58 --- /dev/null +++ b/queue-4.4/wil6210-missing-length-check-in-wmi_set_ie.patch @@ -0,0 +1,39 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Lior David +Date: Tue, 14 Nov 2017 15:25:39 +0200 +Subject: wil6210: missing length check in wmi_set_ie + +From: Lior David + +commit b5a8ffcae4103a9d823ea3aa3a761f65779fbe2a upstream. 
+ +Add a length check in wmi_set_ie to detect unsigned integer +overflow. + +Signed-off-by: Lior David +Signed-off-by: Maya Erez +Signed-off-by: Kalle Valo +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/wireless/ath/wil6210/wmi.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/net/wireless/ath/wil6210/wmi.c ++++ b/drivers/net/wireless/ath/wil6210/wmi.c +@@ -1035,8 +1035,14 @@ int wmi_set_ie(struct wil6210_priv *wil, + }; + int rc; + u16 len = sizeof(struct wmi_set_appie_cmd) + ie_len; +- struct wmi_set_appie_cmd *cmd = kzalloc(len, GFP_KERNEL); ++ struct wmi_set_appie_cmd *cmd; + ++ if (len < ie_len) { ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) { + rc = -ENOMEM; + goto out; diff --git a/queue-4.4/x86-bugs-kvm-extend-speculation-control-for-virt_spec_ctrl.patch b/queue-4.4/x86-bugs-kvm-extend-speculation-control-for-virt_spec_ctrl.patch new file mode 100644 index 00000000000..8f48978d98a --- /dev/null +++ b/queue-4.4/x86-bugs-kvm-extend-speculation-control-for-virt_spec_ctrl.patch @@ -0,0 +1,100 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Thomas Gleixner +Date: Wed, 9 May 2018 23:01:01 +0200 +Subject: x86/bugs, KVM: Extend speculation control for VIRT_SPEC_CTRL + +From: Thomas Gleixner + +commit ccbcd2674472a978b48c91c1fbfb66c0ff959f24 upstream. + +AMD is proposing a VIRT_SPEC_CTRL MSR to handle the Speculative Store +Bypass Disable via MSR_AMD64_LS_CFG so that guests do not have to care +about the bit position of the SSBD bit and thus facilitate migration. +Also, the sibling coordination on Family 17H CPUs can only be done on +the host. + +Extend x86_spec_ctrl_set_guest() and x86_spec_ctrl_restore_host() with an +extra argument for the VIRT_SPEC_CTRL MSR. + +Hand in 0 from VMX and in SVM add a new virt_spec_ctrl member to the CPU +data structure which is going to be used in later patches for the actual +implementation. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Konrad Rzeszutek Wilk +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: This was partly applied before; apply just the + missing bits] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 11 +++++++++-- + arch/x86/kvm/vmx.c | 5 +++-- + 2 files changed, 12 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -149,6 +149,12 @@ struct vcpu_svm { + } host; + + u64 spec_ctrl; ++ /* ++ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be ++ * translated into the appropriate L2_CFG bits on the host to ++ * perform speculative control. ++ */ ++ u64 virt_spec_ctrl; + + u32 *msrpm; + +@@ -1146,6 +1152,7 @@ static void svm_vcpu_reset(struct kvm_vc + u32 eax = 1; + + svm->spec_ctrl = 0; ++ svm->virt_spec_ctrl = 0; + + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | +@@ -3904,7 +3911,7 @@ static void svm_vcpu_run(struct kvm_vcpu + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. 
+ */ +- x86_spec_ctrl_set_guest(svm->spec_ctrl); ++ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + + asm volatile ( + "push %%" _ASM_BP "; \n\t" +@@ -4028,7 +4035,7 @@ static void svm_vcpu_run(struct kvm_vcpu + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + +- x86_spec_ctrl_restore_host(svm->spec_ctrl); ++ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + + reload_tss(vcpu); + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8658,9 +8658,10 @@ static void __noclone vmx_vcpu_run(struc + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ +- x86_spec_ctrl_set_guest(vmx->spec_ctrl); ++ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + + vmx->__launched = vmx->loaded_vmcs->launched; ++ + asm( + /* Store host registers */ + "push %%" _ASM_DX "; push %%" _ASM_BP ";" +@@ -8796,7 +8797,7 @@ static void __noclone vmx_vcpu_run(struc + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + +- x86_spec_ctrl_restore_host(vmx->spec_ctrl); ++ x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); diff --git a/queue-4.4/x86-bugs-kvm-support-the-combination-of-guest-and-host-ibrs.patch b/queue-4.4/x86-bugs-kvm-support-the-combination-of-guest-and-host-ibrs.patch new file mode 100644 index 00000000000..1131e28b355 --- /dev/null +++ b/queue-4.4/x86-bugs-kvm-support-the-combination-of-guest-and-host-ibrs.patch @@ -0,0 +1,81 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Konrad Rzeszutek Wilk +Date: Wed, 25 Apr 2018 22:04:19 -0400 +Subject: x86/bugs, KVM: Support the combination of guest and host IBRS + +From: Konrad Rzeszutek Wilk + +commit 5cf687548705412da47c9cec342fd952d71ed3d5 upstream. + +A guest may modify the SPEC_CTRL MSR from the value used by the +kernel. Since the kernel doesn't use IBRS, this means a value of zero is +what is needed in the host. + +But the 336996-Speculative-Execution-Side-Channel-Mitigations.pdf refers to +the other bits as reserved so the kernel should respect the boot time +SPEC_CTRL value and use that. + +This allows to deal with future extensions to the SPEC_CTRL interface if +any at all. + +Note: This uses wrmsrl() instead of native_wrmsl(). I does not make any +difference as paravirt will over-write the callq *0xfff.. with the wrmsrl +assembler code. + +Signed-off-by: Konrad Rzeszutek Wilk +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Ingo Molnar +[bwh: Backported to 4.4: This was partly applied before; apply just the + missing bits] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 6 ++---- + arch/x86/kvm/vmx.c | 6 ++---- + 2 files changed, 4 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3904,8 +3904,7 @@ static void svm_vcpu_run(struct kvm_vcpu + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. 
+ */ +- if (svm->spec_ctrl) +- native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ x86_spec_ctrl_set_guest(svm->spec_ctrl); + + asm volatile ( + "push %%" _ASM_BP "; \n\t" +@@ -4017,8 +4016,7 @@ static void svm_vcpu_run(struct kvm_vcpu + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + +- if (svm->spec_ctrl) +- native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ x86_spec_ctrl_restore_host(svm->spec_ctrl); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8658,8 +8658,7 @@ static void __noclone vmx_vcpu_run(struc + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ +- if (vmx->spec_ctrl) +- native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ x86_spec_ctrl_set_guest(vmx->spec_ctrl); + + vmx->__launched = vmx->loaded_vmcs->launched; + asm( +@@ -8797,8 +8796,7 @@ static void __noclone vmx_vcpu_run(struc + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + +- if (vmx->spec_ctrl) +- native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ x86_spec_ctrl_restore_host(vmx->spec_ctrl); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); diff --git a/queue-4.4/x86-fix-smap-in-32-bit-environments.patch b/queue-4.4/x86-fix-smap-in-32-bit-environments.patch new file mode 100644 index 00000000000..bf657d4f801 --- /dev/null +++ b/queue-4.4/x86-fix-smap-in-32-bit-environments.patch @@ -0,0 +1,129 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Linus Torvalds +Date: Tue, 23 Feb 2016 14:58:52 -0800 +Subject: x86: fix SMAP in 32-bit environments + +From: Linus Torvalds + +commit de9e478b9d49f3a0214310d921450cf5bb4a21e6 upstream. + +In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space +accesses") I changed how the stac/clac instructions were generated +around the user space accesses, which then made it possible to do +batched accesses efficiently for user string copies etc. + +However, in doing so, I completely spaced out, and didn't even think +about the 32-bit case. And nobody really even seemed to notice, because +SMAP doesn't even exist until modern Skylake processors, and you'd have +to be crazy to run 32-bit kernels on a modern CPU. + +Which brings us to Andy Lutomirski. + +He actually tested the 32-bit kernel on new hardware, and noticed that +it doesn't work. My bad. The trivial fix is to add the required +uaccess begin/end markers around the raw accesses in . + +I feel a bit bad about this patch, just because that header file really +should be cleaned up to avoid all the duplicated code in it, and this +commit just expands on the problem. But this just fixes the bug without +any bigger cleanup surgery. 
+ +Reported-and-tested-by: Andy Lutomirski +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +--- a/arch/x86/include/asm/uaccess_32.h ++++ b/arch/x86/include/asm/uaccess_32.h +@@ -48,20 +48,28 @@ __copy_to_user_inatomic(void __user *to, + + switch (n) { + case 1: ++ __uaccess_begin(); + __put_user_size(*(u8 *)from, (u8 __user *)to, + 1, ret, 1); ++ __uaccess_end(); + return ret; + case 2: ++ __uaccess_begin(); + __put_user_size(*(u16 *)from, (u16 __user *)to, + 2, ret, 2); ++ __uaccess_end(); + return ret; + case 4: ++ __uaccess_begin(); + __put_user_size(*(u32 *)from, (u32 __user *)to, + 4, ret, 4); ++ __uaccess_end(); + return ret; + case 8: ++ __uaccess_begin(); + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); ++ __uaccess_end(); + return ret; + } + } +@@ -103,13 +111,19 @@ __copy_from_user_inatomic(void *to, cons + + switch (n) { + case 1: ++ __uaccess_begin(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); ++ __uaccess_end(); + return ret; + case 2: ++ __uaccess_begin(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); ++ __uaccess_end(); + return ret; + case 4: ++ __uaccess_begin(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); ++ __uaccess_end(); + return ret; + } + } +@@ -148,13 +162,19 @@ __copy_from_user(void *to, const void __ + + switch (n) { + case 1: ++ __uaccess_begin(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); ++ __uaccess_end(); + return ret; + case 2: ++ __uaccess_begin(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); ++ __uaccess_end(); + return ret; + case 4: ++ __uaccess_begin(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); ++ __uaccess_end(); + return ret; + } + } +@@ -170,13 +190,19 @@ static __always_inline unsigned long __c + + switch (n) { + case 1: ++ __uaccess_begin(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); ++ __uaccess_end(); + return ret; + case 2: ++ __uaccess_begin(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); ++ __uaccess_end(); + return ret; + case 4: ++ __uaccess_begin(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); ++ __uaccess_end(); + return ret; + } + } diff --git a/queue-4.4/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch b/queue-4.4/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch new file mode 100644 index 00000000000..2873299322b --- /dev/null +++ b/queue-4.4/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch @@ -0,0 +1,80 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Dan Williams +Date: Mon, 29 Jan 2018 17:02:39 -0800 +Subject: x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec + +From: Dan Williams + +commit b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd upstream. + +For __get_user() paths, do not allow the kernel to speculate on the value +of a user controlled pointer. In addition to the 'stac' instruction for +Supervisor Mode Access Protection (SMAP), a barrier_nospec() causes the +access_ok() result to resolve in the pipeline before the CPU might take any +speculative action on the pointer value. Given the cost of 'stac' the +speculation barrier is placed after 'stac' to hopefully overlap the cost of +disabling SMAP with the cost of flushing the instruction pipeline. + +Since __get_user is a major kernel interface that deals with user +controlled pointers, the __uaccess_begin_nospec() mechanism will prevent +speculative execution past an access_ok() permission check. 
While +speculative execution past access_ok() is not enough to lead to a kernel +memory leak, it is a necessary precondition. + +To be clear, __uaccess_begin_nospec() is addressing a class of potential +problems near __get_user() usages. + +Note, that while the barrier_nospec() in __uaccess_begin_nospec() is used +to protect __get_user(), pointer masking similar to array_index_nospec() +will be used for get_user() since it incorporates a bounds check near the +usage. + +uaccess_try_nospec provides the same mechanism for get_user_try. + +No functional changes. + +Suggested-by: Linus Torvalds +Suggested-by: Andi Kleen +Suggested-by: Ingo Molnar +Signed-off-by: Dan Williams +Signed-off-by: Thomas Gleixner +Cc: linux-arch@vger.kernel.org +Cc: Tom Lendacky +Cc: Kees Cook +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727415922.33451.5796614273104346583.stgit@dwillia2-desk3.amr.corp.intel.com +[bwh: Backported to 4.4: use current_thread_info()] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/uaccess.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -146,6 +146,11 @@ extern int __get_user_bad(void); + + #define __uaccess_begin() stac() + #define __uaccess_end() clac() ++#define __uaccess_begin_nospec() \ ++({ \ ++ stac(); \ ++ barrier_nospec(); \ ++}) + + /* + * This is a type: either unsigned long, if the argument fits into +@@ -473,6 +478,10 @@ struct __large_struct { unsigned long bu + __uaccess_begin(); \ + barrier(); + ++#define uaccess_try_nospec do { \ ++ current_thread_info()->uaccess_err = 0; \ ++ __uaccess_begin_nospec(); \ ++ + #define uaccess_catch(err) \ + __uaccess_end(); \ + (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ diff --git a/queue-4.4/x86-kvm-vmx-expose-spec_ctrl-bit-2-to-the-guest.patch b/queue-4.4/x86-kvm-vmx-expose-spec_ctrl-bit-2-to-the-guest.patch new file mode 100644 index 00000000000..55f2e3e142c --- /dev/null +++ b/queue-4.4/x86-kvm-vmx-expose-spec_ctrl-bit-2-to-the-guest.patch @@ -0,0 +1,111 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Konrad Rzeszutek Wilk +Date: Wed, 25 Apr 2018 22:04:25 -0400 +Subject: x86/KVM/VMX: Expose SPEC_CTRL Bit(2) to the guest + +From: Konrad Rzeszutek Wilk + +commit da39556f66f5cfe8f9c989206974f1cb16ca5d7c upstream. + +Expose the CPUID.7.EDX[31] bit to the guest, and also guard against various +combinations of SPEC_CTRL MSR values. 
+ +The handling of the MSR (to take into account the host value of SPEC_CTRL +Bit(2)) is taken care of in patch: + + KVM/SVM/VMX/x86/spectre_v2: Support the combination of guest and host IBRS + +Signed-off-by: Konrad Rzeszutek Wilk +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar + +[dwmw2: Handle 4.9 guest CPUID differences, rename + guest_cpu_has_ibrs() → guest_cpu_has_spec_ctrl()] +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: Update feature bit name] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 2 +- + arch/x86/kvm/cpuid.h | 4 ++-- + arch/x86/kvm/svm.c | 4 ++-- + arch/x86/kvm/vmx.c | 6 +++--- + 4 files changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -364,7 +364,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- F(SPEC_CTRL) | F(ARCH_CAPABILITIES); ++ F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -170,7 +170,7 @@ static inline bool guest_cpuid_has_ibpb( + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + +-static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) ++static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu) + { + struct kvm_cpuid_entry2 *best; + +@@ -178,7 +178,7 @@ static inline bool guest_cpuid_has_ibrs( + if (best && (best->ebx & bit(X86_FEATURE_IBRS))) + return true; + best = kvm_find_cpuid_entry(vcpu, 7, 0); +- return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++ return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD))); + } + + static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3090,7 +3090,7 @@ static int svm_get_msr(struct kvm_vcpu * + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && +- !guest_cpuid_has_ibrs(vcpu)) ++ !guest_cpuid_has_spec_ctrl(vcpu)) + return 1; + + msr_info->data = svm->spec_ctrl; +@@ -3171,7 +3171,7 @@ static int svm_set_msr(struct kvm_vcpu * + break; + case MSR_IA32_SPEC_CTRL: + if (!msr->host_initiated && +- !guest_cpuid_has_ibrs(vcpu)) ++ !guest_cpuid_has_spec_ctrl(vcpu)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2861,7 +2861,7 @@ static int vmx_get_msr(struct kvm_vcpu * + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && +- !guest_cpuid_has_ibrs(vcpu)) ++ !guest_cpuid_has_spec_ctrl(vcpu)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; +@@ -2973,11 +2973,11 @@ static int vmx_set_msr(struct kvm_vcpu * + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && +- !guest_cpuid_has_ibrs(vcpu)) ++ !guest_cpuid_has_spec_ctrl(vcpu)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ +- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + return 1; + + vmx->spec_ctrl = data; diff --git a/queue-4.4/x86-reorganize-smap-handling-in-user-space-accesses.patch b/queue-4.4/x86-reorganize-smap-handling-in-user-space-accesses.patch new file mode 100644 index 00000000000..7322da6ac94 --- /dev/null +++ b/queue-4.4/x86-reorganize-smap-handling-in-user-space-accesses.patch @@ -0,0 +1,395 @@ +From 
foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Linus Torvalds +Date: Thu, 17 Dec 2015 09:45:09 -0800 +Subject: x86: reorganize SMAP handling in user space accesses + +From: Linus Torvalds + +commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8 upstream. + +This reorganizes how we do the stac/clac instructions in the user access +code. Instead of adding the instructions directly to the same inline +asm that does the actual user level access and exception handling, add +them at a higher level. + +This is mainly preparation for the next step, where we will expose an +interface to allow users to mark several accesses together as being user +space accesses, but it does already clean up some code: + + - the inlined trivial cases of copy_in_user() now do stac/clac just + once over the accesses: they used to do one pair around the user + space read, and another pair around the write-back. + + - the {get,put}_user_ex() macros that are used with the catch/try + handling don't do any stac/clac at all, because that happens in the + try/catch surrounding them. + +Other than those two cleanups that happened naturally from the +re-organization, this should not make any difference. Yet. + +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/uaccess.h | 53 ++++++++++++++------- + arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++++++------------ + 2 files changed, 101 insertions(+), 46 deletions(-) + +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -144,6 +144,9 @@ extern int __get_user_4(void); + extern int __get_user_8(void); + extern int __get_user_bad(void); + ++#define __uaccess_begin() stac() ++#define __uaccess_end() clac() ++ + /* + * This is a type: either unsigned long, if the argument fits into + * that type, or otherwise unsigned long long. +@@ -203,10 +206,10 @@ __typeof__(__builtin_choose_expr(sizeof( + + #ifdef CONFIG_X86_32 + #define __put_user_asm_u64(x, addr, err, errret) \ +- asm volatile(ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1: movl %%eax,0(%2)\n" \ + "2: movl %%edx,4(%2)\n" \ +- "3: " ASM_CLAC "\n" \ ++ "3:" \ + ".section .fixup,\"ax\"\n" \ + "4: movl %3,%0\n" \ + " jmp 3b\n" \ +@@ -217,10 +220,10 @@ __typeof__(__builtin_choose_expr(sizeof( + : "A" (x), "r" (addr), "i" (errret), "0" (err)) + + #define __put_user_asm_ex_u64(x, addr) \ +- asm volatile(ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1: movl %%eax,0(%1)\n" \ + "2: movl %%edx,4(%1)\n" \ +- "3: " ASM_CLAC "\n" \ ++ "3:" \ + _ASM_EXTABLE_EX(1b, 2b) \ + _ASM_EXTABLE_EX(2b, 3b) \ + : : "A" (x), "r" (addr)) +@@ -314,6 +317,10 @@ do { \ + } \ + } while (0) + ++/* ++ * This doesn't do __uaccess_begin/end - the exception handling ++ * around it must do that. ++ */ + #define __put_user_size_ex(x, ptr, size) \ + do { \ + __chk_user_ptr(ptr); \ +@@ -368,9 +375,9 @@ do { \ + } while (0) + + #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ +- asm volatile(ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1: mov"itype" %2,%"rtype"1\n" \ +- "2: " ASM_CLAC "\n" \ ++ "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " xor"itype" %"rtype"1,%"rtype"1\n" \ +@@ -380,6 +387,10 @@ do { \ + : "=r" (err), ltype(x) \ + : "m" (__m(addr)), "i" (errret), "0" (err)) + ++/* ++ * This doesn't do __uaccess_begin/end - the exception handling ++ * around it must do that. 
++ */ + #define __get_user_size_ex(x, ptr, size) \ + do { \ + __chk_user_ptr(ptr); \ +@@ -410,7 +421,9 @@ do { \ + #define __put_user_nocheck(x, ptr, size) \ + ({ \ + int __pu_err; \ ++ __uaccess_begin(); \ + __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ ++ __uaccess_end(); \ + __builtin_expect(__pu_err, 0); \ + }) + +@@ -418,7 +431,9 @@ do { \ + ({ \ + int __gu_err; \ + unsigned long __gu_val; \ ++ __uaccess_begin(); \ + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ ++ __uaccess_end(); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ + }) +@@ -433,9 +448,9 @@ struct __large_struct { unsigned long bu + * aliasing issues. + */ + #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ +- asm volatile(ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1: mov"itype" %"rtype"1,%2\n" \ +- "2: " ASM_CLAC "\n" \ ++ "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " jmp 2b\n" \ +@@ -455,11 +470,11 @@ struct __large_struct { unsigned long bu + */ + #define uaccess_try do { \ + current_thread_info()->uaccess_err = 0; \ +- stac(); \ ++ __uaccess_begin(); \ + barrier(); + + #define uaccess_catch(err) \ +- clac(); \ ++ __uaccess_end(); \ + (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ + } while (0) + +@@ -557,12 +572,13 @@ extern void __cmpxchg_wrong_size(void) + __typeof__(ptr) __uval = (uval); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ ++ __uaccess_begin(); \ + switch (size) { \ + case 1: \ + { \ +- asm volatile("\t" ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ +- "2:\t" ASM_CLAC "\n" \ ++ "2:\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ +@@ -576,9 +592,9 @@ extern void __cmpxchg_wrong_size(void) + } \ + case 2: \ + { \ +- asm volatile("\t" ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ +- "2:\t" ASM_CLAC "\n" \ ++ "2:\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ +@@ -592,9 +608,9 @@ extern void __cmpxchg_wrong_size(void) + } \ + case 4: \ + { \ +- asm volatile("\t" ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ +- "2:\t" ASM_CLAC "\n" \ ++ "2:\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ +@@ -611,9 +627,9 @@ extern void __cmpxchg_wrong_size(void) + if (!IS_ENABLED(CONFIG_X86_64)) \ + __cmpxchg_wrong_size(); \ + \ +- asm volatile("\t" ASM_STAC "\n" \ ++ asm volatile("\n" \ + "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ +- "2:\t" ASM_CLAC "\n" \ ++ "2:\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ +@@ -628,6 +644,7 @@ extern void __cmpxchg_wrong_size(void) + default: \ + __cmpxchg_wrong_size(); \ + } \ ++ __uaccess_end(); \ + *__uval = __old; \ + __ret; \ + }) +--- a/arch/x86/include/asm/uaccess_64.h ++++ b/arch/x86/include/asm/uaccess_64.h +@@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, + if (!__builtin_constant_p(size)) + return copy_user_generic(dst, (__force void *)src, size); + switch (size) { +- case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, ++ case 1: ++ __uaccess_begin(); ++ __get_user_asm(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); ++ __uaccess_end(); + return ret; +- case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, ++ case 2: ++ __uaccess_begin(); ++ __get_user_asm(*(u16 *)dst, (u16 __user *)src, + ret, "w", "w", "=r", 2); ++ __uaccess_end(); + return ret; +- case 4:__get_user_asm(*(u32 
*)dst, (u32 __user *)src, ++ case 4: ++ __uaccess_begin(); ++ __get_user_asm(*(u32 *)dst, (u32 __user *)src, + ret, "l", "k", "=r", 4); ++ __uaccess_end(); + return ret; +- case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, ++ case 8: ++ __uaccess_begin(); ++ __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 8); ++ __uaccess_end(); + return ret; + case 10: ++ __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 10); +- if (unlikely(ret)) +- return ret; +- __get_user_asm(*(u16 *)(8 + (char *)dst), +- (u16 __user *)(8 + (char __user *)src), +- ret, "w", "w", "=r", 2); ++ if (likely(!ret)) ++ __get_user_asm(*(u16 *)(8 + (char *)dst), ++ (u16 __user *)(8 + (char __user *)src), ++ ret, "w", "w", "=r", 2); ++ __uaccess_end(); + return ret; + case 16: ++ __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); +- if (unlikely(ret)) +- return ret; +- __get_user_asm(*(u64 *)(8 + (char *)dst), +- (u64 __user *)(8 + (char __user *)src), +- ret, "q", "", "=r", 8); ++ if (likely(!ret)) ++ __get_user_asm(*(u64 *)(8 + (char *)dst), ++ (u64 __user *)(8 + (char __user *)src), ++ ret, "q", "", "=r", 8); ++ __uaccess_end(); + return ret; + default: + return copy_user_generic(dst, (__force void *)src, size); +@@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user * + if (!__builtin_constant_p(size)) + return copy_user_generic((__force void *)dst, src, size); + switch (size) { +- case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, ++ case 1: ++ __uaccess_begin(); ++ __put_user_asm(*(u8 *)src, (u8 __user *)dst, + ret, "b", "b", "iq", 1); ++ __uaccess_end(); + return ret; +- case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, ++ case 2: ++ __uaccess_begin(); ++ __put_user_asm(*(u16 *)src, (u16 __user *)dst, + ret, "w", "w", "ir", 2); ++ __uaccess_end(); + return ret; +- case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, ++ case 4: ++ __uaccess_begin(); ++ __put_user_asm(*(u32 *)src, (u32 __user *)dst, + ret, "l", "k", "ir", 4); ++ __uaccess_end(); + return ret; +- case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, ++ case 8: ++ __uaccess_begin(); ++ __put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "er", 8); ++ __uaccess_end(); + return ret; + case 10: ++ __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "er", 10); +- if (unlikely(ret)) +- return ret; +- asm("":::"memory"); +- __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, +- ret, "w", "w", "ir", 2); ++ if (likely(!ret)) { ++ asm("":::"memory"); ++ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, ++ ret, "w", "w", "ir", 2); ++ } ++ __uaccess_end(); + return ret; + case 16: ++ __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, + ret, "q", "", "er", 16); +- if (unlikely(ret)) +- return ret; +- asm("":::"memory"); +- __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, +- ret, "q", "", "er", 8); ++ if (likely(!ret)) { ++ asm("":::"memory"); ++ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, ++ ret, "q", "", "er", 8); ++ } ++ __uaccess_end(); + return ret; + default: + return copy_user_generic((__force void *)dst, src, size); +@@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, con + switch (size) { + case 1: { + u8 tmp; ++ __uaccess_begin(); + __get_user_asm(tmp, (u8 __user *)src, + ret, "b", "b", "=q", 1); + if (likely(!ret)) + __put_user_asm(tmp, (u8 __user *)dst, + ret, "b", "b", "iq", 1); ++ __uaccess_end(); + return ret; + } + case 2: { + u16 tmp; ++ 
__uaccess_begin(); + __get_user_asm(tmp, (u16 __user *)src, + ret, "w", "w", "=r", 2); + if (likely(!ret)) + __put_user_asm(tmp, (u16 __user *)dst, + ret, "w", "w", "ir", 2); ++ __uaccess_end(); + return ret; + } + + case 4: { + u32 tmp; ++ __uaccess_begin(); + __get_user_asm(tmp, (u32 __user *)src, + ret, "l", "k", "=r", 4); + if (likely(!ret)) + __put_user_asm(tmp, (u32 __user *)dst, + ret, "l", "k", "ir", 4); ++ __uaccess_end(); + return ret; + } + case 8: { + u64 tmp; ++ __uaccess_begin(); + __get_user_asm(tmp, (u64 __user *)src, + ret, "q", "", "=r", 8); + if (likely(!ret)) + __put_user_asm(tmp, (u64 __user *)dst, + ret, "q", "", "er", 8); ++ __uaccess_end(); + return ret; + } + default: diff --git a/queue-4.4/x86-speculation-use-synthetic-bits-for-ibrs-ibpb-stibp.patch b/queue-4.4/x86-speculation-use-synthetic-bits-for-ibrs-ibpb-stibp.patch new file mode 100644 index 00000000000..f9a56e0bc50 --- /dev/null +++ b/queue-4.4/x86-speculation-use-synthetic-bits-for-ibrs-ibpb-stibp.patch @@ -0,0 +1,81 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Borislav Petkov +Date: Wed, 2 May 2018 18:15:14 +0200 +Subject: x86/speculation: Use synthetic bits for IBRS/IBPB/STIBP + +From: Borislav Petkov + +commit e7c587da125291db39ddf1f49b18e5970adbac17 upstream. + +Intel and AMD have different CPUID bits hence for those use synthetic bits +which get set on the respective vendor's in init_speculation_control(). So +that debacles like what the commit message of + + c65732e4f721 ("x86/cpu: Restore CPUID_8000_0008_EBX reload") + +talks about don't happen anymore. + +Signed-off-by: Borislav Petkov +Signed-off-by: Thomas Gleixner +Reviewed-by: Konrad Rzeszutek Wilk +Tested-by: Jörg Otte +Cc: Linus Torvalds +Cc: "Kirill A. Shutemov" +Link: https://lkml.kernel.org/r/20180504161815.GG9257@pd.tnic +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.4: This was partly applied before; apply just the + missing bits] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 10 +++++----- + arch/x86/kvm/cpuid.h | 4 ++-- + 2 files changed, 7 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -343,7 +343,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(IBPB) | F(IBRS); ++ F(AMD_IBPB) | F(AMD_IBRS); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = +@@ -596,10 +596,10 @@ static inline int __do_cpuid_ent(struct + entry->eax = g_phys_as | (virt_as << 8); + entry->edx = 0; + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ +- if (boot_cpu_has(X86_FEATURE_IBPB)) +- entry->ebx |= F(IBPB); +- if (boot_cpu_has(X86_FEATURE_IBRS)) +- entry->ebx |= F(IBRS); ++ if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) ++ entry->ebx |= F(AMD_IBPB); ++ if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) ++ entry->ebx |= F(AMD_IBRS); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -164,7 +164,7 @@ static inline bool guest_cpuid_has_ibpb( + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); +- if (best && (best->ebx & bit(X86_FEATURE_IBPB))) ++ if (best && (best->ebx & bit(X86_FEATURE_AMD_IBPB))) + return true; + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); +@@ -175,7 +175,7 @@ static inline bool 
guest_cpuid_has_spec_ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); +- if (best && (best->ebx & bit(X86_FEATURE_IBRS))) ++ if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS))) + return true; + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD))); diff --git a/queue-4.4/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch b/queue-4.4/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch new file mode 100644 index 00000000000..260f6295233 --- /dev/null +++ b/queue-4.4/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch @@ -0,0 +1,304 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Dan Williams +Date: Mon, 29 Jan 2018 17:02:49 -0800 +Subject: x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec + +From: Dan Williams + +commit 304ec1b050310548db33063e567123fae8fd0301 upstream. + +Quoting Linus: + + I do think that it would be a good idea to very expressly document + the fact that it's not that the user access itself is unsafe. I do + agree that things like "get_user()" want to be protected, but not + because of any direct bugs or problems with get_user() and friends, + but simply because get_user() is an excellent source of a pointer + that is obviously controlled from a potentially attacking user + space. So it's a prime candidate for then finding _subsequent_ + accesses that can then be used to perturb the cache. + +__uaccess_begin_nospec() covers __get_user() and copy_from_iter() where the +limit check is far away from the user pointer de-reference. In those cases +a barrier_nospec() prevents speculation with a potential pointer to +privileged memory. uaccess_try_nospec covers get_user_try. 
+ +Suggested-by: Linus Torvalds +Suggested-by: Andi Kleen +Signed-off-by: Dan Williams +Signed-off-by: Thomas Gleixner +Cc: linux-arch@vger.kernel.org +Cc: Kees Cook +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727416953.33451.10508284228526170604.stgit@dwillia2-desk3.amr.corp.intel.com +[bwh: Backported to 4.4: + - Convert several more functions to use __uaccess_begin_nospec(), that + are just wrappers in mainline + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/uaccess.h | 6 +++--- + arch/x86/include/asm/uaccess_32.h | 26 +++++++++++++------------- + arch/x86/include/asm/uaccess_64.h | 20 ++++++++++---------- + arch/x86/lib/usercopy_32.c | 10 +++++----- + 4 files changed, 31 insertions(+), 31 deletions(-) + +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -436,7 +436,7 @@ do { \ + ({ \ + int __gu_err; \ + unsigned long __gu_val; \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ +@@ -546,7 +546,7 @@ struct __large_struct { unsigned long bu + * get_user_ex(...); + * } get_user_catch(err) + */ +-#define get_user_try uaccess_try ++#define get_user_try uaccess_try_nospec + #define get_user_catch(err) uaccess_catch(err) + + #define get_user_ex(x, ptr) do { \ +@@ -581,7 +581,7 @@ extern void __cmpxchg_wrong_size(void) + __typeof__(ptr) __uval = (uval); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + switch (size) { \ + case 1: \ + { \ +--- a/arch/x86/include/asm/uaccess_32.h ++++ b/arch/x86/include/asm/uaccess_32.h +@@ -48,25 +48,25 @@ __copy_to_user_inatomic(void __user *to, + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __put_user_size(*(u8 *)from, (u8 __user *)to, + 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __put_user_size(*(u16 *)from, (u16 __user *)to, + 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __put_user_size(*(u32 *)from, (u32 __user *)to, + 4, ret, 4); + __uaccess_end(); + return ret; + case 8: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + __uaccess_end(); +@@ -111,17 +111,17 @@ __copy_from_user_inatomic(void *to, cons + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); + return ret; +@@ -162,17 +162,17 @@ __copy_from_user(void *to, const void __ + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); + 
return ret; +@@ -190,17 +190,17 @@ static __always_inline unsigned long __c + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); + return ret; +--- a/arch/x86/include/asm/uaccess_64.h ++++ b/arch/x86/include/asm/uaccess_64.h +@@ -57,31 +57,31 @@ int __copy_from_user_nocheck(void *dst, + return copy_user_generic(dst, (__force void *)src, size); + switch (size) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, + ret, "w", "w", "=r", 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, + ret, "l", "k", "=r", 4); + __uaccess_end(); + return ret; + case 8: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 8); + __uaccess_end(); + return ret; + case 10: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 10); + if (likely(!ret)) +@@ -91,7 +91,7 @@ int __copy_from_user_nocheck(void *dst, + __uaccess_end(); + return ret; + case 16: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); + if (likely(!ret)) +@@ -190,7 +190,7 @@ int __copy_in_user(void __user *dst, con + switch (size) { + case 1: { + u8 tmp; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(tmp, (u8 __user *)src, + ret, "b", "b", "=q", 1); + if (likely(!ret)) +@@ -201,7 +201,7 @@ int __copy_in_user(void __user *dst, con + } + case 2: { + u16 tmp; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(tmp, (u16 __user *)src, + ret, "w", "w", "=r", 2); + if (likely(!ret)) +@@ -213,7 +213,7 @@ int __copy_in_user(void __user *dst, con + + case 4: { + u32 tmp; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(tmp, (u32 __user *)src, + ret, "l", "k", "=r", 4); + if (likely(!ret)) +@@ -224,7 +224,7 @@ int __copy_in_user(void __user *dst, con + } + case 8: { + u64 tmp; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(tmp, (u64 __user *)src, + ret, "q", "", "=r", 8); + if (likely(!ret)) +--- a/arch/x86/lib/usercopy_32.c ++++ b/arch/x86/lib/usercopy_32.c +@@ -570,7 +570,7 @@ do { \ + unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else +@@ -583,7 +583,7 @@ EXPORT_SYMBOL(__copy_to_user_ll); + unsigned long __copy_from_user_ll(void *to, const void __user *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + if (movsl_is_ok(to, from, n)) + __copy_user_zeroing(to, from, n); + else +@@ -596,7 +596,7 @@ EXPORT_SYMBOL(__copy_from_user_ll); + unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + if 
(movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else +@@ -610,7 +610,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero + unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && cpu_has_xmm2) + n = __copy_user_zeroing_intel_nocache(to, from, n); +@@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocach + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && cpu_has_xmm2) + n = __copy_user_intel_nocache(to, from, n); diff --git a/queue-4.4/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch b/queue-4.4/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch new file mode 100644 index 00000000000..99d32b97819 --- /dev/null +++ b/queue-4.4/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch @@ -0,0 +1,122 @@ +From foo@baz Thu Dec 13 20:11:30 CET 2018 +From: Dan Williams +Date: Mon, 29 Jan 2018 17:02:44 -0800 +Subject: x86/usercopy: Replace open coded stac/clac with __uaccess_{begin, end} + +From: Dan Williams + +commit b5c4ae4f35325d520b230bab6eb3310613b72ac1 upstream. + +In preparation for converting some __uaccess_begin() instances to +__uacess_begin_nospec(), make sure all 'from user' uaccess paths are +using the _begin(), _end() helpers rather than open-coded stac() and +clac(). + +No functional changes. + +Suggested-by: Ingo Molnar +Signed-off-by: Dan Williams +Signed-off-by: Thomas Gleixner +Cc: linux-arch@vger.kernel.org +Cc: Tom Lendacky +Cc: Kees Cook +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727416438.33451.17309465232057176966.stgit@dwillia2-desk3.amr.corp.intel.com +[bwh: Backported to 4.4: + - Convert several more functions to use __uaccess_begin_nospec(), that + are just wrappers in mainline + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/lib/usercopy_32.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/arch/x86/lib/usercopy_32.c ++++ b/arch/x86/lib/usercopy_32.c +@@ -570,12 +570,12 @@ do { \ + unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else + n = __copy_user_intel(to, from, n); +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_to_user_ll); +@@ -583,12 +583,12 @@ EXPORT_SYMBOL(__copy_to_user_ll); + unsigned long __copy_from_user_ll(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + if (movsl_is_ok(to, from, n)) + __copy_user_zeroing(to, from, n); + else + n = __copy_user_zeroing_intel(to, from, n); +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll); +@@ -596,13 +596,13 @@ EXPORT_SYMBOL(__copy_from_user_ll); + unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else + n = __copy_user_intel((void __user *)to, + (const void *)from, n); +- clac(); ++ __uaccess_end(); + return n; + } + 
EXPORT_SYMBOL(__copy_from_user_ll_nozero); +@@ -610,7 +610,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero + unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && cpu_has_xmm2) + n = __copy_user_zeroing_intel_nocache(to, from, n); +@@ -619,7 +619,7 @@ unsigned long __copy_from_user_ll_nocach + #else + __copy_user_zeroing(to, from, n); + #endif +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nocache); +@@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocach + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && cpu_has_xmm2) + n = __copy_user_intel_nocache(to, from, n); +@@ -636,7 +636,7 @@ unsigned long __copy_from_user_ll_nocach + #else + __copy_user(to, from, n); + #endif +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);