--- /dev/null
+From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001
+From: Wanpeng Li <wanpeng.li@hotmail.com>
+Date: Thu, 5 Jan 2017 17:39:42 -0800
+Subject: KVM: eventfd: fix NULL deref irqbypass consumer
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Wanpeng Li <wanpeng.li@hotmail.com>
+
+commit 4f3dbdf47e150016aacd734e663347fcaa768303 upstream.
+
+Reported by syzkaller:
+
+ BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
+ IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass]
+ PGD 0
+
+ Oops: 0002 [#1] SMP
+ CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1
+ Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm]
+ task: ffff9bbe0dfbb900 task.stack: ffffb61802014000
+ RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass]
+ Call Trace:
+ irqfd_shutdown+0x66/0xa0 [kvm]
+ process_one_work+0x16b/0x480
+ worker_thread+0x4b/0x500
+ kthread+0x101/0x140
+ ? process_one_work+0x480/0x480
+ ? kthread_create_on_node+0x60/0x60
+ ret_from_fork+0x25/0x30
+ RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20
+ CR2: 0000000000000008
+
+The syzkaller folks reported a NULL pointer dereference caused by
+unregistering a consumer whose registration had failed. syzkaller
+occasionally creates two VMs that share the same eventfd, so the second
+VM fails to register its irqbypass consumer. The irqfd is then marked
+inactive and a workqueue work is queued to shut down the irqfd and
+unregister the irqbypass consumer once the eventfd is closed. However,
+the second consumer was initialized even though its registration failed,
+so its token (the same as the first VM's) is used to unregister the
+consumer from the workqueue: the first VM's consumer is found and
+unregistered instead, and the NULL dereference occurs while deleting
+that consumer from the consumers list.
+
+This patch fixes it by making irq_bypass_register/unregister_consumer()
+look for the consumer entry based on the consumer pointer itself instead
+of on token matching.
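+
+As a rough illustration of the difference (a simplified sketch, not the
+kernel's actual list handling), matching by token lets a consumer that
+never made it onto the list remove another VM's entry, while matching on
+the consumer pointer itself cannot:
+
+	struct consumer {
+		struct consumer *next;
+		void *token;		/* eventfd context in the real code */
+	};
+
+	static struct consumer *consumers;
+
+	/* old: a duplicate token still "finds" the first VM's entry */
+	static struct consumer *find_by_token(void *token)
+	{
+		struct consumer *c;
+
+		for (c = consumers; c; c = c->next)
+			if (c->token == token)
+				return c;
+		return NULL;
+	}
+
+	/* new: an entry that was never added to the list never matches */
+	static struct consumer *find_by_pointer(struct consumer *consumer)
+	{
+		struct consumer *c;
+
+		for (c = consumers; c; c = c->next)
+			if (c == consumer)
+				return c;
+		return NULL;
+	}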
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Suggested-by: Alex Williamson <alex.williamson@redhat.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ virt/lib/irqbypass.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/virt/lib/irqbypass.c
++++ b/virt/lib/irqbypass.c
+@@ -188,7 +188,7 @@ int irq_bypass_register_consumer(struct
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &consumers, node) {
+- if (tmp->token == consumer->token) {
++ if (tmp->token == consumer->token || tmp == consumer) {
+ mutex_unlock(&lock);
+ module_put(THIS_MODULE);
+ return -EBUSY;
+@@ -235,7 +235,7 @@ void irq_bypass_unregister_consumer(stru
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &consumers, node) {
+- if (tmp->token != consumer->token)
++ if (tmp != consumer)
+ continue;
+
+ list_for_each_entry(producer, &producers, node) {
--- /dev/null
+From 283c95d0e3891b64087706b344a4b545d04a6e62 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
+Date: Wed, 9 Nov 2016 19:07:06 +0100
+Subject: KVM: x86: emulate FXSAVE and FXRSTOR
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Radim Krčmář <rkrcmar@redhat.com>
+
+commit 283c95d0e3891b64087706b344a4b545d04a6e62 upstream.
+
+Internal errors were reported on 16 bit fxsave and fxrstor with ipxe.
+Old Intels don't have unrestricted_guest, so we have to emulate them.
+
+The patch takes advantage of the hardware implementation.
+
+AMD and Intel differ in how they save and restore other fields in the
+first 32 bytes.  A test wrote 0xff to the fxsave area, 0 to the upper
+bits of MXCSR in the fxsave area, executed fxrstor, rewrote the fxsave
+area to 0xee, and executed fxsave:
+
+ Intel (Nehalem):
+ 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff ff ff 00 00
+ ff ff ff ff ff ff 00 00 ff ff 00 00 ff ff 00 00
+ Intel (Haswell -- deprecated FPU CS and FPU DS):
+ 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff 00 00 00 00
+ ff ff ff ff 00 00 00 00 ff ff 00 00 ff ff 00 00
+ AMD (Opteron 2300-series):
+ 7f 1f 7f 7f ff 00 ee ee ee ee ee ee ee ee ee ee
+ ee ee ee ee ee ee ee ee ff ff 00 00 ff ff 02 00
+
+fxsave/fxrstor will only be emulated on early Intels, so KVM can't do
+much to improve the situation.
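+
+A user-space re-creation of that test might look roughly like the sketch
+below; it assumes an x86-64 host with CR4.OSFXSR set by the OS, and the
+MXCSR offset of 24 is taken from the FXSAVE area layout (the exact set of
+valid MXCSR bits can vary by CPU):
+
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <string.h>
+
+	static struct { uint8_t b[512]; } area __attribute__((aligned(16)));
+
+	int main(void)
+	{
+		uint32_t mxcsr = 0x0000ffff;	/* upper (reserved) bits clear */
+		int i;
+
+		memset(&area, 0xff, sizeof(area));
+		memcpy(area.b + 24, &mxcsr, sizeof(mxcsr)); /* avoid #GP on fxrstor */
+		asm volatile("fxrstor %0" : : "m" (area));
+
+		memset(&area, 0xee, sizeof(area));
+		asm volatile("fxsave %0" : "=m" (area));
+
+		for (i = 0; i < 32; i++)
+			printf("%02x%c", area.b[i], i % 16 == 15 ? '\n' : ' ');
+		return 0;
+	}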
+
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 128 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -3858,6 +3858,131 @@ static int em_movsxd(struct x86_emulate_
+ return X86EMUL_CONTINUE;
+ }
+
++static int check_fxsr(struct x86_emulate_ctxt *ctxt)
++{
++ u32 eax = 1, ebx, ecx = 0, edx;
++
++ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
++ if (!(edx & FFL(FXSR)))
++ return emulate_ud(ctxt);
++
++ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
++ return emulate_nm(ctxt);
++
++ /*
++ * Don't emulate a case that should never be hit, instead of working
++ * around a lack of fxsave64/fxrstor64 on old compilers.
++ */
++ if (ctxt->mode >= X86EMUL_MODE_PROT64)
++ return X86EMUL_UNHANDLEABLE;
++
++ return X86EMUL_CONTINUE;
++}
++
++/*
++ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
++ * 1) 16 bit mode
++ * 2) 32 bit mode
++ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs
++ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
++ * save and restore
++ * 3) 64-bit mode with REX.W prefix
++ * - like (2), but XMM 8-15 are being saved and restored
++ * 4) 64-bit mode without REX.W prefix
++ * - like (3), but FIP and FDP are 64 bit
++ *
++ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
++ * desired result. (4) is not emulated.
++ *
++ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
++ * and FPU DS) should match.
++ */
++static int em_fxsave(struct x86_emulate_ctxt *ctxt)
++{
++ struct fxregs_state fx_state;
++ size_t size;
++ int rc;
++
++ rc = check_fxsr(ctxt);
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++
++ ctxt->ops->get_fpu(ctxt);
++
++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
++
++ ctxt->ops->put_fpu(ctxt);
++
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++
++ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
++ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
++ else
++ size = offsetof(struct fxregs_state, xmm_space[0]);
++
++ return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size);
++}
++
++static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
++ struct fxregs_state *new)
++{
++ int rc = X86EMUL_CONTINUE;
++ struct fxregs_state old;
++
++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++
++ /*
++ * 64 bit host will restore XMM 8-15, which is not correct on non-64
++ * bit guests. Load the current values in order to preserve 64 bit
++ * XMMs after fxrstor.
++ */
++#ifdef CONFIG_X86_64
++ /* XXX: accessing XMM 8-15 very awkwardly */
++ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
++#endif
++
++ /*
++ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
++ * does save and restore MXCSR.
++ */
++ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
++ memcpy(new->xmm_space, old.xmm_space, 8 * 16);
++
++ return rc;
++}
++
++static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
++{
++ struct fxregs_state fx_state;
++ int rc;
++
++ rc = check_fxsr(ctxt);
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++
++ rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++
++ if (fx_state.mxcsr >> 16)
++ return emulate_gp(ctxt, 0);
++
++ ctxt->ops->get_fpu(ctxt);
++
++ if (ctxt->mode < X86EMUL_MODE_PROT64)
++ rc = fxrstor_fixup(ctxt, &fx_state);
++
++ if (rc == X86EMUL_CONTINUE)
++ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
++
++ ctxt->ops->put_fpu(ctxt);
++
++ return rc;
++}
++
+ static bool valid_cr(int nr)
+ {
+ switch (nr) {
+@@ -4210,7 +4335,9 @@ static const struct gprefix pfx_0f_ae_7
+ };
+
+ static const struct group_dual group15 = { {
+- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7),
++ I(ModRM | Aligned16, em_fxsave),
++ I(ModRM | Aligned16, em_fxrstor),
++ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+ }, {
+ N, N, N, N, N, N, N, N,
+ } };
--- /dev/null
+From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 12 Jan 2017 15:02:32 +0100
+Subject: KVM: x86: fix emulation of "MOV SS, null selector"
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 33ab91103b3415e12457e3104f0e4517ce12d0f3 upstream.
+
+This is CVE-2017-2583. On Intel this causes a failed vmentry because
+SS's type is neither 3 nor 7 (even though the manual says this check is
+only done for usable SS, and the dmesg splat says that SS is unusable!).
+On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb.
+
+The fix fabricates a data segment descriptor when SS is set to a null
+selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb.
+Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3;
+this in turn ensures CPL < 3 because RPL must be equal to CPL.
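+
+The guest-side load itself is unremarkable; a hypothetical CPL-0 snippet
+like the one below (64-bit mode) is architecturally legal, and it is this
+kind of load, when it ends up going through KVM's instruction emulator,
+that reaches the path being fixed:
+
+	static void load_null_ss(void)
+	{
+		unsigned short sel = 0;		/* null selector, RPL 0 */
+
+		asm volatile("mov %0, %%ss" : : "r" (sel));
+	}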
+
+Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing
+the bug and deciphering the manuals.
+
+Reported-by: Xiaohan Zhang <zhangxiaohan1@huawei.com>
+Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c | 48 ++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 38 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -1532,7 +1532,6 @@ static int write_segment_descriptor(stru
+ &ctxt->exception);
+ }
+
+-/* Does not support long mode */
+ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+@@ -1569,20 +1568,34 @@ static int __load_segment_descriptor(str
+
+ rpl = selector & 3;
+
+- /* NULL selector is not valid for TR, CS and SS (except for long mode) */
+- if ((seg == VCPU_SREG_CS
+- || (seg == VCPU_SREG_SS
+- && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
+- || seg == VCPU_SREG_TR)
+- && null_selector)
+- goto exception;
+-
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+- if (null_selector) /* for NULL selector skip all following checks */
++ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
++ if (null_selector) {
++ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
++ goto exception;
++
++ if (seg == VCPU_SREG_SS) {
++ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
++ goto exception;
++
++ /*
++ * ctxt->ops->set_segment expects the CPL to be in
++ * SS.DPL, so fake an expand-up 32-bit data segment.
++ */
++ seg_desc.type = 3;
++ seg_desc.p = 1;
++ seg_desc.s = 1;
++ seg_desc.dpl = cpl;
++ seg_desc.d = 1;
++ seg_desc.g = 1;
++ }
++
++ /* Skip all following checks */
+ goto load;
++ }
+
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+@@ -1698,6 +1711,21 @@ static int load_segment_descriptor(struc
+ u16 selector, int seg)
+ {
+ u8 cpl = ctxt->ops->cpl(ctxt);
++
++ /*
++ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
++ * they can load it at CPL<3 (Intel's manual says only LSS can,
++ * but it's wrong).
++ *
++ * However, the Intel manual says that putting IST=1/DPL=3 in
++ * an interrupt gate will result in SS=3 (the AMD manual instead
++ * says it doesn't), so allow SS=3 in __load_segment_descriptor
++ * and only forbid it here.
++ */
++ if (seg == VCPU_SREG_SS && selector == 3 &&
++ ctxt->mode == X86EMUL_MODE_PROT64)
++ return emulate_exception(ctxt, GP_VECTOR, 0, true);
++
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+ }
--- /dev/null
+From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001
+From: David Matlack <dmatlack@google.com>
+Date: Fri, 16 Dec 2016 14:30:36 -0800
+Subject: KVM: x86: flush pending lapic jump label updates on module unload
+
+From: David Matlack <dmatlack@google.com>
+
+commit cef84c302fe051744b983a92764d3fcca933415d upstream.
+
+KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled).
+These are implemented with delayed_work structs which can still be
+pending when the KVM module is unloaded. We've seen this cause kernel
+panics when the kvm_intel module is quickly reloaded.
+
+Use the new static_key_deferred_flush() API to flush pending updates on
+module unload.
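+
+The pattern is generic; a minimal module sketch with hypothetical names,
+using the same jump label API, looks like this:
+
+	#include <linux/jump_label_ratelimit.h>
+	#include <linux/module.h>
+
+	static struct static_key_deferred demo_key;
+
+	static int __init demo_init(void)
+	{
+		jump_label_rate_limit(&demo_key, HZ);	 /* defer updates by ~1s */
+		static_key_slow_inc(&demo_key.key);
+		static_key_slow_dec_deferred(&demo_key); /* queues delayed_work */
+		return 0;
+	}
+
+	static void __exit demo_exit(void)
+	{
+		/* without this, the delayed_work can still fire after the
+		 * module text is gone -- the crash seen with the lapic keys */
+		static_key_deferred_flush(&demo_key);
+	}
+
+	module_init(demo_init);
+	module_exit(demo_exit);
+	MODULE_LICENSE("GPL");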
+
+Signed-off-by: David Matlack <dmatlack@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/lapic.c | 6 ++++++
+ arch/x86/kvm/lapic.h | 1 +
+ arch/x86/kvm/x86.c | 1 +
+ 3 files changed, 8 insertions(+)
+
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -2187,3 +2187,9 @@ void kvm_lapic_init(void)
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+ }
++
++void kvm_lapic_exit(void)
++{
++ static_key_deferred_flush(&apic_hw_disabled);
++ static_key_deferred_flush(&apic_sw_disabled);
++}
+--- a/arch/x86/kvm/lapic.h
++++ b/arch/x86/kvm/lapic.h
+@@ -95,6 +95,7 @@ static inline bool kvm_hv_vapic_assist_p
+
+ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+ void kvm_lapic_init(void);
++void kvm_lapic_exit(void);
+
+ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
+ {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -5842,6 +5842,7 @@ out:
+
+ void kvm_arch_exit(void)
+ {
++ kvm_lapic_exit();
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
--- /dev/null
+From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001
+From: Steve Rutherford <srutherford@google.com>
+Date: Wed, 11 Jan 2017 18:28:29 -0800
+Subject: KVM: x86: Introduce segmented_write_std
+
+From: Steve Rutherford <srutherford@google.com>
+
+commit 129a72a0d3c8e139a04512325384fe5ac119e74d upstream.
+
+Introduces segmented_write_std.
+
+Switches from emulated reads/writes to standard read/writes in fxsave,
+fxrstor, sgdt, and sidt. This fixes CVE-2017-2584, a longstanding
+kernel memory leak.
+
+Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR",
+2016-11-09), which is luckily not yet in any final release, this would
+also be an exploitable kernel memory *write*!
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Fixes: 96051572c819194c37a8367624b285be10297eca
+Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Steve Rutherford <srutherford@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c | 22 ++++++++++++++++++----
+ 1 file changed, 18 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -779,6 +779,20 @@ static int segmented_read_std(struct x86
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
+ }
+
++static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
++ struct segmented_address addr,
++ void *data,
++ unsigned int size)
++{
++ int rc;
++ ulong linear;
++
++ rc = linearize(ctxt, addr, size, true, &linear);
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
++ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception);
++}
++
+ /*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+@@ -3674,8 +3688,8 @@ static int emulate_store_desc_ptr(struct
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+- return segmented_write(ctxt, ctxt->dst.addr.mem,
+- &desc_ptr, 2 + ctxt->op_bytes);
++ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
++ &desc_ptr, 2 + ctxt->op_bytes);
+ }
+
+ static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+@@ -3921,7 +3935,7 @@ static int em_fxsave(struct x86_emulate_
+ else
+ size = offsetof(struct fxregs_state, xmm_space[0]);
+
+- return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size);
++ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ }
+
+ static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
+@@ -3963,7 +3977,7 @@ static int em_fxrstor(struct x86_emulate
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+- rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
++ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
--- /dev/null
+From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Tue, 10 Jan 2017 16:58:27 -0800
+Subject: mm/hugetlb.c: fix reservation race when freeing surplus pages
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit e5bbc8a6c992901058bc09e2ce01d16c111ff047 upstream.
+
+return_unused_surplus_pages() decrements the global reservation count,
+and frees any unused surplus pages that were backing the reservation.
+
+Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in
+return_unused_surplus_pages()") added a call to cond_resched_lock in the
+loop freeing the pages.
+
+As a result, the hugetlb_lock could be dropped, and someone else could
+use the pages that will be freed in subsequent iterations of the loop.
+This could result in inconsistent global hugetlb page state, application
+API failures (such as mmap failures), or application crashes.
+
+When dropping the lock in return_unused_surplus_pages, make sure that
+the global reservation count (resv_huge_pages) remains sufficiently
+large to prevent someone else from claiming pages about to be freed.
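+
+In simplified form (locking and error handling elided; this is not the
+patch itself), the ordering changes like this:
+
+	static void return_surplus_old(struct hstate *h, unsigned long n)
+	{
+		h->resv_huge_pages -= n;	/* whole reservation dropped up front */
+		while (n--) {
+			free_pool_huge_page(h, &node_states[N_MEMORY], 1);
+			/*
+			 * The lock may be dropped here; the pages still queued
+			 * for freeing are no longer reserved and can be claimed
+			 * by someone else in the meantime.
+			 */
+			cond_resched_lock(&hugetlb_lock);
+		}
+	}
+
+	static void return_surplus_new(struct hstate *h, unsigned long n)
+	{
+		while (n--) {
+			h->resv_huge_pages--;	/* uncommit only as each page is freed */
+			free_pool_huge_page(h, &node_states[N_MEMORY], 1);
+			cond_resched_lock(&hugetlb_lock);
+		}
+	}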
+
+Analyzed by Paul Cassella.
+
+Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()")
+Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reported-by: Paul Cassella <cassella@cray.com>
+Suggested-by: Michal Hocko <mhocko@kernel.org>
+Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c | 37 ++++++++++++++++++++++++++++---------
+ 1 file changed, 28 insertions(+), 9 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1723,23 +1723,32 @@ free:
+ }
+
+ /*
+- * When releasing a hugetlb pool reservation, any surplus pages that were
+- * allocated to satisfy the reservation must be explicitly freed if they were
+- * never used.
+- * Called with hugetlb_lock held.
++ * This routine has two main purposes:
++ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
++ * in unused_resv_pages. This corresponds to the prior adjustments made
++ * to the associated reservation map.
++ * 2) Free any unused surplus pages that may have been allocated to satisfy
++ * the reservation. As many as unused_resv_pages may be freed.
++ *
++ * Called with hugetlb_lock held. However, the lock could be dropped (and
++ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
++ * we must make sure nobody else can claim pages we are in the process of
++ * freeing. Do this by ensuring resv_huge_page always is greater than the
++ * number of huge pages we plan to free when dropping the lock.
+ */
+ static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
+ {
+ unsigned long nr_pages;
+
+- /* Uncommit the reservation */
+- h->resv_huge_pages -= unused_resv_pages;
+-
+ /* Cannot return gigantic pages currently */
+ if (hstate_is_gigantic(h))
+- return;
++ goto out;
+
++ /*
++ * Part (or even all) of the reservation could have been backed
++ * by pre-allocated pages. Only free surplus pages.
++ */
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
+
+ /*
+@@ -1749,12 +1758,22 @@ static void return_unused_surplus_pages(
+ * when the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the freed pages across the
+ * on-line nodes with memory and will handle the hstate accounting.
++ *
++ * Note that we decrement resv_huge_pages as we free the pages. If
++ * we drop the lock, resv_huge_pages will still be sufficiently large
++ * to cover subsequent pages we may free.
+ */
+ while (nr_pages--) {
++ h->resv_huge_pages--;
++ unused_resv_pages--;
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+- break;
++ goto out;
+ cond_resched_lock(&hugetlb_lock);
+ }
++
++out:
++ /* Fully uncommit the reservation */
++ h->resv_huge_pages -= unused_resv_pages;
+ }
+
+
selftests-do-not-require-bash-for-the-generated-test.patch
mm-fix-devm_memremap_pages-crash-use-mem_hotplug_-begin-done.patch
ocfs2-fix-crash-caused-by-stale-lvb-with-fsdlm-plugin.patch
+mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch
+kvm-x86-fix-emulation-of-mov-ss-null-selector.patch
+kvm-eventfd-fix-null-deref-irqbypass-consumer.patch
+kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch
+kvm-x86-emulate-fxsave-and-fxrstor.patch
+kvm-x86-introduce-segmented_write_std.patch