From: Greg Kroah-Hartman Date: Tue, 17 Jan 2017 10:01:34 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v4.9.5~19 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=1b15aa9d7783aa123fb488cb592a842ca0d6433d;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: kvm-eventfd-fix-null-deref-irqbypass-consumer.patch kvm-x86-emulate-fxsave-and-fxrstor.patch kvm-x86-fix-emulation-of-mov-ss-null-selector.patch kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch kvm-x86-introduce-segmented_write_std.patch mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch --- diff --git a/queue-4.4/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch b/queue-4.4/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch new file mode 100644 index 00000000000..9b7ebd9b4a2 --- /dev/null +++ b/queue-4.4/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch @@ -0,0 +1,83 @@ +From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Thu, 5 Jan 2017 17:39:42 -0800 +Subject: KVM: eventfd: fix NULL deref irqbypass consumer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Wanpeng Li + +commit 4f3dbdf47e150016aacd734e663347fcaa768303 upstream. + +Reported syzkaller: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 + IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + PGD 0 + + Oops: 0002 [#1] SMP + CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1 + Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] + task: ffff9bbe0dfbb900 task.stack: ffffb61802014000 + RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + Call Trace: + irqfd_shutdown+0x66/0xa0 [kvm] + process_one_work+0x16b/0x480 + worker_thread+0x4b/0x500 + kthread+0x101/0x140 + ? process_one_work+0x480/0x480 + ? 
kthread_create_on_node+0x60/0x60 + ret_from_fork+0x25/0x30 + RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20 + CR2: 0000000000000008 + +The syzkaller folks reported a NULL pointer dereference that is due to +unregistering a consumer which failed registration earlier. Syzkaller +occasionally creates two VMs with the same eventfd, so the second VM +fails to register an irqbypass consumer. It will mark the irqfd as inactive +and queue a workqueue item to shut down the irqfd and unregister the irqbypass +consumer when the eventfd is closed. However, the second consumer has been +initialized even though it failed registration. So the token (same as the first +VM's) is used to unregister the consumer through the workqueue, the +consumer of the first VM is found and unregistered, and then a NULL deref occurs +in the path of deleting the consumer from the consumers list. + +This patch fixes it by making irq_bypass_register/unregister_consumer() +look for the consumer entry based on the consumer pointer itself instead of +token matching. 
+ +Reported-by: Dmitry Vyukov +Suggested-by: Alex Williamson +Cc: Paolo Bonzini +Cc: Radim Krčmář +Cc: Dmitry Vyukov +Cc: Alex Williamson +Signed-off-by: Wanpeng Li +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + virt/lib/irqbypass.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/virt/lib/irqbypass.c ++++ b/virt/lib/irqbypass.c +@@ -188,7 +188,7 @@ int irq_bypass_register_consumer(struct + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token == consumer->token) { ++ if (tmp->token == consumer->token || tmp == consumer) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; +@@ -235,7 +235,7 @@ void irq_bypass_unregister_consumer(stru + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token != consumer->token) ++ if (tmp != consumer) + continue; + + list_for_each_entry(producer, &producers, node) { diff --git a/queue-4.4/kvm-x86-emulate-fxsave-and-fxrstor.patch b/queue-4.4/kvm-x86-emulate-fxsave-and-fxrstor.patch new file mode 100644 index 00000000000..a33b295f88d --- /dev/null +++ b/queue-4.4/kvm-x86-emulate-fxsave-and-fxrstor.patch @@ -0,0 +1,187 @@ +From 283c95d0e3891b64087706b344a4b545d04a6e62 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Wed, 9 Nov 2016 19:07:06 +0100 +Subject: KVM: x86: emulate FXSAVE and FXRSTOR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Radim Krčmář + +commit 283c95d0e3891b64087706b344a4b545d04a6e62 upstream. + +Internal errors were reported on 16 bit fxsave and fxrstor with ipxe. +Old Intels don't have unrestricted_guest, so we have to emulate them. + +The patch takes advantage of the hardware implementation. + +AMD and Intel differ in saving and restoring other fields in first 32 +bytes. 
A test wrote 0xff to the fxsave area, 0 to upper bits of MXCSR +in the fxsave area, executed fxrstor, rewrote the fxsave area to 0xee, +and executed fxsave: + + Intel (Nehalem): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff ff ff 00 00 + ff ff ff ff ff ff 00 00 ff ff 00 00 ff ff 00 00 + Intel (Haswell -- deprecated FPU CS and FPU DS): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff 00 00 00 00 + ff ff ff ff 00 00 00 00 ff ff 00 00 ff ff 00 00 + AMD (Opteron 2300-series): + 7f 1f 7f 7f ff 00 ee ee ee ee ee ee ee ee ee ee + ee ee ee ee ee ee ee ee ff ff 00 00 ff ff 02 00 + +fxsave/fxrstor will only be emulated on early Intels, so KVM can't do +much to improve the situation. + +Signed-off-by: Radim Krčmář +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 128 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -3858,6 +3858,131 @@ static int em_movsxd(struct x86_emulate_ + return X86EMUL_CONTINUE; + } + ++static int check_fxsr(struct x86_emulate_ctxt *ctxt) ++{ ++ u32 eax = 1, ebx, ecx = 0, edx; ++ ++ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); ++ if (!(edx & FFL(FXSR))) ++ return emulate_ud(ctxt); ++ ++ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) ++ return emulate_nm(ctxt); ++ ++ /* ++ * Don't emulate a case that should never be hit, instead of working ++ * around a lack of fxsave64/fxrstor64 on old compilers. ++ */ ++ if (ctxt->mode >= X86EMUL_MODE_PROT64) ++ return X86EMUL_UNHANDLEABLE; ++ ++ return X86EMUL_CONTINUE; ++} ++ ++/* ++ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, ++ * 1) 16 bit mode ++ * 2) 32 bit mode ++ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs ++ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. 
++ * save and restore ++ * 3) 64-bit mode with REX.W prefix ++ * - like (2), but XMM 8-15 are being saved and restored ++ * 4) 64-bit mode without REX.W prefix ++ * - like (3), but FIP and FDP are 64 bit ++ * ++ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the ++ * desired result. (4) is not emulated. ++ * ++ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS ++ * and FPU DS) should match. ++ */ ++static int em_fxsave(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ size_t size; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) ++ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); ++ else ++ size = offsetof(struct fxregs_state, xmm_space[0]); ++ ++ return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++} ++ ++static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, ++ struct fxregs_state *new) ++{ ++ int rc = X86EMUL_CONTINUE; ++ struct fxregs_state old; ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ /* ++ * 64 bit host will restore XMM 8-15, which is not correct on non-64 ++ * bit guests. Load the current values in order to preserve 64 bit ++ * XMMs after fxrstor. ++ */ ++#ifdef CONFIG_X86_64 ++ /* XXX: accessing XMM 8-15 very awkwardly */ ++ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); ++#endif ++ ++ /* ++ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but ++ * does save and restore MXCSR. 
++ */ ++ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) ++ memcpy(new->xmm_space, old.xmm_space, 8 * 16); ++ ++ return rc; ++} ++ ++static int em_fxrstor(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (fx_state.mxcsr >> 16) ++ return emulate_gp(ctxt, 0); ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ if (ctxt->mode < X86EMUL_MODE_PROT64) ++ rc = fxrstor_fixup(ctxt, &fx_state); ++ ++ if (rc == X86EMUL_CONTINUE) ++ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ return rc; ++} ++ + static bool valid_cr(int nr) + { + switch (nr) { +@@ -4210,7 +4335,9 @@ static const struct gprefix pfx_0f_ae_7 + }; + + static const struct group_dual group15 = { { +- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), ++ I(ModRM | Aligned16, em_fxsave), ++ I(ModRM | Aligned16, em_fxrstor), ++ N, N, N, N, N, GP(0, &pfx_0f_ae_7), + }, { + N, N, N, N, N, N, N, N, + } }; diff --git a/queue-4.4/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch b/queue-4.4/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch new file mode 100644 index 00000000000..0c3e28b8b78 --- /dev/null +++ b/queue-4.4/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch @@ -0,0 +1,107 @@ +From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 12 Jan 2017 15:02:32 +0100 +Subject: KVM: x86: fix emulation of "MOV SS, null selector" + +From: Paolo Bonzini + +commit 33ab91103b3415e12457e3104f0e4517ce12d0f3 upstream. + +This is CVE-2017-2583. On Intel this causes a failed vmentry because +SS's type is neither 3 nor 7 (even though the manual says this check is +only done for usable SS, and the dmesg splat says that SS is unusable!). +On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb. 
+ +The fix fabricates a data segment descriptor when SS is set to a null +selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb. +Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3; +this in turn ensures CPL < 3 because RPL must be equal to CPL. + +Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing +the bug and deciphering the manuals. + +Reported-by: Xiaohan Zhang +Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011 +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 48 ++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 38 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -1532,7 +1532,6 @@ static int write_segment_descriptor(stru + &ctxt->exception); + } + +-/* Does not support long mode */ + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, + enum x86_transfer_type transfer, +@@ -1569,20 +1568,34 @@ static int __load_segment_descriptor(str + + rpl = selector & 3; + +- /* NULL selector is not valid for TR, CS and SS (except for long mode) */ +- if ((seg == VCPU_SREG_CS +- || (seg == VCPU_SREG_SS +- && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) +- || seg == VCPU_SREG_TR) +- && null_selector) +- goto exception; +- + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + +- if (null_selector) /* for NULL selector skip all following checks */ ++ /* NULL selector is not valid for TR, CS and (except for long mode) SS */ ++ if (null_selector) { ++ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) ++ goto exception; ++ ++ if (seg == VCPU_SREG_SS) { ++ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) ++ goto exception; ++ ++ /* ++ * ctxt->ops->set_segment expects the CPL to be in ++ * SS.DPL, so fake an expand-up 32-bit data segment. 
++ */ ++ seg_desc.type = 3; ++ seg_desc.p = 1; ++ seg_desc.s = 1; ++ seg_desc.dpl = cpl; ++ seg_desc.d = 1; ++ seg_desc.g = 1; ++ } ++ ++ /* Skip all following checks */ + goto load; ++ } + + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); + if (ret != X86EMUL_CONTINUE) +@@ -1698,6 +1711,21 @@ static int load_segment_descriptor(struc + u16 selector, int seg) + { + u8 cpl = ctxt->ops->cpl(ctxt); ++ ++ /* ++ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but ++ * they can load it at CPL<3 (Intel's manual says only LSS can, ++ * but it's wrong). ++ * ++ * However, the Intel manual says that putting IST=1/DPL=3 in ++ * an interrupt gate will result in SS=3 (the AMD manual instead ++ * says it doesn't), so allow SS=3 in __load_segment_descriptor ++ * and only forbid it here. ++ */ ++ if (seg == VCPU_SREG_SS && selector == 3 && ++ ctxt->mode == X86EMUL_MODE_PROT64) ++ return emulate_exception(ctxt, GP_VECTOR, 0, true); ++ + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); + } diff --git a/queue-4.4/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch b/queue-4.4/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch new file mode 100644 index 00000000000..3ecbbed56f1 --- /dev/null +++ b/queue-4.4/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch @@ -0,0 +1,59 @@ +From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001 +From: David Matlack +Date: Fri, 16 Dec 2016 14:30:36 -0800 +Subject: KVM: x86: flush pending lapic jump label updates on module unload + +From: David Matlack + +commit cef84c302fe051744b983a92764d3fcca933415d upstream. + +KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled). +These are implemented with delayed_work structs which can still be +pending when the KVM module is unloaded. We've seen this cause kernel +panics when the kvm_intel module is quickly reloaded. 
+ +Use the new static_key_deferred_flush() API to flush pending updates on +module unload. + +Signed-off-by: David Matlack +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/lapic.c | 6 ++++++ + arch/x86/kvm/lapic.h | 1 + + arch/x86/kvm/x86.c | 1 + + 3 files changed, 8 insertions(+) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2187,3 +2187,9 @@ void kvm_lapic_init(void) + jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); + } ++ ++void kvm_lapic_exit(void) ++{ ++ static_key_deferred_flush(&apic_hw_disabled); ++ static_key_deferred_flush(&apic_sw_disabled); ++} +--- a/arch/x86/kvm/lapic.h ++++ b/arch/x86/kvm/lapic.h +@@ -95,6 +95,7 @@ static inline bool kvm_hv_vapic_assist_p + + int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_init(void); ++void kvm_lapic_exit(void); + + static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) + { +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5842,6 +5842,7 @@ out: + + void kvm_arch_exit(void) + { ++ kvm_lapic_exit(); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) diff --git a/queue-4.4/kvm-x86-introduce-segmented_write_std.patch b/queue-4.4/kvm-x86-introduce-segmented_write_std.patch new file mode 100644 index 00000000000..8f747534908 --- /dev/null +++ b/queue-4.4/kvm-x86-introduce-segmented_write_std.patch @@ -0,0 +1,83 @@ +From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001 +From: Steve Rutherford +Date: Wed, 11 Jan 2017 18:28:29 -0800 +Subject: KVM: x86: Introduce segmented_write_std + +From: Steve Rutherford + +commit 129a72a0d3c8e139a04512325384fe5ac119e74d upstream. + +Introduces segmented_write_std. + +Switches from emulated reads/writes to standard reads/writes in fxsave, +fxrstor, sgdt, and sidt. This fixes CVE-2017-2584, a longstanding +kernel memory leak. 
+ +Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR", +2016-11-09), which is luckily not yet in any final release, this would +also be an exploitable kernel memory *write*! + +Reported-by: Dmitry Vyukov +Fixes: 96051572c819194c37a8367624b285be10297eca +Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62 +Suggested-by: Paolo Bonzini +Signed-off-by: Steve Rutherford +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -779,6 +779,20 @@ static int segmented_read_std(struct x86 + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); + } + ++static int segmented_write_std(struct x86_emulate_ctxt *ctxt, ++ struct segmented_address addr, ++ void *data, ++ unsigned int size) ++{ ++ int rc; ++ ulong linear; ++ ++ rc = linearize(ctxt, addr, size, true, &linear); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); ++} ++ + /* + * Prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. +@@ -3674,8 +3688,8 @@ static int emulate_store_desc_ptr(struct + } + /* Disable writeback. 
*/ + ctxt->dst.type = OP_NONE; +- return segmented_write(ctxt, ctxt->dst.addr.mem, +- &desc_ptr, 2 + ctxt->op_bytes); ++ return segmented_write_std(ctxt, ctxt->dst.addr.mem, ++ &desc_ptr, 2 + ctxt->op_bytes); + } + + static int em_sgdt(struct x86_emulate_ctxt *ctxt) +@@ -3921,7 +3935,7 @@ static int em_fxsave(struct x86_emulate_ + else + size = offsetof(struct fxregs_state, xmm_space[0]); + +- return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + } + + static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, +@@ -3963,7 +3977,7 @@ static int em_fxrstor(struct x86_emulate + if (rc != X86EMUL_CONTINUE) + return rc; + +- rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + diff --git a/queue-4.4/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch b/queue-4.4/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch new file mode 100644 index 00000000000..6c0e8704664 --- /dev/null +++ b/queue-4.4/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch @@ -0,0 +1,111 @@ +From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Tue, 10 Jan 2017 16:58:27 -0800 +Subject: mm/hugetlb.c: fix reservation race when freeing surplus pages + +From: Mike Kravetz + +commit e5bbc8a6c992901058bc09e2ce01d16c111ff047 upstream. + +return_unused_surplus_pages() decrements the global reservation count, +and frees any unused surplus pages that were backing the reservation. + +Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in +return_unused_surplus_pages()") added a call to cond_resched_lock in the +loop freeing the pages. + +As a result, the hugetlb_lock could be dropped, and someone else could +use the pages that will be freed in subsequent iterations of the loop. 
+This could result in inconsistent global hugetlb page state, application +API failures (such as mmap) or application crashes. + +When dropping the lock in return_unused_surplus_pages, make sure that +the global reservation count (resv_huge_pages) remains sufficiently +large to prevent someone else from claiming pages about to be freed. + +Analyzed by Paul Cassella. + +Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") +Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com +Signed-off-by: Mike Kravetz +Reported-by: Paul Cassella +Suggested-by: Michal Hocko +Cc: Masayoshi Mizuma +Cc: Naoya Horiguchi +Cc: Aneesh Kumar +Cc: Hillf Danton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1723,23 +1723,32 @@ free: + } + + /* +- * When releasing a hugetlb pool reservation, any surplus pages that were +- * allocated to satisfy the reservation must be explicitly freed if they were +- * never used. +- * Called with hugetlb_lock held. ++ * This routine has two main purposes: ++ * 1) Decrement the reservation count (resv_huge_pages) by the value passed ++ * in unused_resv_pages. This corresponds to the prior adjustments made ++ * to the associated reservation map. ++ * 2) Free any unused surplus pages that may have been allocated to satisfy ++ * the reservation. As many as unused_resv_pages may be freed. ++ * ++ * Called with hugetlb_lock held. However, the lock could be dropped (and ++ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, ++ * we must make sure nobody else can claim pages we are in the process of ++ * freeing. Do this by ensuring resv_huge_page always is greater than the ++ * number of huge pages we plan to free when dropping the lock. 
+ */ + static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) + { + unsigned long nr_pages; + +- /* Uncommit the reservation */ +- h->resv_huge_pages -= unused_resv_pages; +- + /* Cannot return gigantic pages currently */ + if (hstate_is_gigantic(h)) +- return; ++ goto out; + ++ /* ++ * Part (or even all) of the reservation could have been backed ++ * by pre-allocated pages. Only free surplus pages. ++ */ + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); + + /* +@@ -1749,12 +1758,22 @@ static void return_unused_surplus_pages( + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. ++ * ++ * Note that we decrement resv_huge_pages as we free the pages. If ++ * we drop the lock, resv_huge_pages will still be sufficiently large ++ * to cover subsequent pages we may free. + */ + while (nr_pages--) { ++ h->resv_huge_pages--; ++ unused_resv_pages--; + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) +- break; ++ goto out; + cond_resched_lock(&hugetlb_lock); + } ++ ++out: ++ /* Fully uncommit the reservation */ ++ h->resv_huge_pages -= unused_resv_pages; + } + + diff --git a/queue-4.4/series b/queue-4.4/series index 27a5dc22619..f7ee34c2d90 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -4,3 +4,9 @@ selftests-do-not-require-bash-to-run-netsocktests-testcase.patch selftests-do-not-require-bash-for-the-generated-test.patch mm-fix-devm_memremap_pages-crash-use-mem_hotplug_-begin-done.patch ocfs2-fix-crash-caused-by-stale-lvb-with-fsdlm-plugin.patch +mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch +kvm-x86-fix-emulation-of-mov-ss-null-selector.patch +kvm-eventfd-fix-null-deref-irqbypass-consumer.patch +kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch +kvm-x86-emulate-fxsave-and-fxrstor.patch +kvm-x86-introduce-segmented_write_std.patch