From: Greg Kroah-Hartman Date: Tue, 17 Jan 2017 10:02:09 +0000 (+0100) Subject: 4.9-stable patches X-Git-Tag: v4.9.5~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5cfd2ceaacdca9d6474022288a7df6d59545b065;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: kvm-eventfd-fix-null-deref-irqbypass-consumer.patch kvm-x86-emulate-fxsave-and-fxrstor.patch kvm-x86-fix-emulation-of-mov-ss-null-selector.patch kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch kvm-x86-introduce-segmented_write_std.patch mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch mm-support-anonymous-stable-page.patch --- diff --git a/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch b/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch new file mode 100644 index 00000000000..a578b9a501d --- /dev/null +++ b/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch @@ -0,0 +1,83 @@ +From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Thu, 5 Jan 2017 17:39:42 -0800 +Subject: KVM: eventfd: fix NULL deref irqbypass consumer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Wanpeng Li + +commit 4f3dbdf47e150016aacd734e663347fcaa768303 upstream. + +Reported syzkaller: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 + IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + PGD 0 + + Oops: 0002 [#1] SMP + CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1 + Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] + task: ffff9bbe0dfbb900 task.stack: ffffb61802014000 + RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + Call Trace: + irqfd_shutdown+0x66/0xa0 [kvm] + process_one_work+0x16b/0x480 + worker_thread+0x4b/0x500 + kthread+0x101/0x140 + ? process_one_work+0x480/0x480 + ? kthread_create_on_node+0x60/0x60 + ret_from_fork+0x25/0x30 + RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20 + CR2: 0000000000000008 + +The syzkaller folks reported a NULL pointer dereference that due to +unregister an consumer which fails registration before. The syzkaller +creates two VMs w/ an equal eventfd occasionally. So the second VM +fails to register an irqbypass consumer. It will make irqfd as inactive +and queue an workqueue work to shutdown irqfd and unregister the irqbypass +consumer when eventfd is closed. However, the second consumer has been +initialized though it fails registration. So the token(same as the first +VM's) is taken to unregister the consumer through the workqueue, the +consumer of the first VM is found and unregistered, then NULL deref incurred +in the path of deleting consumer from the consumers list. + +This patch fixes it by making irq_bypass_register/unregister_consumer() +looks for the consumer entry based on consumer pointer itself instead of +token matching. 
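For illustration only, a minimal userspace sketch of the scenario the patch addresses (hypothetical names, not kernel code): two VMs resolve the same eventfd, so their irqbypass consumers share a token; the second registration fails, yet its later teardown unregisters by token and therefore operates on the first VM's consumer instead, which is what ends in the NULL dereference above. Keying both lookups on the consumer pointer, as the diff below does, turns the teardown of the never-registered consumer into a safe no-op.

#include <stdio.h>

struct consumer {
	struct consumer *next;
	void *token;			/* the shared eventfd context in the real code */
};

static struct consumer *consumers;	/* stands in for the global consumers list */

static int register_consumer(struct consumer *c)
{
	struct consumer *tmp;

	for (tmp = consumers; tmp; tmp = tmp->next)
		if (tmp->token == c->token || tmp == c)	/* duplicate token or same consumer */
			return -1;			/* -EBUSY: VM2 fails here */
	c->next = consumers;
	consumers = c;
	return 0;
}

static void unregister_consumer(struct consumer *c)
{
	struct consumer **pp;

	for (pp = &consumers; *pp; pp = &(*pp)->next)
		/* Matching on (*pp)->token == c->token here would unlink VM1's
		 * consumer while tearing down VM2's never-registered one;
		 * matching the pointer itself makes that call a no-op. */
		if (*pp == c) {
			*pp = (*pp)->next;
			return;
		}
}

int main(void)
{
	int eventfd_ctx = 42;				/* both VMs opened the same eventfd */
	struct consumer vm1 = { .token = &eventfd_ctx };
	struct consumer vm2 = { .token = &eventfd_ctx };

	printf("VM1 register: %d\n", register_consumer(&vm1));	/* 0 */
	printf("VM2 register: %d\n", register_consumer(&vm2));	/* -1 (-EBUSY) */
	unregister_consumer(&vm2);				/* irqfd shutdown of VM2 */
	printf("VM1 still registered: %s\n", consumers == &vm1 ? "yes" : "no");
	return 0;
}

Compiled as plain C99 this prints "yes" on the last line only because the lookup keys on the pointer; keyed on the token it would unlink VM1's consumer, which is the pre-fix behaviour.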
+ +Reported-by: Dmitry Vyukov +Suggested-by: Alex Williamson +Cc: Paolo Bonzini +Cc: Radim Krčmář +Cc: Dmitry Vyukov +Cc: Alex Williamson +Signed-off-by: Wanpeng Li +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + virt/lib/irqbypass.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/virt/lib/irqbypass.c ++++ b/virt/lib/irqbypass.c +@@ -195,7 +195,7 @@ int irq_bypass_register_consumer(struct + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token == consumer->token) { ++ if (tmp->token == consumer->token || tmp == consumer) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; +@@ -245,7 +245,7 @@ void irq_bypass_unregister_consumer(stru + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token != consumer->token) ++ if (tmp != consumer) + continue; + + list_for_each_entry(producer, &producers, node) { diff --git a/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch b/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch new file mode 100644 index 00000000000..c3ea7c53e64 --- /dev/null +++ b/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch @@ -0,0 +1,187 @@ +From 283c95d0e3891b64087706b344a4b545d04a6e62 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Wed, 9 Nov 2016 19:07:06 +0100 +Subject: KVM: x86: emulate FXSAVE and FXRSTOR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Radim Krčmář + +commit 283c95d0e3891b64087706b344a4b545d04a6e62 upstream. + +Internal errors were reported on 16 bit fxsave and fxrstor with ipxe. +Old Intels don't have unrestricted_guest, so we have to emulate them. + +The patch takes advantage of the hardware implementation. + +AMD and Intel differ in saving and restoring other fields in first 32 +bytes. A test wrote 0xff to the fxsave area, 0 to upper bits of MCSXR +in the fxsave area, executed fxrstor, rewrote the fxsave area to 0xee, +and executed fxsave: + + Intel (Nehalem): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff ff ff 00 00 + ff ff ff ff ff ff 00 00 ff ff 00 00 ff ff 00 00 + Intel (Haswell -- deprecated FPU CS and FPU DS): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff 00 00 00 00 + ff ff ff ff 00 00 00 00 ff ff 00 00 ff ff 00 00 + AMD (Opteron 2300-series): + 7f 1f 7f 7f ff 00 ee ee ee ee ee ee ee ee ee ee + ee ee ee ee ee ee ee ee ff ff 00 00 ff ff 02 00 + +fxsave/fxrstor will only be emulated on early Intels, so KVM can't do +much to improve the situation. + +Signed-off-by: Radim Krčmář +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 128 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -3870,6 +3870,131 @@ static int em_movsxd(struct x86_emulate_ + return X86EMUL_CONTINUE; + } + ++static int check_fxsr(struct x86_emulate_ctxt *ctxt) ++{ ++ u32 eax = 1, ebx, ecx = 0, edx; ++ ++ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); ++ if (!(edx & FFL(FXSR))) ++ return emulate_ud(ctxt); ++ ++ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) ++ return emulate_nm(ctxt); ++ ++ /* ++ * Don't emulate a case that should never be hit, instead of working ++ * around a lack of fxsave64/fxrstor64 on old compilers. 
++ */ ++ if (ctxt->mode >= X86EMUL_MODE_PROT64) ++ return X86EMUL_UNHANDLEABLE; ++ ++ return X86EMUL_CONTINUE; ++} ++ ++/* ++ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, ++ * 1) 16 bit mode ++ * 2) 32 bit mode ++ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs ++ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. ++ * save and restore ++ * 3) 64-bit mode with REX.W prefix ++ * - like (2), but XMM 8-15 are being saved and restored ++ * 4) 64-bit mode without REX.W prefix ++ * - like (3), but FIP and FDP are 64 bit ++ * ++ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the ++ * desired result. (4) is not emulated. ++ * ++ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS ++ * and FPU DS) should match. ++ */ ++static int em_fxsave(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ size_t size; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) ++ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); ++ else ++ size = offsetof(struct fxregs_state, xmm_space[0]); ++ ++ return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++} ++ ++static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, ++ struct fxregs_state *new) ++{ ++ int rc = X86EMUL_CONTINUE; ++ struct fxregs_state old; ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ /* ++ * 64 bit host will restore XMM 8-15, which is not correct on non-64 ++ * bit guests. Load the current values in order to preserve 64 bit ++ * XMMs after fxrstor. ++ */ ++#ifdef CONFIG_X86_64 ++ /* XXX: accessing XMM 8-15 very awkwardly */ ++ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); ++#endif ++ ++ /* ++ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but ++ * does save and restore MXCSR. 
++ */ ++ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) ++ memcpy(new->xmm_space, old.xmm_space, 8 * 16); ++ ++ return rc; ++} ++ ++static int em_fxrstor(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (fx_state.mxcsr >> 16) ++ return emulate_gp(ctxt, 0); ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ if (ctxt->mode < X86EMUL_MODE_PROT64) ++ rc = fxrstor_fixup(ctxt, &fx_state); ++ ++ if (rc == X86EMUL_CONTINUE) ++ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ return rc; ++} ++ + static bool valid_cr(int nr) + { + switch (nr) { +@@ -4222,7 +4347,9 @@ static const struct gprefix pfx_0f_ae_7 + }; + + static const struct group_dual group15 = { { +- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), ++ I(ModRM | Aligned16, em_fxsave), ++ I(ModRM | Aligned16, em_fxrstor), ++ N, N, N, N, N, GP(0, &pfx_0f_ae_7), + }, { + N, N, N, N, N, N, N, N, + } }; diff --git a/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch b/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch new file mode 100644 index 00000000000..06efe007715 --- /dev/null +++ b/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch @@ -0,0 +1,107 @@ +From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 12 Jan 2017 15:02:32 +0100 +Subject: KVM: x86: fix emulation of "MOV SS, null selector" + +From: Paolo Bonzini + +commit 33ab91103b3415e12457e3104f0e4517ce12d0f3 upstream. + +This is CVE-2017-2583. On Intel this causes a failed vmentry because +SS's type is neither 3 nor 7 (even though the manual says this check is +only done for usable SS, and the dmesg splat says that SS is unusable!). +On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb. + +The fix fabricates a data segment descriptor when SS is set to a null +selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb. +Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3; +this in turn ensures CPL < 3 because RPL must be equal to CPL. + +Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing +the bug and deciphering the manuals. 
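As background for the checks added below, a hedged sketch of the selector anatomy involved (helper names invented here, not taken from the patch): a null selector is GDT index 0 with any RPL, and after the fix the MOV SS/POP SS/LSS path only accepts one in 64-bit mode, with RPL equal to CPL, and never selector 3.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int sel_rpl(uint16_t sel)    { return sel & 3; }	/* requested privilege level */
static bool         sel_in_ldt(uint16_t sel) { return sel & (1 << 2); }	/* table indicator bit */
static unsigned int sel_index(uint16_t sel)  { return sel >> 3; }	/* descriptor table index */

/* Selectors 0..3 are "null": GDT index 0, only the RPL bits differ. */
static bool sel_is_null(uint16_t sel)
{
	return !sel_in_ldt(sel) && sel_index(sel) == 0;
}

/*
 * What the fixed MOV/POP/LSS path ends up permitting: a null SS only in
 * 64-bit mode, only when RPL == CPL, and never selector 3 (null with RPL=3),
 * which load_segment_descriptor() now rejects with #GP(0).
 */
static bool null_ss_allowed(uint16_t sel, unsigned int cpl, bool long_mode)
{
	return sel_is_null(sel) && long_mode && sel_rpl(sel) == cpl && sel != 3;
}

int main(void)
{
	printf("SS=0 at CPL0, 64-bit: %s\n", null_ss_allowed(0, 0, true)  ? "allowed" : "rejected");
	printf("SS=3 at CPL3, 64-bit: %s\n", null_ss_allowed(3, 3, true)  ? "allowed" : "rejected");
	printf("SS=0 at CPL0, 32-bit: %s\n", null_ss_allowed(0, 0, false) ? "allowed" : "rejected");
	return 0;
}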
+ +Reported-by: Xiaohan Zhang +Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011 +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 48 ++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 38 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -1544,7 +1544,6 @@ static int write_segment_descriptor(stru + &ctxt->exception); + } + +-/* Does not support long mode */ + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, + enum x86_transfer_type transfer, +@@ -1581,20 +1580,34 @@ static int __load_segment_descriptor(str + + rpl = selector & 3; + +- /* NULL selector is not valid for TR, CS and SS (except for long mode) */ +- if ((seg == VCPU_SREG_CS +- || (seg == VCPU_SREG_SS +- && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) +- || seg == VCPU_SREG_TR) +- && null_selector) +- goto exception; +- + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + +- if (null_selector) /* for NULL selector skip all following checks */ ++ /* NULL selector is not valid for TR, CS and (except for long mode) SS */ ++ if (null_selector) { ++ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) ++ goto exception; ++ ++ if (seg == VCPU_SREG_SS) { ++ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) ++ goto exception; ++ ++ /* ++ * ctxt->ops->set_segment expects the CPL to be in ++ * SS.DPL, so fake an expand-up 32-bit data segment. ++ */ ++ seg_desc.type = 3; ++ seg_desc.p = 1; ++ seg_desc.s = 1; ++ seg_desc.dpl = cpl; ++ seg_desc.d = 1; ++ seg_desc.g = 1; ++ } ++ ++ /* Skip all following checks */ + goto load; ++ } + + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); + if (ret != X86EMUL_CONTINUE) +@@ -1710,6 +1723,21 @@ static int load_segment_descriptor(struc + u16 selector, int seg) + { + u8 cpl = ctxt->ops->cpl(ctxt); ++ ++ /* ++ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but ++ * they can load it at CPL<3 (Intel's manual says only LSS can, ++ * but it's wrong). ++ * ++ * However, the Intel manual says that putting IST=1/DPL=3 in ++ * an interrupt gate will result in SS=3 (the AMD manual instead ++ * says it doesn't), so allow SS=3 in __load_segment_descriptor ++ * and only forbid it here. ++ */ ++ if (seg == VCPU_SREG_SS && selector == 3 && ++ ctxt->mode == X86EMUL_MODE_PROT64) ++ return emulate_exception(ctxt, GP_VECTOR, 0, true); ++ + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); + } diff --git a/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch b/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch new file mode 100644 index 00000000000..795a172db04 --- /dev/null +++ b/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch @@ -0,0 +1,126 @@ +From 546d87e5c903a7f3ee7b9f998949a94729fbc65b Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Tue, 3 Jan 2017 18:56:19 -0800 +Subject: KVM: x86: fix NULL deref in vcpu_scan_ioapic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Wanpeng Li + +commit 546d87e5c903a7f3ee7b9f998949a94729fbc65b upstream. + +Reported by syzkaller: + + BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 + IP: _raw_spin_lock+0xc/0x30 + PGD 3e28eb067 + PUD 3f0ac6067 + PMD 0 + Oops: 0002 [#1] SMP + CPU: 0 PID: 2431 Comm: test Tainted: G OE 4.10.0-rc1+ #3 + Call Trace: + ? 
kvm_ioapic_scan_entry+0x3e/0x110 [kvm] + kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm] + ? pick_next_task_fair+0xe1/0x4e0 + ? kvm_arch_vcpu_load+0xea/0x260 [kvm] + kvm_vcpu_ioctl+0x33a/0x600 [kvm] + ? hrtimer_try_to_cancel+0x29/0x130 + ? do_nanosleep+0x97/0xf0 + do_vfs_ioctl+0xa1/0x5d0 + ? __hrtimer_init+0x90/0x90 + ? do_nanosleep+0x5b/0xf0 + SyS_ioctl+0x79/0x90 + do_syscall_64+0x6e/0x180 + entry_SYSCALL64_slow_path+0x25/0x25 + RIP: _raw_spin_lock+0xc/0x30 RSP: ffffa43688973cc0 + +The syzkaller folks reported a NULL pointer dereference due to +ENABLE_CAP succeeding even without an irqchip. The Hyper-V +synthetic interrupt controller is activated, resulting in a +wrong request to rescan the ioapic and a NULL pointer dereference. + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifndef KVM_CAP_HYPERV_SYNIC + #define KVM_CAP_HYPERV_SYNIC 123 + #endif + + void* thr(void* arg) + { + struct kvm_enable_cap cap; + cap.flags = 0; + cap.cap = KVM_CAP_HYPERV_SYNIC; + ioctl((long)arg, KVM_ENABLE_CAP, &cap); + return 0; + } + + int main() + { + void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + int kvmfd = open("/dev/kvm", 0); + int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); + struct kvm_userspace_memory_region memreg; + memreg.slot = 0; + memreg.flags = 0; + memreg.guest_phys_addr = 0; + memreg.memory_size = 0x1000; + memreg.userspace_addr = (unsigned long)host_mem; + host_mem[0] = 0xf4; + ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg); + int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); + struct kvm_sregs sregs; + ioctl(cpufd, KVM_GET_SREGS, &sregs); + sregs.cr0 = 0; + sregs.cr4 = 0; + sregs.efer = 0; + sregs.cs.selector = 0; + sregs.cs.base = 0; + ioctl(cpufd, KVM_SET_SREGS, &sregs); + struct kvm_regs regs = { .rflags = 2 }; + ioctl(cpufd, KVM_SET_REGS, ®s); + ioctl(vmfd, KVM_CREATE_IRQCHIP, 0); + pthread_t th; + pthread_create(&th, 0, thr, (void*)(long)cpufd); + usleep(rand() % 10000); + ioctl(cpufd, KVM_RUN, 0); + pthread_join(th, 0); + return 0; + } + +This patch fixes it by failing ENABLE_CAP if without an irqchip. + +Reported-by: Dmitry Vyukov +Fixes: 5c919412fe61 (kvm/x86: Hyper-V synthetic interrupt controller) +Cc: Paolo Bonzini +Cc: Radim Krčmář +Cc: Dmitry Vyukov +Signed-off-by: Wanpeng Li +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3308,6 +3308,8 @@ static int kvm_vcpu_ioctl_enable_cap(str + + switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC: ++ if (!irqchip_in_kernel(vcpu->kvm)) ++ return -EINVAL; + return kvm_hv_activate_synic(vcpu); + default: + return -EINVAL; diff --git a/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch b/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch new file mode 100644 index 00000000000..c0c14b3966f --- /dev/null +++ b/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch @@ -0,0 +1,59 @@ +From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001 +From: David Matlack +Date: Fri, 16 Dec 2016 14:30:36 -0800 +Subject: KVM: x86: flush pending lapic jump label updates on module unload + +From: David Matlack + +commit cef84c302fe051744b983a92764d3fcca933415d upstream. + +KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled). 
+These are implemented with delayed_work structs which can still be +pending when the KVM module is unloaded. We've seen this cause kernel +panics when the kvm_intel module is quickly reloaded. + +Use the new static_key_deferred_flush() API to flush pending updates on +module unload. + +Signed-off-by: David Matlack +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/lapic.c | 6 ++++++ + arch/x86/kvm/lapic.h | 1 + + arch/x86/kvm/x86.c | 1 + + 3 files changed, 8 insertions(+) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2360,3 +2360,9 @@ void kvm_lapic_init(void) + jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); + } ++ ++void kvm_lapic_exit(void) ++{ ++ static_key_deferred_flush(&apic_hw_disabled); ++ static_key_deferred_flush(&apic_sw_disabled); ++} +--- a/arch/x86/kvm/lapic.h ++++ b/arch/x86/kvm/lapic.h +@@ -108,6 +108,7 @@ static inline bool kvm_hv_vapic_assist_p + + int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_init(void); ++void kvm_lapic_exit(void); + + #define VEC_POS(v) ((v) & (32 - 1)) + #define REG_POS(v) (((v) >> 5) << 4) +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5963,6 +5963,7 @@ out: + + void kvm_arch_exit(void) + { ++ kvm_lapic_exit(); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) diff --git a/queue-4.9/kvm-x86-introduce-segmented_write_std.patch b/queue-4.9/kvm-x86-introduce-segmented_write_std.patch new file mode 100644 index 00000000000..17f4f4466c4 --- /dev/null +++ b/queue-4.9/kvm-x86-introduce-segmented_write_std.patch @@ -0,0 +1,83 @@ +From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001 +From: Steve Rutherford +Date: Wed, 11 Jan 2017 18:28:29 -0800 +Subject: KVM: x86: Introduce segmented_write_std + +From: Steve Rutherford + +commit 129a72a0d3c8e139a04512325384fe5ac119e74d upstream. + +Introduces segemented_write_std. + +Switches from emulated reads/writes to standard read/writes in fxsave, +fxrstor, sgdt, and sidt. This fixes CVE-2017-2584, a longstanding +kernel memory leak. + +Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR", +2016-11-09), which is luckily not yet in any final release, this would +also be an exploitable kernel memory *write*! + +Reported-by: Dmitry Vyukov +Fixes: 96051572c819194c37a8367624b285be10297eca +Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62 +Suggested-by: Paolo Bonzini +Signed-off-by: Steve Rutherford +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -791,6 +791,20 @@ static int segmented_read_std(struct x86 + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); + } + ++static int segmented_write_std(struct x86_emulate_ctxt *ctxt, ++ struct segmented_address addr, ++ void *data, ++ unsigned int size) ++{ ++ int rc; ++ ulong linear; ++ ++ rc = linearize(ctxt, addr, size, true, &linear); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); ++} ++ + /* + * Prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. +@@ -3686,8 +3700,8 @@ static int emulate_store_desc_ptr(struct + } + /* Disable writeback. 
*/ + ctxt->dst.type = OP_NONE; +- return segmented_write(ctxt, ctxt->dst.addr.mem, +- &desc_ptr, 2 + ctxt->op_bytes); ++ return segmented_write_std(ctxt, ctxt->dst.addr.mem, ++ &desc_ptr, 2 + ctxt->op_bytes); + } + + static int em_sgdt(struct x86_emulate_ctxt *ctxt) +@@ -3933,7 +3947,7 @@ static int em_fxsave(struct x86_emulate_ + else + size = offsetof(struct fxregs_state, xmm_space[0]); + +- return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + } + + static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, +@@ -3975,7 +3989,7 @@ static int em_fxrstor(struct x86_emulate + if (rc != X86EMUL_CONTINUE) + return rc; + +- rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + diff --git a/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch b/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch new file mode 100644 index 00000000000..0e1b5e0297c --- /dev/null +++ b/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch @@ -0,0 +1,111 @@ +From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Tue, 10 Jan 2017 16:58:27 -0800 +Subject: mm/hugetlb.c: fix reservation race when freeing surplus pages + +From: Mike Kravetz + +commit e5bbc8a6c992901058bc09e2ce01d16c111ff047 upstream. + +return_unused_surplus_pages() decrements the global reservation count, +and frees any unused surplus pages that were backing the reservation. + +Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in +return_unused_surplus_pages()") added a call to cond_resched_lock in the +loop freeing the pages. + +As a result, the hugetlb_lock could be dropped, and someone else could +use the pages that will be freed in subsequent iterations of the loop. +This could result in inconsistent global hugetlb page state, application +api failures (such as mmap) failures or application crashes. + +When dropping the lock in return_unused_surplus_pages, make sure that +the global reservation count (resv_huge_pages) remains sufficiently +large to prevent someone else from claiming pages about to be freed. + +Analyzed by Paul Cassella. + +Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") +Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com +Signed-off-by: Mike Kravetz +Reported-by: Paul Cassella +Suggested-by: Michal Hocko +Cc: Masayoshi Mizuma +Cc: Naoya Horiguchi +Cc: Aneesh Kumar +Cc: Hillf Danton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1773,23 +1773,32 @@ free: + } + + /* +- * When releasing a hugetlb pool reservation, any surplus pages that were +- * allocated to satisfy the reservation must be explicitly freed if they were +- * never used. +- * Called with hugetlb_lock held. ++ * This routine has two main purposes: ++ * 1) Decrement the reservation count (resv_huge_pages) by the value passed ++ * in unused_resv_pages. This corresponds to the prior adjustments made ++ * to the associated reservation map. ++ * 2) Free any unused surplus pages that may have been allocated to satisfy ++ * the reservation. 
As many as unused_resv_pages may be freed. ++ * ++ * Called with hugetlb_lock held. However, the lock could be dropped (and ++ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, ++ * we must make sure nobody else can claim pages we are in the process of ++ * freeing. Do this by ensuring resv_huge_page always is greater than the ++ * number of huge pages we plan to free when dropping the lock. + */ + static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) + { + unsigned long nr_pages; + +- /* Uncommit the reservation */ +- h->resv_huge_pages -= unused_resv_pages; +- + /* Cannot return gigantic pages currently */ + if (hstate_is_gigantic(h)) +- return; ++ goto out; + ++ /* ++ * Part (or even all) of the reservation could have been backed ++ * by pre-allocated pages. Only free surplus pages. ++ */ + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); + + /* +@@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages( + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. ++ * ++ * Note that we decrement resv_huge_pages as we free the pages. If ++ * we drop the lock, resv_huge_pages will still be sufficiently large ++ * to cover subsequent pages we may free. + */ + while (nr_pages--) { ++ h->resv_huge_pages--; ++ unused_resv_pages--; + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) +- break; ++ goto out; + cond_resched_lock(&hugetlb_lock); + } ++ ++out: ++ /* Fully uncommit the reservation */ ++ h->resv_huge_pages -= unused_resv_pages; + } + + diff --git a/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch b/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch new file mode 100644 index 00000000000..7990062dde0 --- /dev/null +++ b/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch @@ -0,0 +1,273 @@ +From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Tue, 10 Jan 2017 16:58:04 -0800 +Subject: mm, memcg: fix the active list aging for lowmem requests when memcg is enabled + +From: Michal Hocko + +commit b4536f0c829c8586544c94735c343f9b5070bd01 upstream. + +Nils Holland and Klaus Ethgen have reported unexpected OOM killer +invocations with 32b kernel starting with 4.8 kernels + + kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 + kworker/u4:5 cpuset=/ mems_allowed=0 + CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 + [...] + Mem-Info: + active_anon:58685 inactive_anon:90 isolated_anon:0 + active_file:274324 inactive_file:281962 isolated_file:0 + unevictable:0 dirty:649 writeback:0 unstable:0 + slab_reclaimable:40662 slab_unreclaimable:17754 + mapped:7382 shmem:202 pagetables:351 bounce:0 + free:206736 free_pcp:332 free_cma:0 + Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? 
no + DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB + lowmem_reserve[]: 0 813 3474 3474 + Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB + lowmem_reserve[]: 0 0 21292 21292 + HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB + +the oom killer is clearly pre-mature because there there is still a lot +of page cache in the zone Normal which should satisfy this lowmem +request. Further debugging has shown that the reclaim cannot make any +forward progress because the page cache is hidden in the active list +which doesn't get rotated because inactive_list_is_low is not memcg +aware. + +The code simply subtracts per-zone highmem counters from the respective +memcg's lru sizes which doesn't make any sense. We can simply end up +always seeing the resulting active and inactive counts 0 and return +false. This issue is not limited to 32b kernels but in practice the +effect on systems without CONFIG_HIGHMEM would be much harder to notice +because we do not invoke the OOM killer for allocations requests +targeting < ZONE_NORMAL. + +Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node +and subtract per-memcg highmem counts when memcg is enabled. Introduce +helper lruvec_zone_lru_size which redirects to either zone counters or +mem_cgroup_get_zone_lru_size when appropriate. + +We are losing empty LRU but non-zero lru size detection introduced by +ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because +of the inherent zone vs. node discrepancy. 
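To make the arithmetic concrete, a standalone sketch with invented numbers (not taken from the report above): subtracting the global per-zone highmem counters from a memcg's node-wide LRU sizes clamps both counts to zero, so inactive_list_is_low() returns false and the active list is never aged, while subtracting the memcg's own per-zone counts keeps the lowmem page cache visible.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* memcg-local, node-wide LRU counts for one cgroup (pages) */
	unsigned long memcg_inactive = 40000, memcg_active = 35000;
	/* global per-zone counts for the ineligible HighMem zone */
	unsigned long zone_inactive = 280000, zone_active = 270000;
	/* the memcg's own share of that HighMem zone */
	unsigned long memcg_hi_inactive = 30000, memcg_hi_active = 5000;
	unsigned long inactive, active;

	/* pre-fix: global highmem counters are subtracted from memcg sizes */
	inactive = memcg_inactive - min_ul(memcg_inactive, zone_inactive);
	active   = memcg_active   - min_ul(memcg_active, zone_active);
	printf("broken: inactive=%lu active=%lu -> nothing appears worth rotating\n",
	       inactive, active);

	/* post-fix: the memcg's per-zone counts are subtracted instead */
	inactive = memcg_inactive - min_ul(memcg_inactive, memcg_hi_inactive);
	active   = memcg_active   - min_ul(memcg_active, memcg_hi_active);
	printf("fixed:  inactive=%lu active=%lu -> the lowmem file pages are seen\n",
	       inactive, active);
	return 0;
}

In the fixed kernel those per-memcg, per-zone counts come from the new lru_zone_size[][] array through lruvec_zone_lru_size()/mem_cgroup_get_zone_lru_size().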
+ +Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") +Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org +Signed-off-by: Michal Hocko +Reported-by: Nils Holland +Tested-by: Nils Holland +Reported-by: Klaus Ethgen +Acked-by: Minchan Kim +Acked-by: Mel Gorman +Acked-by: Johannes Weiner +Reviewed-by: Vladimir Davydov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- + include/linux/mm_inline.h | 2 +- + mm/memcontrol.c | 18 ++++++++---------- + mm/vmscan.c | 27 +++++++++++++++++---------- + 4 files changed, 49 insertions(+), 24 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { + */ + struct mem_cgroup_per_node { + struct lruvec lruvec; +- unsigned long lru_size[NR_LRU_LISTS]; ++ unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + + struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + +@@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(str + int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); + + void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, +- int nr_pages); ++ int zid, int nr_pages); + + unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask); +@@ -441,9 +441,23 @@ static inline + unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) + { + struct mem_cgroup_per_node *mz; ++ unsigned long nr_pages = 0; ++ int zid; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); +- return mz->lru_size[lru]; ++ for (zid = 0; zid < MAX_NR_ZONES; zid++) ++ nr_pages += mz->lru_zone_size[zid][lru]; ++ return nr_pages; ++} ++ ++static inline ++unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, int zone_idx) ++{ ++ struct mem_cgroup_per_node *mz; ++ ++ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); ++ return mz->lru_zone_size[zone_idx][lru]; + } + + void mem_cgroup_handle_over_high(void); +@@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *l + { + return 0; + } ++static inline ++unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, int zone_idx) ++{ ++ return 0; ++} + + static inline unsigned long + mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -39,7 +39,7 @@ static __always_inline void update_lru_s + { + __update_lru_size(lruvec, lru, zid, nr_pages); + #ifdef CONFIG_MEMCG +- mem_cgroup_update_lru_size(lruvec, lru, nr_pages); ++ mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); + #endif + } + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics + unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) + { ++ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; +- struct mem_cgroup_per_node *mz; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); +@@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pag + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; +- mz = mem_cgroup_nodeinfo(memcg, nid); +- nr += mz->lru_size[lru]; ++ nr += mem_cgroup_get_lru_size(lruvec, lru); + } + return nr; + } +@@ -1002,6 +1001,7 @@ out: + * mem_cgroup_update_lru_size - account for adding or 
removing an lru page + * @lruvec: mem_cgroup per zone lru vector + * @lru: index of lru list the page is sitting on ++ * @zid: zone id of the accounted pages + * @nr_pages: positive when adding or negative when removing + * + * This function must be called under lru_lock, just before a page is added +@@ -1009,27 +1009,25 @@ out: + * so as to allow it to check that lru_size 0 is consistent with list_empty). + */ + void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, +- int nr_pages) ++ int zid, int nr_pages) + { + struct mem_cgroup_per_node *mz; + unsigned long *lru_size; + long size; +- bool empty; + + if (mem_cgroup_disabled()) + return; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); +- lru_size = mz->lru_size + lru; +- empty = list_empty(lruvec->lists + lru); ++ lru_size = &mz->lru_zone_size[zid][lru]; + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; +- if (WARN_ONCE(size < 0 || empty != !size, +- "%s(%p, %d, %d): lru_size %ld but %sempty\n", +- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { ++ if (WARN_ONCE(size < 0, ++ "%s(%p, %d, %d): lru_size %ld\n", ++ __func__, lruvec, lru, nr_pages, size)) { + VM_BUG_ON(1); + *lru_size = 0; + } +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lru + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); + } + ++unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, ++ int zone_idx) ++{ ++ if (!mem_cgroup_disabled()) ++ return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); ++ ++ return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], ++ NR_ZONE_LRU_BASE + lru); ++} ++ + /* + * Add a shrinker callback to be called from the vm. + */ +@@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page + * be complete before mem_cgroup_update_lru_size due to a santity check. 
+ */ + static __always_inline void update_lru_sizes(struct lruvec *lruvec, +- enum lru_list lru, unsigned long *nr_zone_taken, +- unsigned long nr_taken) ++ enum lru_list lru, unsigned long *nr_zone_taken) + { + int zid; + +@@ -1392,11 +1401,11 @@ static __always_inline void update_lru_s + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +- } +- + #ifdef CONFIG_MEMCG +- mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); ++ mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + #endif ++ } ++ + } + + /* +@@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(u + *nr_scanned = scan; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); +- update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); ++ update_lru_sizes(lruvec, lru, nr_zone_taken); + return nr_taken; + } + +@@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct + if (!managed_zone(zone)) + continue; + +- inactive_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE)); +- active_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); ++ inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); ++ active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); + + inactive -= min(inactive, inactive_zone); + active -= min(active, active_zone); diff --git a/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch b/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch new file mode 100644 index 00000000000..e8959653252 --- /dev/null +++ b/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch @@ -0,0 +1,64 @@ +From c4e490cf148e85ead0d1b1c2caaba833f1d5b29f Mon Sep 17 00:00:00 2001 +From: John Sperbeck +Date: Tue, 10 Jan 2017 16:58:24 -0800 +Subject: mm/slab.c: fix SLAB freelist randomization duplicate entries + +From: John Sperbeck + +commit c4e490cf148e85ead0d1b1c2caaba833f1d5b29f upstream. + +This patch fixes a bug in the freelist randomization code. When a high +random number is used, the freelist will contain duplicate entries. It +will result in different allocations sharing the same chunk. + +It will result in odd behaviours and crashes. It should be uncommon but +it depends on the machines. We saw it happening more often on some +machines (every few hours of running tests). 
+ +Fixes: c7ce4f60ac19 ("mm: SLAB freelist randomization") +Link: http://lkml.kernel.org/r/20170103181908.143178-1-thgarnie@google.com +Signed-off-by: John Sperbeck +Signed-off-by: Thomas Garnier +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/slab.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -2475,7 +2475,6 @@ union freelist_init_state { + unsigned int pos; + unsigned int *list; + unsigned int count; +- unsigned int rand; + }; + struct rnd_state rnd_state; + }; +@@ -2501,8 +2500,7 @@ static bool freelist_state_initialize(un + } else { + state->list = cachep->random_seq; + state->count = count; +- state->pos = 0; +- state->rand = rand; ++ state->pos = rand % count; + ret = true; + } + return ret; +@@ -2511,7 +2509,9 @@ static bool freelist_state_initialize(un + /* Get the next entry on the list and randomize it using a random shift */ + static freelist_idx_t next_random_slot(union freelist_init_state *state) + { +- return (state->list[state->pos++] + state->rand) % state->count; ++ if (state->pos >= state->count) ++ state->pos = 0; ++ return state->list[state->pos++]; + } + + /* Swap two freelist entries */ diff --git a/queue-4.9/mm-support-anonymous-stable-page.patch b/queue-4.9/mm-support-anonymous-stable-page.patch new file mode 100644 index 00000000000..2d7a88398b1 --- /dev/null +++ b/queue-4.9/mm-support-anonymous-stable-page.patch @@ -0,0 +1,138 @@ +From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Tue, 10 Jan 2017 16:58:15 -0800 +Subject: mm: support anonymous stable page + +From: Minchan Kim + +commit f05714293a591038304ddae7cb0dd747bb3786cc upstream. + +During developemnt for zram-swap asynchronous writeback, I found strange +corruption of compressed page, resulting in: + + Modules linked in: zram(E) + CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 + task: ffff88007620b840 task.stack: ffff880078090000 + RIP: set_freeobj.part.43+0x1c/0x1f + RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 + RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 + RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 + RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 + R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 + R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 + FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 + Call Trace: + obj_malloc+0x22b/0x260 + zs_malloc+0x1e4/0x580 + zram_bvec_rw+0x4cd/0x830 [zram] + page_requests_rw+0x9c/0x130 [zram] + zram_thread+0xe6/0x173 [zram] + kthread+0xca/0xe0 + ret_from_fork+0x25/0x30 + +With investigation, it reveals currently stable page doesn't support +anonymous page. IOW, reuse_swap_page can reuse the page without waiting +writeback completion so it can overwrite page zram is compressing. + +Unfortunately, zram has used per-cpu stream feature from v4.7. +It aims for increasing cache hit ratio of scratch buffer for +compressing. 
Downside of that approach is that zram should ask +memory space for compressed page in per-cpu context which requires +stricted gfp flag which could be failed. If so, it retries to +allocate memory space out of per-cpu context so it could get memory +this time and compress the data again, copies it to the memory space. + +In this scenario, zram assumes the data should never be changed +but it is not true unless stable page supports. So, If the data is +changed under us, zram can make buffer overrun because second +compression size could be bigger than one we got in previous trial +and blindly, copy bigger size object to smaller buffer which is +buffer overrun. The overrun breaks zsmalloc free object chaining +so system goes crash like above. + +I think below is same problem. +https://bugzilla.suse.com/show_bug.cgi?id=997574 + +Unfortunately, reuse_swap_page should be atomic so that we cannot wait on +writeback in there so the approach in this patch is simply return false if +we found it needs stable page. Although it increases memory footprint +temporarily, it happens rarely and it should be reclaimed easily althoug +it happened. Also, It would be better than waiting of IO completion, +which is critial path for application latency. + +Fixes: da9556a2367c ("zram: user per-cpu compression streams") +Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox +Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org +Signed-off-by: Minchan Kim +Acked-by: Hugh Dickins +Cc: Sergey Senozhatsky +Cc: Darrick J. Wong +Cc: Takashi Iwai +Cc: Hyeoncheol Lee +Cc: +Cc: Sangseok Lee +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 3 ++- + mm/swapfile.c | 20 +++++++++++++++++++- + 2 files changed, 21 insertions(+), 2 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -150,8 +150,9 @@ enum { + SWP_FILE = (1 << 7), /* set after swap_activate success */ + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ ++ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + /* add others here before... 
*/ +- SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ ++ SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + }; + + #define SWAP_CLUSTER_MAX 32UL +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, + count = page_trans_huge_mapcount(page, total_mapcount); + if (count <= 1 && PageSwapCache(page)) { + count += page_swapcount(page); +- if (count == 1 && !PageWriteback(page)) { ++ if (count != 1) ++ goto out; ++ if (!PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); ++ } else { ++ swp_entry_t entry; ++ struct swap_info_struct *p; ++ ++ entry.val = page_private(page); ++ p = swap_info_get(entry); ++ if (p->flags & SWP_STABLE_WRITES) { ++ spin_unlock(&p->lock); ++ return false; ++ } ++ spin_unlock(&p->lock); + } + } ++out: + return count <= 1; + } + +@@ -2449,6 +2463,10 @@ SYSCALL_DEFINE2(swapon, const char __use + error = -ENOMEM; + goto bad_swap; + } ++ ++ if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) ++ p->flags |= SWP_STABLE_WRITES; ++ + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + diff --git a/queue-4.9/series b/queue-4.9/series index a4f80977726..a68230699e2 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -15,3 +15,13 @@ dax-fix-deadlock-with-dax-4k-holes.patch mm-pmd-dirty-emulation-in-page-fault-handler.patch mm-fix-devm_memremap_pages-crash-use-mem_hotplug_-begin-done.patch ocfs2-fix-crash-caused-by-stale-lvb-with-fsdlm-plugin.patch +mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch +mm-support-anonymous-stable-page.patch +mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch +mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch +kvm-x86-fix-emulation-of-mov-ss-null-selector.patch +kvm-eventfd-fix-null-deref-irqbypass-consumer.patch +kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch +kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch +kvm-x86-emulate-fxsave-and-fxrstor.patch +kvm-x86-introduce-segmented_write_std.patch