From: Greg Kroah-Hartman Date: Tue, 17 Jan 2017 10:02:09 +0000 (+0100) Subject: 4.9-stable patches X-Git-Tag: v4.9.5~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5cfd2ceaacdca9d6474022288a7df6d59545b065;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: kvm-eventfd-fix-null-deref-irqbypass-consumer.patch kvm-x86-emulate-fxsave-and-fxrstor.patch kvm-x86-fix-emulation-of-mov-ss-null-selector.patch kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch kvm-x86-introduce-segmented_write_std.patch mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch mm-support-anonymous-stable-page.patch --- diff --git a/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch b/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch new file mode 100644 index 00000000000..a578b9a501d --- /dev/null +++ b/queue-4.9/kvm-eventfd-fix-null-deref-irqbypass-consumer.patch @@ -0,0 +1,83 @@ +From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Thu, 5 Jan 2017 17:39:42 -0800 +Subject: KVM: eventfd: fix NULL deref irqbypass consumer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Wanpeng Li + +commit 4f3dbdf47e150016aacd734e663347fcaa768303 upstream. + +Reported syzkaller: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 + IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + PGD 0 + + Oops: 0002 [#1] SMP + CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1 + Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] + task: ffff9bbe0dfbb900 task.stack: ffffb61802014000 + RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] + Call Trace: + irqfd_shutdown+0x66/0xa0 [kvm] + process_one_work+0x16b/0x480 + worker_thread+0x4b/0x500 + kthread+0x101/0x140 + ? process_one_work+0x480/0x480 + ? kthread_create_on_node+0x60/0x60 + ret_from_fork+0x25/0x30 + RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20 + CR2: 0000000000000008 + +The syzkaller folks reported a NULL pointer dereference that due to +unregister an consumer which fails registration before. The syzkaller +creates two VMs w/ an equal eventfd occasionally. So the second VM +fails to register an irqbypass consumer. It will make irqfd as inactive +and queue an workqueue work to shutdown irqfd and unregister the irqbypass +consumer when eventfd is closed. However, the second consumer has been +initialized though it fails registration. So the token(same as the first +VM's) is taken to unregister the consumer through the workqueue, the +consumer of the first VM is found and unregistered, then NULL deref incurred +in the path of deleting consumer from the consumers list. + +This patch fixes it by making irq_bypass_register/unregister_consumer() +looks for the consumer entry based on consumer pointer itself instead of +token matching. 
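For illustration only, a minimal userspace sketch of the scenario the patch addresses (hypothetical names, not kernel code): two VMs resolve the same eventfd, so their irqbypass consumers share a token; the second registration fails, yet its later teardown unregisters by token and therefore operates on the first VM's consumer instead, which is what ends in the NULL dereference above. Keying both lookups on the consumer pointer, as the diff below does, turns the teardown of the never-registered consumer into a safe no-op.

#include <stdio.h>

struct consumer {
	struct consumer *next;
	void *token;			/* the shared eventfd context in the real code */
};

static struct consumer *consumers;	/* stands in for the global consumers list */

static int register_consumer(struct consumer *c)
{
	struct consumer *tmp;

	for (tmp = consumers; tmp; tmp = tmp->next)
		if (tmp->token == c->token || tmp == c)	/* duplicate token or same consumer */
			return -1;			/* -EBUSY: VM2 fails here */
	c->next = consumers;
	consumers = c;
	return 0;
}

static void unregister_consumer(struct consumer *c)
{
	struct consumer **pp;

	for (pp = &consumers; *pp; pp = &(*pp)->next)
		/* Matching on (*pp)->token == c->token here would unlink VM1's
		 * consumer while tearing down VM2's never-registered one;
		 * matching the pointer itself makes that call a no-op. */
		if (*pp == c) {
			*pp = (*pp)->next;
			return;
		}
}

int main(void)
{
	int eventfd_ctx = 42;				/* both VMs opened the same eventfd */
	struct consumer vm1 = { .token = &eventfd_ctx };
	struct consumer vm2 = { .token = &eventfd_ctx };

	printf("VM1 register: %d\n", register_consumer(&vm1));	/* 0 */
	printf("VM2 register: %d\n", register_consumer(&vm2));	/* -1 (-EBUSY) */
	unregister_consumer(&vm2);				/* irqfd shutdown of VM2 */
	printf("VM1 still registered: %s\n", consumers == &vm1 ? "yes" : "no");
	return 0;
}

Compiled as plain C99 this prints "yes" on the last line only because the lookup keys on the pointer; keyed on the token it would unlink VM1's consumer, which is the pre-fix behaviour.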
+ +Reported-by: Dmitry Vyukov +Suggested-by: Alex Williamson +Cc: Paolo Bonzini +Cc: Radim Krčmář +Cc: Dmitry Vyukov +Cc: Alex Williamson +Signed-off-by: Wanpeng Li +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + virt/lib/irqbypass.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/virt/lib/irqbypass.c ++++ b/virt/lib/irqbypass.c +@@ -195,7 +195,7 @@ int irq_bypass_register_consumer(struct + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token == consumer->token) { ++ if (tmp->token == consumer->token || tmp == consumer) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; +@@ -245,7 +245,7 @@ void irq_bypass_unregister_consumer(stru + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { +- if (tmp->token != consumer->token) ++ if (tmp != consumer) + continue; + + list_for_each_entry(producer, &producers, node) { diff --git a/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch b/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch new file mode 100644 index 00000000000..c3ea7c53e64 --- /dev/null +++ b/queue-4.9/kvm-x86-emulate-fxsave-and-fxrstor.patch @@ -0,0 +1,187 @@ +From 283c95d0e3891b64087706b344a4b545d04a6e62 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Wed, 9 Nov 2016 19:07:06 +0100 +Subject: KVM: x86: emulate FXSAVE and FXRSTOR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Radim Krčmář + +commit 283c95d0e3891b64087706b344a4b545d04a6e62 upstream. + +Internal errors were reported on 16 bit fxsave and fxrstor with ipxe. +Old Intels don't have unrestricted_guest, so we have to emulate them. + +The patch takes advantage of the hardware implementation. + +AMD and Intel differ in saving and restoring other fields in first 32 +bytes. A test wrote 0xff to the fxsave area, 0 to upper bits of MCSXR +in the fxsave area, executed fxrstor, rewrote the fxsave area to 0xee, +and executed fxsave: + + Intel (Nehalem): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff ff ff 00 00 + ff ff ff ff ff ff 00 00 ff ff 00 00 ff ff 00 00 + Intel (Haswell -- deprecated FPU CS and FPU DS): + 7f 1f 7f 7f ff 00 ff 07 ff ff ff ff 00 00 00 00 + ff ff ff ff 00 00 00 00 ff ff 00 00 ff ff 00 00 + AMD (Opteron 2300-series): + 7f 1f 7f 7f ff 00 ee ee ee ee ee ee ee ee ee ee + ee ee ee ee ee ee ee ee ff ff 00 00 ff ff 02 00 + +fxsave/fxrstor will only be emulated on early Intels, so KVM can't do +much to improve the situation. + +Signed-off-by: Radim Krčmář +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 128 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -3870,6 +3870,131 @@ static int em_movsxd(struct x86_emulate_ + return X86EMUL_CONTINUE; + } + ++static int check_fxsr(struct x86_emulate_ctxt *ctxt) ++{ ++ u32 eax = 1, ebx, ecx = 0, edx; ++ ++ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); ++ if (!(edx & FFL(FXSR))) ++ return emulate_ud(ctxt); ++ ++ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) ++ return emulate_nm(ctxt); ++ ++ /* ++ * Don't emulate a case that should never be hit, instead of working ++ * around a lack of fxsave64/fxrstor64 on old compilers. 
++ */ ++ if (ctxt->mode >= X86EMUL_MODE_PROT64) ++ return X86EMUL_UNHANDLEABLE; ++ ++ return X86EMUL_CONTINUE; ++} ++ ++/* ++ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, ++ * 1) 16 bit mode ++ * 2) 32 bit mode ++ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs ++ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. ++ * save and restore ++ * 3) 64-bit mode with REX.W prefix ++ * - like (2), but XMM 8-15 are being saved and restored ++ * 4) 64-bit mode without REX.W prefix ++ * - like (3), but FIP and FDP are 64 bit ++ * ++ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the ++ * desired result. (4) is not emulated. ++ * ++ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS ++ * and FPU DS) should match. ++ */ ++static int em_fxsave(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ size_t size; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) ++ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); ++ else ++ size = offsetof(struct fxregs_state, xmm_space[0]); ++ ++ return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++} ++ ++static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, ++ struct fxregs_state *new) ++{ ++ int rc = X86EMUL_CONTINUE; ++ struct fxregs_state old; ++ ++ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ /* ++ * 64 bit host will restore XMM 8-15, which is not correct on non-64 ++ * bit guests. Load the current values in order to preserve 64 bit ++ * XMMs after fxrstor. ++ */ ++#ifdef CONFIG_X86_64 ++ /* XXX: accessing XMM 8-15 very awkwardly */ ++ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); ++#endif ++ ++ /* ++ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but ++ * does save and restore MXCSR. 
++ */ ++ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) ++ memcpy(new->xmm_space, old.xmm_space, 8 * 16); ++ ++ return rc; ++} ++ ++static int em_fxrstor(struct x86_emulate_ctxt *ctxt) ++{ ++ struct fxregs_state fx_state; ++ int rc; ++ ++ rc = check_fxsr(ctxt); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ if (fx_state.mxcsr >> 16) ++ return emulate_gp(ctxt, 0); ++ ++ ctxt->ops->get_fpu(ctxt); ++ ++ if (ctxt->mode < X86EMUL_MODE_PROT64) ++ rc = fxrstor_fixup(ctxt, &fx_state); ++ ++ if (rc == X86EMUL_CONTINUE) ++ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); ++ ++ ctxt->ops->put_fpu(ctxt); ++ ++ return rc; ++} ++ + static bool valid_cr(int nr) + { + switch (nr) { +@@ -4222,7 +4347,9 @@ static const struct gprefix pfx_0f_ae_7 + }; + + static const struct group_dual group15 = { { +- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), ++ I(ModRM | Aligned16, em_fxsave), ++ I(ModRM | Aligned16, em_fxrstor), ++ N, N, N, N, N, GP(0, &pfx_0f_ae_7), + }, { + N, N, N, N, N, N, N, N, + } }; diff --git a/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch b/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch new file mode 100644 index 00000000000..06efe007715 --- /dev/null +++ b/queue-4.9/kvm-x86-fix-emulation-of-mov-ss-null-selector.patch @@ -0,0 +1,107 @@ +From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 12 Jan 2017 15:02:32 +0100 +Subject: KVM: x86: fix emulation of "MOV SS, null selector" + +From: Paolo Bonzini + +commit 33ab91103b3415e12457e3104f0e4517ce12d0f3 upstream. + +This is CVE-2017-2583. On Intel this causes a failed vmentry because +SS's type is neither 3 nor 7 (even though the manual says this check is +only done for usable SS, and the dmesg splat says that SS is unusable!). +On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb. + +The fix fabricates a data segment descriptor when SS is set to a null +selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb. +Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3; +this in turn ensures CPL < 3 because RPL must be equal to CPL. + +Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing +the bug and deciphering the manuals. 
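As background for the checks added below, a hedged sketch of the selector anatomy involved (helper names invented here, not taken from the patch): a null selector is GDT index 0 with any RPL, and after the fix the MOV SS/POP SS/LSS path only accepts one in 64-bit mode, with RPL equal to CPL, and never selector 3.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int sel_rpl(uint16_t sel)    { return sel & 3; }	/* requested privilege level */
static bool         sel_in_ldt(uint16_t sel) { return sel & (1 << 2); }	/* table indicator bit */
static unsigned int sel_index(uint16_t sel)  { return sel >> 3; }	/* descriptor table index */

/* Selectors 0..3 are "null": GDT index 0, only the RPL bits differ. */
static bool sel_is_null(uint16_t sel)
{
	return !sel_in_ldt(sel) && sel_index(sel) == 0;
}

/*
 * What the fixed MOV/POP/LSS path ends up permitting: a null SS only in
 * 64-bit mode, only when RPL == CPL, and never selector 3 (null with RPL=3),
 * which load_segment_descriptor() now rejects with #GP(0).
 */
static bool null_ss_allowed(uint16_t sel, unsigned int cpl, bool long_mode)
{
	return sel_is_null(sel) && long_mode && sel_rpl(sel) == cpl && sel != 3;
}

int main(void)
{
	printf("SS=0 at CPL0, 64-bit: %s\n", null_ss_allowed(0, 0, true)  ? "allowed" : "rejected");
	printf("SS=3 at CPL3, 64-bit: %s\n", null_ss_allowed(3, 3, true)  ? "allowed" : "rejected");
	printf("SS=0 at CPL0, 32-bit: %s\n", null_ss_allowed(0, 0, false) ? "allowed" : "rejected");
	return 0;
}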
+ +Reported-by: Xiaohan Zhang +Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011 +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 48 ++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 38 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -1544,7 +1544,6 @@ static int write_segment_descriptor(stru + &ctxt->exception); + } + +-/* Does not support long mode */ + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, int seg, u8 cpl, + enum x86_transfer_type transfer, +@@ -1581,20 +1580,34 @@ static int __load_segment_descriptor(str + + rpl = selector & 3; + +- /* NULL selector is not valid for TR, CS and SS (except for long mode) */ +- if ((seg == VCPU_SREG_CS +- || (seg == VCPU_SREG_SS +- && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) +- || seg == VCPU_SREG_TR) +- && null_selector) +- goto exception; +- + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + +- if (null_selector) /* for NULL selector skip all following checks */ ++ /* NULL selector is not valid for TR, CS and (except for long mode) SS */ ++ if (null_selector) { ++ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) ++ goto exception; ++ ++ if (seg == VCPU_SREG_SS) { ++ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) ++ goto exception; ++ ++ /* ++ * ctxt->ops->set_segment expects the CPL to be in ++ * SS.DPL, so fake an expand-up 32-bit data segment. ++ */ ++ seg_desc.type = 3; ++ seg_desc.p = 1; ++ seg_desc.s = 1; ++ seg_desc.dpl = cpl; ++ seg_desc.d = 1; ++ seg_desc.g = 1; ++ } ++ ++ /* Skip all following checks */ + goto load; ++ } + + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); + if (ret != X86EMUL_CONTINUE) +@@ -1710,6 +1723,21 @@ static int load_segment_descriptor(struc + u16 selector, int seg) + { + u8 cpl = ctxt->ops->cpl(ctxt); ++ ++ /* ++ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but ++ * they can load it at CPL<3 (Intel's manual says only LSS can, ++ * but it's wrong). ++ * ++ * However, the Intel manual says that putting IST=1/DPL=3 in ++ * an interrupt gate will result in SS=3 (the AMD manual instead ++ * says it doesn't), so allow SS=3 in __load_segment_descriptor ++ * and only forbid it here. ++ */ ++ if (seg == VCPU_SREG_SS && selector == 3 && ++ ctxt->mode == X86EMUL_MODE_PROT64) ++ return emulate_exception(ctxt, GP_VECTOR, 0, true); ++ + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); + } diff --git a/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch b/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch new file mode 100644 index 00000000000..795a172db04 --- /dev/null +++ b/queue-4.9/kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch @@ -0,0 +1,126 @@ +From 546d87e5c903a7f3ee7b9f998949a94729fbc65b Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Tue, 3 Jan 2017 18:56:19 -0800 +Subject: KVM: x86: fix NULL deref in vcpu_scan_ioapic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Wanpeng Li + +commit 546d87e5c903a7f3ee7b9f998949a94729fbc65b upstream. + +Reported by syzkaller: + + BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 + IP: _raw_spin_lock+0xc/0x30 + PGD 3e28eb067 + PUD 3f0ac6067 + PMD 0 + Oops: 0002 [#1] SMP + CPU: 0 PID: 2431 Comm: test Tainted: G OE 4.10.0-rc1+ #3 + Call Trace: + ? 
kvm_ioapic_scan_entry+0x3e/0x110 [kvm] + kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm] + ? pick_next_task_fair+0xe1/0x4e0 + ? kvm_arch_vcpu_load+0xea/0x260 [kvm] + kvm_vcpu_ioctl+0x33a/0x600 [kvm] + ? hrtimer_try_to_cancel+0x29/0x130 + ? do_nanosleep+0x97/0xf0 + do_vfs_ioctl+0xa1/0x5d0 + ? __hrtimer_init+0x90/0x90 + ? do_nanosleep+0x5b/0xf0 + SyS_ioctl+0x79/0x90 + do_syscall_64+0x6e/0x180 + entry_SYSCALL64_slow_path+0x25/0x25 + RIP: _raw_spin_lock+0xc/0x30 RSP: ffffa43688973cc0 + +The syzkaller folks reported a NULL pointer dereference due to +ENABLE_CAP succeeding even without an irqchip. The Hyper-V +synthetic interrupt controller is activated, resulting in a +wrong request to rescan the ioapic and a NULL pointer dereference. + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifndef KVM_CAP_HYPERV_SYNIC + #define KVM_CAP_HYPERV_SYNIC 123 + #endif + + void* thr(void* arg) + { + struct kvm_enable_cap cap; + cap.flags = 0; + cap.cap = KVM_CAP_HYPERV_SYNIC; + ioctl((long)arg, KVM_ENABLE_CAP, &cap); + return 0; + } + + int main() + { + void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + int kvmfd = open("/dev/kvm", 0); + int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); + struct kvm_userspace_memory_region memreg; + memreg.slot = 0; + memreg.flags = 0; + memreg.guest_phys_addr = 0; + memreg.memory_size = 0x1000; + memreg.userspace_addr = (unsigned long)host_mem; + host_mem[0] = 0xf4; + ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg); + int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); + struct kvm_sregs sregs; + ioctl(cpufd, KVM_GET_SREGS, &sregs); + sregs.cr0 = 0; + sregs.cr4 = 0; + sregs.efer = 0; + sregs.cs.selector = 0; + sregs.cs.base = 0; + ioctl(cpufd, KVM_SET_SREGS, &sregs); + struct kvm_regs regs = { .rflags = 2 }; + ioctl(cpufd, KVM_SET_REGS, ®s); + ioctl(vmfd, KVM_CREATE_IRQCHIP, 0); + pthread_t th; + pthread_create(&th, 0, thr, (void*)(long)cpufd); + usleep(rand() % 10000); + ioctl(cpufd, KVM_RUN, 0); + pthread_join(th, 0); + return 0; + } + +This patch fixes it by failing ENABLE_CAP if without an irqchip. + +Reported-by: Dmitry Vyukov +Fixes: 5c919412fe61 (kvm/x86: Hyper-V synthetic interrupt controller) +Cc: Paolo Bonzini +Cc: Radim Krčmář +Cc: Dmitry Vyukov +Signed-off-by: Wanpeng Li +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3308,6 +3308,8 @@ static int kvm_vcpu_ioctl_enable_cap(str + + switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC: ++ if (!irqchip_in_kernel(vcpu->kvm)) ++ return -EINVAL; + return kvm_hv_activate_synic(vcpu); + default: + return -EINVAL; diff --git a/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch b/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch new file mode 100644 index 00000000000..c0c14b3966f --- /dev/null +++ b/queue-4.9/kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch @@ -0,0 +1,59 @@ +From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001 +From: David Matlack +Date: Fri, 16 Dec 2016 14:30:36 -0800 +Subject: KVM: x86: flush pending lapic jump label updates on module unload + +From: David Matlack + +commit cef84c302fe051744b983a92764d3fcca933415d upstream. + +KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled). 
+These are implemented with delayed_work structs which can still be +pending when the KVM module is unloaded. We've seen this cause kernel +panics when the kvm_intel module is quickly reloaded. + +Use the new static_key_deferred_flush() API to flush pending updates on +module unload. + +Signed-off-by: David Matlack +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/lapic.c | 6 ++++++ + arch/x86/kvm/lapic.h | 1 + + arch/x86/kvm/x86.c | 1 + + 3 files changed, 8 insertions(+) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2360,3 +2360,9 @@ void kvm_lapic_init(void) + jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); + } ++ ++void kvm_lapic_exit(void) ++{ ++ static_key_deferred_flush(&apic_hw_disabled); ++ static_key_deferred_flush(&apic_sw_disabled); ++} +--- a/arch/x86/kvm/lapic.h ++++ b/arch/x86/kvm/lapic.h +@@ -108,6 +108,7 @@ static inline bool kvm_hv_vapic_assist_p + + int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_init(void); ++void kvm_lapic_exit(void); + + #define VEC_POS(v) ((v) & (32 - 1)) + #define REG_POS(v) (((v) >> 5) << 4) +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5963,6 +5963,7 @@ out: + + void kvm_arch_exit(void) + { ++ kvm_lapic_exit(); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) diff --git a/queue-4.9/kvm-x86-introduce-segmented_write_std.patch b/queue-4.9/kvm-x86-introduce-segmented_write_std.patch new file mode 100644 index 00000000000..17f4f4466c4 --- /dev/null +++ b/queue-4.9/kvm-x86-introduce-segmented_write_std.patch @@ -0,0 +1,83 @@ +From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001 +From: Steve Rutherford +Date: Wed, 11 Jan 2017 18:28:29 -0800 +Subject: KVM: x86: Introduce segmented_write_std + +From: Steve Rutherford + +commit 129a72a0d3c8e139a04512325384fe5ac119e74d upstream. + +Introduces segemented_write_std. + +Switches from emulated reads/writes to standard read/writes in fxsave, +fxrstor, sgdt, and sidt. This fixes CVE-2017-2584, a longstanding +kernel memory leak. + +Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR", +2016-11-09), which is luckily not yet in any final release, this would +also be an exploitable kernel memory *write*! + +Reported-by: Dmitry Vyukov +Fixes: 96051572c819194c37a8367624b285be10297eca +Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62 +Suggested-by: Paolo Bonzini +Signed-off-by: Steve Rutherford +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -791,6 +791,20 @@ static int segmented_read_std(struct x86 + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); + } + ++static int segmented_write_std(struct x86_emulate_ctxt *ctxt, ++ struct segmented_address addr, ++ void *data, ++ unsigned int size) ++{ ++ int rc; ++ ulong linear; ++ ++ rc = linearize(ctxt, addr, size, true, &linear); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); ++} ++ + /* + * Prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. +@@ -3686,8 +3700,8 @@ static int emulate_store_desc_ptr(struct + } + /* Disable writeback. 
*/ + ctxt->dst.type = OP_NONE; +- return segmented_write(ctxt, ctxt->dst.addr.mem, +- &desc_ptr, 2 + ctxt->op_bytes); ++ return segmented_write_std(ctxt, ctxt->dst.addr.mem, ++ &desc_ptr, 2 + ctxt->op_bytes); + } + + static int em_sgdt(struct x86_emulate_ctxt *ctxt) +@@ -3933,7 +3947,7 @@ static int em_fxsave(struct x86_emulate_ + else + size = offsetof(struct fxregs_state, xmm_space[0]); + +- return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); ++ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + } + + static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, +@@ -3975,7 +3989,7 @@ static int em_fxrstor(struct x86_emulate + if (rc != X86EMUL_CONTINUE) + return rc; + +- rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); ++ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + diff --git a/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch b/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch new file mode 100644 index 00000000000..0e1b5e0297c --- /dev/null +++ b/queue-4.9/mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch @@ -0,0 +1,111 @@ +From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Tue, 10 Jan 2017 16:58:27 -0800 +Subject: mm/hugetlb.c: fix reservation race when freeing surplus pages + +From: Mike Kravetz + +commit e5bbc8a6c992901058bc09e2ce01d16c111ff047 upstream. + +return_unused_surplus_pages() decrements the global reservation count, +and frees any unused surplus pages that were backing the reservation. + +Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in +return_unused_surplus_pages()") added a call to cond_resched_lock in the +loop freeing the pages. + +As a result, the hugetlb_lock could be dropped, and someone else could +use the pages that will be freed in subsequent iterations of the loop. +This could result in inconsistent global hugetlb page state, application +api failures (such as mmap) failures or application crashes. + +When dropping the lock in return_unused_surplus_pages, make sure that +the global reservation count (resv_huge_pages) remains sufficiently +large to prevent someone else from claiming pages about to be freed. + +Analyzed by Paul Cassella. + +Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") +Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com +Signed-off-by: Mike Kravetz +Reported-by: Paul Cassella +Suggested-by: Michal Hocko +Cc: Masayoshi Mizuma +Cc: Naoya Horiguchi +Cc: Aneesh Kumar +Cc: Hillf Danton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1773,23 +1773,32 @@ free: + } + + /* +- * When releasing a hugetlb pool reservation, any surplus pages that were +- * allocated to satisfy the reservation must be explicitly freed if they were +- * never used. +- * Called with hugetlb_lock held. ++ * This routine has two main purposes: ++ * 1) Decrement the reservation count (resv_huge_pages) by the value passed ++ * in unused_resv_pages. This corresponds to the prior adjustments made ++ * to the associated reservation map. ++ * 2) Free any unused surplus pages that may have been allocated to satisfy ++ * the reservation. 
As many as unused_resv_pages may be freed. ++ * ++ * Called with hugetlb_lock held. However, the lock could be dropped (and ++ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, ++ * we must make sure nobody else can claim pages we are in the process of ++ * freeing. Do this by ensuring resv_huge_page always is greater than the ++ * number of huge pages we plan to free when dropping the lock. + */ + static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) + { + unsigned long nr_pages; + +- /* Uncommit the reservation */ +- h->resv_huge_pages -= unused_resv_pages; +- + /* Cannot return gigantic pages currently */ + if (hstate_is_gigantic(h)) +- return; ++ goto out; + ++ /* ++ * Part (or even all) of the reservation could have been backed ++ * by pre-allocated pages. Only free surplus pages. ++ */ + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); + + /* +@@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages( + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. ++ * ++ * Note that we decrement resv_huge_pages as we free the pages. If ++ * we drop the lock, resv_huge_pages will still be sufficiently large ++ * to cover subsequent pages we may free. + */ + while (nr_pages--) { ++ h->resv_huge_pages--; ++ unused_resv_pages--; + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) +- break; ++ goto out; + cond_resched_lock(&hugetlb_lock); + } ++ ++out: ++ /* Fully uncommit the reservation */ ++ h->resv_huge_pages -= unused_resv_pages; + } + + diff --git a/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch b/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch new file mode 100644 index 00000000000..7990062dde0 --- /dev/null +++ b/queue-4.9/mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch @@ -0,0 +1,273 @@ +From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Tue, 10 Jan 2017 16:58:04 -0800 +Subject: mm, memcg: fix the active list aging for lowmem requests when memcg is enabled + +From: Michal Hocko + +commit b4536f0c829c8586544c94735c343f9b5070bd01 upstream. + +Nils Holland and Klaus Ethgen have reported unexpected OOM killer +invocations with 32b kernel starting with 4.8 kernels + + kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 + kworker/u4:5 cpuset=/ mems_allowed=0 + CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 + [...] + Mem-Info: + active_anon:58685 inactive_anon:90 isolated_anon:0 + active_file:274324 inactive_file:281962 isolated_file:0 + unevictable:0 dirty:649 writeback:0 unstable:0 + slab_reclaimable:40662 slab_unreclaimable:17754 + mapped:7382 shmem:202 pagetables:351 bounce:0 + free:206736 free_pcp:332 free_cma:0 + Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? 
no + DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB + lowmem_reserve[]: 0 813 3474 3474 + Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB + lowmem_reserve[]: 0 0 21292 21292 + HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB + +the oom killer is clearly pre-mature because there there is still a lot +of page cache in the zone Normal which should satisfy this lowmem +request. Further debugging has shown that the reclaim cannot make any +forward progress because the page cache is hidden in the active list +which doesn't get rotated because inactive_list_is_low is not memcg +aware. + +The code simply subtracts per-zone highmem counters from the respective +memcg's lru sizes which doesn't make any sense. We can simply end up +always seeing the resulting active and inactive counts 0 and return +false. This issue is not limited to 32b kernels but in practice the +effect on systems without CONFIG_HIGHMEM would be much harder to notice +because we do not invoke the OOM killer for allocations requests +targeting < ZONE_NORMAL. + +Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node +and subtract per-memcg highmem counts when memcg is enabled. Introduce +helper lruvec_zone_lru_size which redirects to either zone counters or +mem_cgroup_get_zone_lru_size when appropriate. + +We are losing empty LRU but non-zero lru size detection introduced by +ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because +of the inherent zone vs. node discrepancy. 
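To make the arithmetic concrete, a standalone sketch with invented numbers (not taken from the report above): subtracting the global per-zone highmem counters from a memcg's node-wide LRU sizes clamps both counts to zero, so inactive_list_is_low() returns false and the active list is never aged, while subtracting the memcg's own per-zone counts keeps the lowmem page cache visible.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* memcg-local, node-wide LRU counts for one cgroup (pages) */
	unsigned long memcg_inactive = 40000, memcg_active = 35000;
	/* global per-zone counts for the ineligible HighMem zone */
	unsigned long zone_inactive = 280000, zone_active = 270000;
	/* the memcg's own share of that HighMem zone */
	unsigned long memcg_hi_inactive = 30000, memcg_hi_active = 5000;
	unsigned long inactive, active;

	/* pre-fix: global highmem counters are subtracted from memcg sizes */
	inactive = memcg_inactive - min_ul(memcg_inactive, zone_inactive);
	active   = memcg_active   - min_ul(memcg_active, zone_active);
	printf("broken: inactive=%lu active=%lu -> nothing appears worth rotating\n",
	       inactive, active);

	/* post-fix: the memcg's per-zone counts are subtracted instead */
	inactive = memcg_inactive - min_ul(memcg_inactive, memcg_hi_inactive);
	active   = memcg_active   - min_ul(memcg_active, memcg_hi_active);
	printf("fixed:  inactive=%lu active=%lu -> the lowmem file pages are seen\n",
	       inactive, active);
	return 0;
}

In the fixed kernel those per-memcg, per-zone counts come from the new lru_zone_size[][] array through lruvec_zone_lru_size()/mem_cgroup_get_zone_lru_size().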
+ +Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") +Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org +Signed-off-by: Michal Hocko +Reported-by: Nils Holland +Tested-by: Nils Holland +Reported-by: Klaus Ethgen +Acked-by: Minchan Kim +Acked-by: Mel Gorman +Acked-by: Johannes Weiner +Reviewed-by: Vladimir Davydov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- + include/linux/mm_inline.h | 2 +- + mm/memcontrol.c | 18 ++++++++---------- + mm/vmscan.c | 27 +++++++++++++++++---------- + 4 files changed, 49 insertions(+), 24 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { + */ + struct mem_cgroup_per_node { + struct lruvec lruvec; +- unsigned long lru_size[NR_LRU_LISTS]; ++ unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + + struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + +@@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(str + int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); + + void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, +- int nr_pages); ++ int zid, int nr_pages); + + unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask); +@@ -441,9 +441,23 @@ static inline + unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) + { + struct mem_cgroup_per_node *mz; ++ unsigned long nr_pages = 0; ++ int zid; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); +- return mz->lru_size[lru]; ++ for (zid = 0; zid < MAX_NR_ZONES; zid++) ++ nr_pages += mz->lru_zone_size[zid][lru]; ++ return nr_pages; ++} ++ ++static inline ++unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, int zone_idx) ++{ ++ struct mem_cgroup_per_node *mz; ++ ++ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); ++ return mz->lru_zone_size[zone_idx][lru]; + } + + void mem_cgroup_handle_over_high(void); +@@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *l + { + return 0; + } ++static inline ++unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, int zone_idx) ++{ ++ return 0; ++} + + static inline unsigned long + mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -39,7 +39,7 @@ static __always_inline void update_lru_s + { + __update_lru_size(lruvec, lru, zid, nr_pages); + #ifdef CONFIG_MEMCG +- mem_cgroup_update_lru_size(lruvec, lru, nr_pages); ++ mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); + #endif + } + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics + unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) + { ++ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; +- struct mem_cgroup_per_node *mz; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); +@@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pag + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; +- mz = mem_cgroup_nodeinfo(memcg, nid); +- nr += mz->lru_size[lru]; ++ nr += mem_cgroup_get_lru_size(lruvec, lru); + } + return nr; + } +@@ -1002,6 +1001,7 @@ out: + * mem_cgroup_update_lru_size - account for adding or 
removing an lru page + * @lruvec: mem_cgroup per zone lru vector + * @lru: index of lru list the page is sitting on ++ * @zid: zone id of the accounted pages + * @nr_pages: positive when adding or negative when removing + * + * This function must be called under lru_lock, just before a page is added +@@ -1009,27 +1009,25 @@ out: + * so as to allow it to check that lru_size 0 is consistent with list_empty). + */ + void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, +- int nr_pages) ++ int zid, int nr_pages) + { + struct mem_cgroup_per_node *mz; + unsigned long *lru_size; + long size; +- bool empty; + + if (mem_cgroup_disabled()) + return; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); +- lru_size = mz->lru_size + lru; +- empty = list_empty(lruvec->lists + lru); ++ lru_size = &mz->lru_zone_size[zid][lru]; + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; +- if (WARN_ONCE(size < 0 || empty != !size, +- "%s(%p, %d, %d): lru_size %ld but %sempty\n", +- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { ++ if (WARN_ONCE(size < 0, ++ "%s(%p, %d, %d): lru_size %ld\n", ++ __func__, lruvec, lru, nr_pages, size)) { + VM_BUG_ON(1); + *lru_size = 0; + } +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lru + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); + } + ++unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, ++ int zone_idx) ++{ ++ if (!mem_cgroup_disabled()) ++ return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); ++ ++ return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], ++ NR_ZONE_LRU_BASE + lru); ++} ++ + /* + * Add a shrinker callback to be called from the vm. + */ +@@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page + * be complete before mem_cgroup_update_lru_size due to a santity check. 
+ */ + static __always_inline void update_lru_sizes(struct lruvec *lruvec, +- enum lru_list lru, unsigned long *nr_zone_taken, +- unsigned long nr_taken) ++ enum lru_list lru, unsigned long *nr_zone_taken) + { + int zid; + +@@ -1392,11 +1401,11 @@ static __always_inline void update_lru_s + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +- } +- + #ifdef CONFIG_MEMCG +- mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); ++ mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + #endif ++ } ++ + } + + /* +@@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(u + *nr_scanned = scan; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); +- update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); ++ update_lru_sizes(lruvec, lru, nr_zone_taken); + return nr_taken; + } + +@@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct + if (!managed_zone(zone)) + continue; + +- inactive_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE)); +- active_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); ++ inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); ++ active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); + + inactive -= min(inactive, inactive_zone); + active -= min(active, active_zone); diff --git a/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch b/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch new file mode 100644 index 00000000000..e8959653252 --- /dev/null +++ b/queue-4.9/mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch @@ -0,0 +1,64 @@ +From c4e490cf148e85ead0d1b1c2caaba833f1d5b29f Mon Sep 17 00:00:00 2001 +From: John Sperbeck +Date: Tue, 10 Jan 2017 16:58:24 -0800 +Subject: mm/slab.c: fix SLAB freelist randomization duplicate entries + +From: John Sperbeck + +commit c4e490cf148e85ead0d1b1c2caaba833f1d5b29f upstream. + +This patch fixes a bug in the freelist randomization code. When a high +random number is used, the freelist will contain duplicate entries. It +will result in different allocations sharing the same chunk. + +It will result in odd behaviours and crashes. It should be uncommon but +it depends on the machines. We saw it happening more often on some +machines (every few hours of running tests). 
+ +Fixes: c7ce4f60ac19 ("mm: SLAB freelist randomization") +Link: http://lkml.kernel.org/r/20170103181908.143178-1-thgarnie@google.com +Signed-off-by: John Sperbeck +Signed-off-by: Thomas Garnier +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/slab.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -2475,7 +2475,6 @@ union freelist_init_state { + unsigned int pos; + unsigned int *list; + unsigned int count; +- unsigned int rand; + }; + struct rnd_state rnd_state; + }; +@@ -2501,8 +2500,7 @@ static bool freelist_state_initialize(un + } else { + state->list = cachep->random_seq; + state->count = count; +- state->pos = 0; +- state->rand = rand; ++ state->pos = rand % count; + ret = true; + } + return ret; +@@ -2511,7 +2509,9 @@ static bool freelist_state_initialize(un + /* Get the next entry on the list and randomize it using a random shift */ + static freelist_idx_t next_random_slot(union freelist_init_state *state) + { +- return (state->list[state->pos++] + state->rand) % state->count; ++ if (state->pos >= state->count) ++ state->pos = 0; ++ return state->list[state->pos++]; + } + + /* Swap two freelist entries */ diff --git a/queue-4.9/mm-support-anonymous-stable-page.patch b/queue-4.9/mm-support-anonymous-stable-page.patch new file mode 100644 index 00000000000..2d7a88398b1 --- /dev/null +++ b/queue-4.9/mm-support-anonymous-stable-page.patch @@ -0,0 +1,138 @@ +From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Tue, 10 Jan 2017 16:58:15 -0800 +Subject: mm: support anonymous stable page + +From: Minchan Kim + +commit f05714293a591038304ddae7cb0dd747bb3786cc upstream. + +During developemnt for zram-swap asynchronous writeback, I found strange +corruption of compressed page, resulting in: + + Modules linked in: zram(E) + CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 + task: ffff88007620b840 task.stack: ffff880078090000 + RIP: set_freeobj.part.43+0x1c/0x1f + RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 + RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 + RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 + RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 + R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 + R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 + FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 + Call Trace: + obj_malloc+0x22b/0x260 + zs_malloc+0x1e4/0x580 + zram_bvec_rw+0x4cd/0x830 [zram] + page_requests_rw+0x9c/0x130 [zram] + zram_thread+0xe6/0x173 [zram] + kthread+0xca/0xe0 + ret_from_fork+0x25/0x30 + +With investigation, it reveals currently stable page doesn't support +anonymous page. IOW, reuse_swap_page can reuse the page without waiting +writeback completion so it can overwrite page zram is compressing. + +Unfortunately, zram has used per-cpu stream feature from v4.7. +It aims for increasing cache hit ratio of scratch buffer for +compressing. 
Downside of that approach is that zram should ask +memory space for compressed page in per-cpu context which requires +stricted gfp flag which could be failed. If so, it retries to +allocate memory space out of per-cpu context so it could get memory +this time and compress the data again, copies it to the memory space. + +In this scenario, zram assumes the data should never be changed +but it is not true unless stable page supports. So, If the data is +changed under us, zram can make buffer overrun because second +compression size could be bigger than one we got in previous trial +and blindly, copy bigger size object to smaller buffer which is +buffer overrun. The overrun breaks zsmalloc free object chaining +so system goes crash like above. + +I think below is same problem. +https://bugzilla.suse.com/show_bug.cgi?id=997574 + +Unfortunately, reuse_swap_page should be atomic so that we cannot wait on +writeback in there so the approach in this patch is simply return false if +we found it needs stable page. Although it increases memory footprint +temporarily, it happens rarely and it should be reclaimed easily althoug +it happened. Also, It would be better than waiting of IO completion, +which is critial path for application latency. + +Fixes: da9556a2367c ("zram: user per-cpu compression streams") +Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox +Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org +Signed-off-by: Minchan Kim +Acked-by: Hugh Dickins +Cc: Sergey Senozhatsky +Cc: Darrick J. Wong +Cc: Takashi Iwai +Cc: Hyeoncheol Lee +Cc: +Cc: Sangseok Lee +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 3 ++- + mm/swapfile.c | 20 +++++++++++++++++++- + 2 files changed, 21 insertions(+), 2 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -150,8 +150,9 @@ enum { + SWP_FILE = (1 << 7), /* set after swap_activate success */ + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ ++ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + /* add others here before... 
*/ +- SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ ++ SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + }; + + #define SWAP_CLUSTER_MAX 32UL +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, + count = page_trans_huge_mapcount(page, total_mapcount); + if (count <= 1 && PageSwapCache(page)) { + count += page_swapcount(page); +- if (count == 1 && !PageWriteback(page)) { ++ if (count != 1) ++ goto out; ++ if (!PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); ++ } else { ++ swp_entry_t entry; ++ struct swap_info_struct *p; ++ ++ entry.val = page_private(page); ++ p = swap_info_get(entry); ++ if (p->flags & SWP_STABLE_WRITES) { ++ spin_unlock(&p->lock); ++ return false; ++ } ++ spin_unlock(&p->lock); + } + } ++out: + return count <= 1; + } + +@@ -2449,6 +2463,10 @@ SYSCALL_DEFINE2(swapon, const char __use + error = -ENOMEM; + goto bad_swap; + } ++ ++ if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) ++ p->flags |= SWP_STABLE_WRITES; ++ + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + diff --git a/queue-4.9/series b/queue-4.9/series index a4f80977726..a68230699e2 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -15,3 +15,13 @@ dax-fix-deadlock-with-dax-4k-holes.patch mm-pmd-dirty-emulation-in-page-fault-handler.patch mm-fix-devm_memremap_pages-crash-use-mem_hotplug_-begin-done.patch ocfs2-fix-crash-caused-by-stale-lvb-with-fsdlm-plugin.patch +mm-memcg-fix-the-active-list-aging-for-lowmem-requests-when-memcg-is-enabled.patch +mm-support-anonymous-stable-page.patch +mm-slab.c-fix-slab-freelist-randomization-duplicate-entries.patch +mm-hugetlb.c-fix-reservation-race-when-freeing-surplus-pages.patch +kvm-x86-fix-emulation-of-mov-ss-null-selector.patch +kvm-eventfd-fix-null-deref-irqbypass-consumer.patch +kvm-x86-flush-pending-lapic-jump-label-updates-on-module-unload.patch +kvm-x86-fix-null-deref-in-vcpu_scan_ioapic.patch +kvm-x86-emulate-fxsave-and-fxrstor.patch +kvm-x86-introduce-segmented_write_std.patch