From fc0f56e7c9a6611fbf1a14f0de73d3815c3d85f0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 3 Oct 2017 11:02:39 +0200
Subject: [PATCH] 4.9-stable patches

added patches:
	cxl-fix-driver-use-count.patch
	gfs2-fix-debugfs-glocks-dump.patch
	kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch
	kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch
	timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch
---
 queue-4.9/cxl-fix-driver-use-count.patch      |  83 ++++++++++++++
 queue-4.9/gfs2-fix-debugfs-glocks-dump.patch  | 108 ++++++++++++++++++
 ...-change-sn-bit-in-vmx_update_pi_irte.patch |  50 ++++++++
 ...in-kvm_vcpu_trigger_posted_interrupt.patch |  81 +++++++++++++
 queue-4.9/series                              |   5 +
 ...r-migration-sysctl-values-to-0-and-1.patch |  49 ++++++++
 6 files changed, 376 insertions(+)
 create mode 100644 queue-4.9/cxl-fix-driver-use-count.patch
 create mode 100644 queue-4.9/gfs2-fix-debugfs-glocks-dump.patch
 create mode 100644 queue-4.9/kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch
 create mode 100644 queue-4.9/kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch
 create mode 100644 queue-4.9/timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch

diff --git a/queue-4.9/cxl-fix-driver-use-count.patch b/queue-4.9/cxl-fix-driver-use-count.patch
new file mode 100644
index 00000000000..50e549bad05
--- /dev/null
+++ b/queue-4.9/cxl-fix-driver-use-count.patch
@@ -0,0 +1,83 @@
+From 197267d0356004a31c4d6b6336598f5dff3301e1 Mon Sep 17 00:00:00 2001
+From: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
+Date: Wed, 30 Aug 2017 12:15:49 +0200
+Subject: cxl: Fix driver use count
+
+From: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
+
+commit 197267d0356004a31c4d6b6336598f5dff3301e1 upstream.
+
+cxl keeps a driver use count, which is used with the hash memory model
+on p8 to know when to upgrade local TLBIs to global and to trigger
+callbacks to manage the MMU for PSL8.
+
+If a process opens a context and closes without attaching or fails the
+attachment, the driver use count is never decremented. As a
+consequence, TLB invalidations remain global, even if there are no
+active cxl contexts.
+
+We should increment the driver use count when the process is attaching
+to the cxl adapter, and not on open. It's not needed before the
+adapter starts using the context and the use count is decremented on
+the detach path, so it makes more sense.
+
+It affects only the user api. The kernel api is already doing The
+Right Thing.
+
+Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
+Cc: stable@vger.kernel.org # v4.2+
+Fixes: 7bb5d91a4dda ("cxl: Rework context lifetimes")
+Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+[ajd: backport to stable v4.9 tree]
+Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/misc/cxl/api.c  |    4 ++++
+ drivers/misc/cxl/file.c |    8 +++++++-
+ 2 files changed, 11 insertions(+), 1 deletion(-)
+
+--- a/drivers/misc/cxl/api.c
++++ b/drivers/misc/cxl/api.c
+@@ -244,6 +244,10 @@ int cxl_start_context(struct cxl_context
+ 		ctx->real_mode = false;
+ 	}
+ 
++	/*
++	 * Increment driver use count. Enables global TLBIs for hash
++	 * and callbacks to handle the segment table
++	 */
+ 	cxl_ctx_get();
+ 
+ 	if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
+--- a/drivers/misc/cxl/file.c
++++ b/drivers/misc/cxl/file.c
+@@ -91,7 +91,6 @@ static int __afu_open(struct inode *inod
+ 
+ 	pr_devel("afu_open pe: %i\n", ctx->pe);
+ 	file->private_data = ctx;
+-	cxl_ctx_get();
+ 
+ 	/* indicate success */
+ 	rc = 0;
+@@ -213,6 +212,12 @@ static long afu_ioctl_start_work(struct
+ 	ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ 
+ 
++	/*
++	 * Increment driver use count. Enables global TLBIs for hash
++	 * and callbacks to handle the segment table
++	 */
++	cxl_ctx_get();
++
+ 	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
+ 
+ 	if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor,
+@@ -222,6 +227,7 @@ static long afu_ioctl_start_work(struct
+ 		put_pid(ctx->glpid);
+ 		put_pid(ctx->pid);
+ 		ctx->glpid = ctx->pid = NULL;
++		cxl_ctx_put();
+ 		goto out;
+ 	}
+ 
diff --git a/queue-4.9/gfs2-fix-debugfs-glocks-dump.patch b/queue-4.9/gfs2-fix-debugfs-glocks-dump.patch
new file mode 100644
index 00000000000..9c3b3cc0931
--- /dev/null
+++ b/queue-4.9/gfs2-fix-debugfs-glocks-dump.patch
@@ -0,0 +1,108 @@
+From 10201655b085df8e000822e496e5d4016a167a36 Mon Sep 17 00:00:00 2001
+From: Andreas Gruenbacher <agruenba@redhat.com>
+Date: Tue, 19 Sep 2017 07:15:35 -0500
+Subject: gfs2: Fix debugfs glocks dump
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 10201655b085df8e000822e496e5d4016a167a36 upstream.
+
+The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock
+dump (/sys/kernel/debug/gfs2/<device>/glocks) for dumps bigger than a
+single buffer: the right function for restarting an rhashtable iteration
+from the beginning of the hash table is rhashtable_walk_enter;
+rhashtable_walk_stop + rhashtable_walk_start will just resume from the
+current position.
+
+The upstream commit doesn't directly apply to 4.9.y because 4.9.y
+doesn't have the following mainline commits:
+
+  92ecd73a887c4a2b94daf5fc35179d75d1c4ef95
+    gfs2: Deduplicate gfs2_{glocks,glstats}_open
+  cc37a62785a584f4875788689f3fd1fa6e4eb291
+    gfs2: Replace rhashtable_walk_init with rhashtable_walk_enter
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Bob Peterson <rpeterso@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/glock.c |   16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+--- a/fs/gfs2/glock.c
++++ b/fs/gfs2/glock.c
+@@ -1836,13 +1836,9 @@ static void *gfs2_glock_seq_start(struct
+ {
+ 	struct gfs2_glock_iter *gi = seq->private;
+ 	loff_t n = *pos;
+-	int ret;
+ 
+-	if (gi->last_pos <= *pos)
+-		n = (*pos - gi->last_pos);
+-
+-	ret = rhashtable_walk_start(&gi->hti);
+-	if (ret)
++	rhashtable_walk_enter(&gl_hash_table, &gi->hti);
++	if (rhashtable_walk_start(&gi->hti) != 0)
+ 		return NULL;
+ 
+ 	do {
+@@ -1850,6 +1846,7 @@ static void *gfs2_glock_seq_start(struct
+ 	} while (gi->gl && n--);
+ 
+ 	gi->last_pos = *pos;
++
+ 	return gi->gl;
+ }
+ 
+@@ -1861,6 +1858,7 @@ static void *gfs2_glock_seq_next(struct
+ 	(*pos)++;
+ 	gi->last_pos = *pos;
+ 	gfs2_glock_iter_next(gi);
++
+ 	return gi->gl;
+ }
+ 
+@@ -1870,6 +1868,7 @@ static void gfs2_glock_seq_stop(struct s
+ 
+ 	gi->gl = NULL;
+ 	rhashtable_walk_stop(&gi->hti);
++	rhashtable_walk_exit(&gi->hti);
+ }
+ 
+ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
+@@ -1932,12 +1931,10 @@ static int gfs2_glocks_open(struct inode
+ 		struct gfs2_glock_iter *gi = seq->private;
+ 
+ 		gi->sdp = inode->i_private;
+-		gi->last_pos = 0;
+ 		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
+ 		if (seq->buf)
+ 			seq->size = GFS2_SEQ_GOODSIZE;
+ 		gi->gl = NULL;
+-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+ 	}
+ 	return ret;
+ }
+@@ -1948,7 +1945,6 @@ static int gfs2_glocks_release(struct in
+ 	struct gfs2_glock_iter *gi = seq->private;
+ 
+ 	gi->gl = NULL;
+-	rhashtable_walk_exit(&gi->hti);
+ 	return seq_release_private(inode, file);
+ }
+ 
+@@ -1960,12 +1956,10 @@ static int gfs2_glstats_open(struct inod
+ 		struct seq_file *seq = file->private_data;
+ 		struct gfs2_glock_iter *gi = seq->private;
+ 		gi->sdp = inode->i_private;
+-		gi->last_pos = 0;
+ 		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
+ 		if (seq->buf)
+ 			seq->size = GFS2_SEQ_GOODSIZE;
+ 		gi->gl = NULL;
+-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+ 	}
+ 	return ret;
+ }
diff --git a/queue-4.9/kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch b/queue-4.9/kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch
new file mode 100644
index 00000000000..6477be42935
--- /dev/null
+++ b/queue-4.9/kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch
@@ -0,0 +1,50 @@
+From dc91f2eb1a4021eb6705c15e474942f84ab9b211 Mon Sep 17 00:00:00 2001
+From: Haozhong Zhang <haozhong.zhang@intel.com>
+Date: Mon, 18 Sep 2017 09:56:49 +0800
+Subject: KVM: VMX: do not change SN bit in vmx_update_pi_irte()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Haozhong Zhang <haozhong.zhang@intel.com>
+
+commit dc91f2eb1a4021eb6705c15e474942f84ab9b211 upstream.
+
+In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM
+assumes that PI notification events should not be suppressed when the
+target vCPU is not blocked.
+
+vmx_update_pi_irte() sets the SN field before changing an interrupt
+from posting to remapping, but it does not check the vCPU mode.
+Therefore, the change of SN field may break above the assumption.
+Besides, I don't see reasons to suppress notification events here, so
+remove the changes of SN field to avoid race condition.
+
+Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
+Reported-by: "Ramamurthy, Venkatesh" <venkatesh.ramamurthy@intel.com>
+Reported-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted")
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |    6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11215,12 +11215,8 @@ static int vmx_update_pi_irte(struct kvm
+ 
+ 		if (set)
+ 			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+-		else {
+-			/* suppress notification event before unposting */
+-			pi_set_sn(vcpu_to_pi_desc(vcpu));
++		else
+ 			ret = irq_set_vcpu_affinity(host_irq, NULL);
+-			pi_clear_sn(vcpu_to_pi_desc(vcpu));
+-		}
+ 
+ 		if (ret < 0) {
+ 			printk(KERN_INFO "%s: failed to update PI IRTE\n",
diff --git a/queue-4.9/kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch b/queue-4.9/kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch
new file mode 100644
index 00000000000..7089701b5c8
--- /dev/null
+++ b/queue-4.9/kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch
@@ -0,0 +1,81 @@
+From 5753743fa5108b8f98bd61e40dc63f641b26c768 Mon Sep 17 00:00:00 2001
+From: Haozhong Zhang <haozhong.zhang@intel.com>
+Date: Mon, 18 Sep 2017 09:56:50 +0800
+Subject: KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Haozhong Zhang <haozhong.zhang@intel.com>
+
+commit 5753743fa5108b8f98bd61e40dc63f641b26c768 upstream.
+
+WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt()
+intends to detect the violation of invariant that VT-d PI notification
+event is not suppressed when vcpu is in the guest mode. Because the
+two checks for the target vcpu mode and the target suppress field
+cannot be performed atomically, the target vcpu mode may change in
+between. If that does happen, WARN_ON_ONCE() here may raise false
+alarms.
+
+As the previous patch fixed the real invariant breaker, remove this
+WARN_ON_ONCE() to avoid false alarms, and document the allowed cases
+instead.
+
+Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
+Reported-by: "Ramamurthy, Venkatesh" <venkatesh.ramamurthy@intel.com>
+Reported-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted")
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   33 +++++++++++++++++++++------------
+ 1 file changed, 21 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4759,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_post
+ {
+ #ifdef CONFIG_SMP
+ 	if (vcpu->mode == IN_GUEST_MODE) {
+-		struct vcpu_vmx *vmx = to_vmx(vcpu);
+-
+ 		/*
+-		 * Currently, we don't support urgent interrupt,
+-		 * all interrupts are recognized as non-urgent
+-		 * interrupt, so we cannot post interrupts when
+-		 * 'SN' is set.
++		 * The vector of interrupt to be delivered to vcpu had
++		 * been set in PIR before this function.
++		 *
++		 * Following cases will be reached in this block, and
++		 * we always send a notification event in all cases as
++		 * explained below.
++		 *
++		 * Case 1: vcpu keeps in non-root mode. Sending a
++		 * notification event posts the interrupt to vcpu.
++		 *
++		 * Case 2: vcpu exits to root mode and is still
++		 * runnable. PIR will be synced to vIRR before the
++		 * next vcpu entry. Sending a notification event in
++		 * this case has no effect, as vcpu is not in root
++		 * mode.
+ 		 *
+-		 * If the vcpu is in guest mode, it means it is
+-		 * running instead of being scheduled out and
+-		 * waiting in the run queue, and that's the only
+-		 * case when 'SN' is set currently, warning if
+-		 * 'SN' is set.
++		 * Case 3: vcpu exits to root mode and is blocked.
++		 * vcpu_block() has already synced PIR to vIRR and
++		 * never blocks vcpu if vIRR is not cleared. Therefore,
++		 * a blocked vcpu here does not wait for any requested
++		 * interrupts in PIR, and sending a notification event
++		 * which has no effect is safe here.
+ 		 */
+-		WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+ 
+ 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+ 				POSTED_INTR_VECTOR);
diff --git a/queue-4.9/series b/queue-4.9/series
index 92257aa3283..da1912e49fc 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -54,3 +54,8 @@ btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch
 btrfs-prevent-to-set-invalid-default-subvolid.patch
 x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch
 x86-fpu-don-t-let-userspace-set-bogus-xcomp_bv.patch
+gfs2-fix-debugfs-glocks-dump.patch
+timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch
+kvm-vmx-do-not-change-sn-bit-in-vmx_update_pi_irte.patch
+kvm-vmx-remove-warn_on_once-in-kvm_vcpu_trigger_posted_interrupt.patch
+cxl-fix-driver-use-count.patch
diff --git a/queue-4.9/timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch b/queue-4.9/timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch
new file mode 100644
index 00000000000..bf4c9bc2322
--- /dev/null
+++ b/queue-4.9/timer-sysclt-restrict-timer-migration-sysctl-values-to-0-and-1.patch
@@ -0,0 +1,49 @@
+From b94bf594cf8ed67cdd0439e70fa939783471597a Mon Sep 17 00:00:00 2001
+From: Myungho Jung <mhjungk@gmail.com>
+Date: Wed, 19 Apr 2017 15:24:50 -0700
+Subject: timer/sysclt: Restrict timer migration sysctl values to 0 and 1
+
+From: Myungho Jung <mhjungk@gmail.com>
+
+commit b94bf594cf8ed67cdd0439e70fa939783471597a upstream.
+
+timer_migration sysctl acts as a boolean switch, so the allowed values
+should be restricted to 0 and 1.
+
+Add the necessary extra fields to the sysctl table entry to enforce that.
+
+[ tglx: Rewrote changelog ]
+
+Signed-off-by: Myungho Jung <mhjungk@gmail.com>
+Link: http://lkml.kernel.org/r/1492640690-3550-1-git-send-email-mhjungk@gmail.com
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Kazuhiro Hayashi <kazuhiro3.hayashi@toshiba.co.jp>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sysctl.c     |    2 ++
+ kernel/time/timer.c |    2 +-
+ 2 files changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -1189,6 +1189,8 @@ static struct ctl_table kern_table[] = {
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= timer_migration_handler,
++		.extra1		= &zero,
++		.extra2		= &one,
+ 	},
+ #endif
+ #ifdef CONFIG_BPF_SYSCALL
+--- a/kernel/time/timer.c
++++ b/kernel/time/timer.c
+@@ -240,7 +240,7 @@ int timer_migration_handler(struct ctl_t
+ 	int ret;
+ 
+ 	mutex_lock(&mutex);
+-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
++	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ 	if (!ret && write)
+ 		timers_update_migration(false);
+ 	mutex_unlock(&mutex);
-- 
2.47.3