From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 25 Apr 2017 12:20:46 +0000 (+0100)
Subject: 4.4-stable patches
X-Git-Tag: v4.4.64~3
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b39dbc5b6dee5bc9e28b1b8bf50e00afebb976cc;p=thirdparty%2Fkernel%2Fstable-queue.git

4.4-stable patches

added patches:
	block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch
	drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch
	drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch
	hv-don-t-reset-hv_context.tsc_page-on-crash.patch
	kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch
	powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch
	tipc-fix-crash-during-node-removal.patch
	tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch
	x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch
	x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch
---

diff --git a/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch b/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch
new file mode 100644
index 00000000000..77cd86a1c07
--- /dev/null
+++ b/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch
@@ -0,0 +1,66 @@
+From ac34f15e0c6d2fd58480052b6985f6991fb53bcc Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Tue, 29 Dec 2015 14:02:29 -0800
+Subject: block: fix del_gendisk() vs blkdev_ioctl crash
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit ac34f15e0c6d2fd58480052b6985f6991fb53bcc upstream.
+
+When tearing down a block device early in its lifetime, userspace may
+still be performing discovery actions like blkdev_ioctl() to re-read
+partitions.
+
+The nvdimm_revalidate_disk() implementation depends on
+disk->driverfs_dev to be valid at entry.  However, it is set to NULL in
+del_gendisk() and fatally this is happening *before* the disk device is
+deleted from userspace view.
+
+There's no reason for del_gendisk() to clear ->driverfs_dev.  That
+device is the parent of the disk.  It is guaranteed to not be freed
+until the disk, as a child, drops its ->parent reference.
+
+We could also fix this issue locally in nvdimm_revalidate_disk() by
+using disk_to_dev(disk)->parent, but let's fix it globally since
+->driverfs_dev follows the lifetime of the parent.  Longer term we
+should probably just add a @parent parameter to add_disk(), and stop
+carrying this pointer in the gendisk.
+
+ BUG: unable to handle kernel NULL pointer dereference at           (null)
+ IP: [<ffffffffa00340a8>] nvdimm_revalidate_disk+0x18/0x90 [libnvdimm]
+ CPU: 2 PID: 538 Comm: systemd-udevd Tainted: G           O    4.4.0-rc5 #2257
+ [..]
+ Call Trace:
+  [<ffffffff8143e5c7>] rescan_partitions+0x87/0x2c0
+  [<ffffffff810f37f9>] ? __lock_is_held+0x49/0x70
+  [<ffffffff81438c62>] __blkdev_reread_part+0x72/0xb0
+  [<ffffffff81438cc5>] blkdev_reread_part+0x25/0x40
+  [<ffffffff8143982d>] blkdev_ioctl+0x4fd/0x9c0
+  [<ffffffff811246c9>] ? current_kernel_time64+0x69/0xd0
+  [<ffffffff812916dd>] block_ioctl+0x3d/0x50
+  [<ffffffff81264c38>] do_vfs_ioctl+0x308/0x560
+  [<ffffffff8115dbd1>] ? __audit_syscall_entry+0xb1/0x100
+  [<ffffffff810031d6>] ? do_audit_syscall_entry+0x66/0x70
+  [<ffffffff81264f09>] SyS_ioctl+0x79/0x90
+  [<ffffffff81902672>] entry_SYSCALL_64_fastpath+0x12/0x76
+
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jens Axboe <axboe@fb.com>
+Reported-by: Robert Hu <robert.hu@intel.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/genhd.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/block/genhd.c
++++ b/block/genhd.c
+@@ -664,7 +664,6 @@ void del_gendisk(struct gendisk *disk)
+ 
+ 	kobject_put(disk->part0.holder_dir);
+ 	kobject_put(disk->slave_dir);
+-	disk->driverfs_dev = NULL;
+ 	if (!sysfs_deprecated)
+ 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+ 	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
diff --git a/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch b/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch
new file mode 100644
index 00000000000..ed728fd7e3e
--- /dev/null
+++ b/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch
@@ -0,0 +1,271 @@
+From cb7a5724c7e1bfb5766ad1c3beba14cc715991cf Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Wed, 24 Aug 2016 16:23:10 -0700
+Subject: Drivers: hv: balloon: account for gaps in hot add regions
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit cb7a5724c7e1bfb5766ad1c3beba14cc715991cf upstream.
+
+I'm observing the following hot add requests from the WS2012 host:
+
+hot_add_req: start_pfn = 0x108200 count = 330752
+hot_add_req: start_pfn = 0x158e00 count = 193536
+hot_add_req: start_pfn = 0x188400 count = 239616
+
+As the host doesn't specify hot add regions we're trying to create
+128Mb-aligned region covering the first request, we create the 0x108000 -
+0x160000 region and we add 0x108000 - 0x158e00 memory. The second request
+passes the pfn_covered() check, we enlarge the region to 0x108000 -
+0x190000 and add 0x158e00 - 0x188200 memory. The problem emerges with the
+third request as it starts at 0x188400 so there is a 0x200 gap which is
+not covered. As the end of our region is 0x190000 now it again passes the
+pfn_covered() check where we just adjust the covered_end_pfn and make it
+0x188400 instead of 0x188200 which means that we'll try to online
+0x188200-0x188400 pages but these pages were never assigned to us and we
+crash.
+
+We can't react to such requests by creating new hot add regions as it may
+happen that the whole suggested range falls into the previously identified
+128Mb-aligned area so we'll end up adding nothing or create intersecting
+regions and our current logic doesn't allow that. Instead, create a list of
+such 'gaps' and check for them in the page online callback.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/hv/hv_balloon.c |  131 ++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 94 insertions(+), 37 deletions(-)
+
+--- a/drivers/hv/hv_balloon.c
++++ b/drivers/hv/hv_balloon.c
+@@ -441,6 +441,16 @@ struct hv_hotadd_state {
+ 	unsigned long covered_end_pfn;
+ 	unsigned long ha_end_pfn;
+ 	unsigned long end_pfn;
++	/*
++	 * A list of gaps.
++	 */
++	struct list_head gap_list;
++};
++
++struct hv_hotadd_gap {
++	struct list_head list;
++	unsigned long start_pfn;
++	unsigned long end_pfn;
+ };
+ 
+ struct balloon_state {
+@@ -596,18 +606,46 @@ static struct notifier_block hv_memory_n
+ 	.priority = 0
+ };
+ 
++/* Check if the particular page is backed and can be onlined and online it. */
++static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
++{
++	unsigned long cur_start_pgp;
++	unsigned long cur_end_pgp;
++	struct hv_hotadd_gap *gap;
++
++	cur_start_pgp = (unsigned long)pfn_to_page(has->covered_start_pfn);
++	cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
++
++	/* The page is not backed. */
++	if (((unsigned long)pg < cur_start_pgp) ||
++	    ((unsigned long)pg >= cur_end_pgp))
++		return;
++
++	/* Check for gaps. */
++	list_for_each_entry(gap, &has->gap_list, list) {
++		cur_start_pgp = (unsigned long)
++			pfn_to_page(gap->start_pfn);
++		cur_end_pgp = (unsigned long)
++			pfn_to_page(gap->end_pfn);
++		if (((unsigned long)pg >= cur_start_pgp) &&
++		    ((unsigned long)pg < cur_end_pgp)) {
++			return;
++		}
++	}
+ 
+-static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
++	/* This frame is currently backed; online the page. */
++	__online_page_set_limits(pg);
++	__online_page_increment_counters(pg);
++	__online_page_free(pg);
++}
++
++static void hv_bring_pgs_online(struct hv_hotadd_state *has,
++				unsigned long start_pfn, unsigned long size)
+ {
+ 	int i;
+ 
+-	for (i = 0; i < size; i++) {
+-		struct page *pg;
+-		pg = pfn_to_page(start_pfn + i);
+-		__online_page_set_limits(pg);
+-		__online_page_increment_counters(pg);
+-		__online_page_free(pg);
+-	}
++	for (i = 0; i < size; i++)
++		hv_page_online_one(has, pfn_to_page(start_pfn + i));
+ }
+ 
+ static void hv_mem_hot_add(unsigned long start, unsigned long size,
+@@ -684,26 +722,24 @@ static void hv_online_page(struct page *
+ 	list_for_each(cur, &dm_device.ha_region_list) {
+ 		has = list_entry(cur, struct hv_hotadd_state, list);
+ 		cur_start_pgp = (unsigned long)
+-			pfn_to_page(has->covered_start_pfn);
+-		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
++			pfn_to_page(has->start_pfn);
++		cur_end_pgp = (unsigned long)pfn_to_page(has->end_pfn);
+ 
+-		if (((unsigned long)pg >= cur_start_pgp) &&
+-			((unsigned long)pg < cur_end_pgp)) {
+-			/*
+-			 * This frame is currently backed; online the
+-			 * page.
+-			 */
+-			__online_page_set_limits(pg);
+-			__online_page_increment_counters(pg);
+-			__online_page_free(pg);
+-		}
++		/* The page belongs to a different HAS. */
++		if (((unsigned long)pg < cur_start_pgp) ||
++		    ((unsigned long)pg >= cur_end_pgp))
++			continue;
++
++		hv_page_online_one(has, pg);
++		break;
+ 	}
+ }
+ 
+-static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
++static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
+ {
+ 	struct list_head *cur;
+ 	struct hv_hotadd_state *has;
++	struct hv_hotadd_gap *gap;
+ 	unsigned long residual, new_inc;
+ 
+ 	if (list_empty(&dm_device.ha_region_list))
+@@ -718,6 +754,24 @@ static bool pfn_covered(unsigned long st
+ 		 */
+ 		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
+ 			continue;
++
++		/*
++		 * If the current start pfn is not where the covered_end
++		 * is, create a gap and update covered_end_pfn.
++		 */
++		if (has->covered_end_pfn != start_pfn) {
++			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
++			if (!gap)
++				return -ENOMEM;
++
++			INIT_LIST_HEAD(&gap->list);
++			gap->start_pfn = has->covered_end_pfn;
++			gap->end_pfn = start_pfn;
++			list_add_tail(&gap->list, &has->gap_list);
++
++			has->covered_end_pfn = start_pfn;
++		}
++
+ 		/*
+ 		 * If the current hot add-request extends beyond
+ 		 * our current limit; extend it.
+@@ -734,19 +788,10 @@ static bool pfn_covered(unsigned long st
+ 			has->end_pfn += new_inc;
+ 		}
+ 
+-		/*
+-		 * If the current start pfn is not where the covered_end
+-		 * is, update it.
+-		 */
+-
+-		if (has->covered_end_pfn != start_pfn)
+-			has->covered_end_pfn = start_pfn;
+-
+-		return true;
+-
++		return 1;
+ 	}
+ 
+-	return false;
++	return 0;
+ }
+ 
+ static unsigned long handle_pg_range(unsigned long pg_start,
+@@ -785,6 +830,8 @@ static unsigned long handle_pg_range(uns
+ 			if (pgs_ol > pfn_cnt)
+ 				pgs_ol = pfn_cnt;
+ 
++			has->covered_end_pfn +=  pgs_ol;
++			pfn_cnt -= pgs_ol;
+ 			/*
+ 			 * Check if the corresponding memory block is already
+ 			 * online by checking its last previously backed page.
+@@ -793,10 +840,8 @@ static unsigned long handle_pg_range(uns
+ 			 */
+ 			if (start_pfn > has->start_pfn &&
+ 			    !PageReserved(pfn_to_page(start_pfn - 1)))
+-				hv_bring_pgs_online(start_pfn, pgs_ol);
++				hv_bring_pgs_online(has, start_pfn, pgs_ol);
+ 
+-			has->covered_end_pfn +=  pgs_ol;
+-			pfn_cnt -= pgs_ol;
+ 		}
+ 
+ 		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
+@@ -834,13 +879,19 @@ static unsigned long process_hot_add(uns
+ 					unsigned long rg_size)
+ {
+ 	struct hv_hotadd_state *ha_region = NULL;
++	int covered;
+ 
+ 	if (pfn_cnt == 0)
+ 		return 0;
+ 
+-	if (!dm_device.host_specified_ha_region)
+-		if (pfn_covered(pg_start, pfn_cnt))
++	if (!dm_device.host_specified_ha_region) {
++		covered = pfn_covered(pg_start, pfn_cnt);
++		if (covered < 0)
++			return 0;
++
++		if (covered)
+ 			goto do_pg_range;
++	}
+ 
+ 	/*
+ 	 * If the host has specified a hot-add range; deal with it first.
+@@ -852,6 +903,7 @@ static unsigned long process_hot_add(uns
+ 			return 0;
+ 
+ 		INIT_LIST_HEAD(&ha_region->list);
++		INIT_LIST_HEAD(&ha_region->gap_list);
+ 
+ 		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+ 		ha_region->start_pfn = rg_start;
+@@ -1584,6 +1636,7 @@ static int balloon_remove(struct hv_devi
+ 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+ 	struct list_head *cur, *tmp;
+ 	struct hv_hotadd_state *has;
++	struct hv_hotadd_gap *gap, *tmp_gap;
+ 
+ 	if (dm->num_pages_ballooned != 0)
+ 		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
+@@ -1600,6 +1653,10 @@ static int balloon_remove(struct hv_devi
+ #endif
+ 	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
+ 		has = list_entry(cur, struct hv_hotadd_state, list);
++		list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
++			list_del(&gap->list);
++			kfree(gap);
++		}
+ 		list_del(&has->list);
+ 		kfree(has);
+ 	}
diff --git a/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch b/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch
new file mode 100644
index 00000000000..a011a9edf2a
--- /dev/null
+++ b/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch
@@ -0,0 +1,67 @@
+From 7cf3b79ec85ee1a5bbaaf936bb1d050dc652983b Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Wed, 24 Aug 2016 16:23:09 -0700
+Subject: Drivers: hv: balloon: keep track of where ha_region starts
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 7cf3b79ec85ee1a5bbaaf936bb1d050dc652983b upstream.
+
+Windows 2012 (non-R2) does not specify hot add region in hot add requests
+and the logic in hot_add_req() is trying to find a 128Mb-aligned region
+covering the request. It may also happen that host's requests are not 128Mb
+aligned and the created ha_region will start before the first specified
+PFN. We can't online these non-present pages but we don't remember the real
+start of the region.
+
+This is a regression introduced by the commit 5abbbb75d733 ("Drivers: hv:
+hv_balloon: don't lose memory when onlining order is not natural"). While
+the idea of keeping the 'moving window' was wrong (as there is no guarantee
+that hot add requests come ordered) we should still keep track of
+covered_start_pfn. This is not a revert, the logic is different.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/hv/hv_balloon.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/drivers/hv/hv_balloon.c
++++ b/drivers/hv/hv_balloon.c
+@@ -430,13 +430,14 @@ struct dm_info_msg {
+  * currently hot added. We hot add in multiples of 128M
+  * chunks; it is possible that we may not be able to bring
+  * online all the pages in the region. The range
+- * covered_end_pfn defines the pages that can
++ * covered_start_pfn:covered_end_pfn defines the pages that can
+  * be brough online.
+  */
+ 
+ struct hv_hotadd_state {
+ 	struct list_head list;
+ 	unsigned long start_pfn;
++	unsigned long covered_start_pfn;
+ 	unsigned long covered_end_pfn;
+ 	unsigned long ha_end_pfn;
+ 	unsigned long end_pfn;
+@@ -682,7 +683,8 @@ static void hv_online_page(struct page *
+ 
+ 	list_for_each(cur, &dm_device.ha_region_list) {
+ 		has = list_entry(cur, struct hv_hotadd_state, list);
+-		cur_start_pgp = (unsigned long)pfn_to_page(has->start_pfn);
++		cur_start_pgp = (unsigned long)
++			pfn_to_page(has->covered_start_pfn);
+ 		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
+ 
+ 		if (((unsigned long)pg >= cur_start_pgp) &&
+@@ -854,6 +856,7 @@ static unsigned long process_hot_add(uns
+ 		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+ 		ha_region->start_pfn = rg_start;
+ 		ha_region->ha_end_pfn = rg_start;
++		ha_region->covered_start_pfn = pg_start;
+ 		ha_region->covered_end_pfn = pg_start;
+ 		ha_region->end_pfn = rg_start + rg_size;
+ 	}
diff --git a/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch b/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch
new file mode 100644
index 00000000000..f6225c3ac79
--- /dev/null
+++ b/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch
@@ -0,0 +1,38 @@
+From 56ef6718a1d8d77745033c5291e025ce18504159 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Wed, 7 Dec 2016 01:16:27 -0800
+Subject: hv: don't reset hv_context.tsc_page on crash
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 56ef6718a1d8d77745033c5291e025ce18504159 upstream.
+
+It may happen that secondary CPUs are still alive and resetting
+hv_context.tsc_page will cause a consequent crash in read_hv_clock_tsc()
+as we don't check for it being not NULL there. It is safe as we're not
+freeing this page anyways.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/hv/hv.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/hv/hv.c
++++ b/drivers/hv/hv.c
+@@ -305,9 +305,10 @@ void hv_cleanup(bool crash)
+ 
+ 		hypercall_msr.as_uint64 = 0;
+ 		wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
+-		if (!crash)
++		if (!crash) {
+ 			vfree(hv_context.tsc_page);
+-		hv_context.tsc_page = NULL;
++			hv_context.tsc_page = NULL;
++		}
+ 	}
+ #endif
+ }
diff --git a/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch b/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch
new file mode 100644
index 00000000000..ef4d2addc85
--- /dev/null
+++ b/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch
@@ -0,0 +1,68 @@
+From 8b3405e345b5a098101b0c31b264c812bba045d9 Mon Sep 17 00:00:00 2001
+From: Suzuki K Poulose <suzuki.poulose@arm.com>
+Date: Mon, 3 Apr 2017 15:12:43 +0100
+Subject: kvm: arm/arm64: Fix locking for kvm_free_stage2_pgd
+
+From: Suzuki K Poulose <suzuki.poulose@arm.com>
+
+commit 8b3405e345b5a098101b0c31b264c812bba045d9 upstream.
+
+In kvm_free_stage2_pgd() we don't hold the kvm->mmu_lock while calling
+unmap_stage2_range() on the entire memory range for the guest. This could
+cause problems with other callers (e.g, munmap on a memslot) trying to
+unmap a range. And since we have to unmap the entire Guest memory range
+holding a spinlock, make sure we yield the lock if necessary, after we
+unmap each PUD range.
+
+Fixes: commit d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
+Cc: Paolo Bonzini <pbonzin@redhat.com>
+Cc: Marc Zyngier <marc.zyngier@arm.com>
+Cc: Christoffer Dall <christoffer.dall@linaro.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+[ Avoid vCPU starvation and lockup detector warnings ]
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Signed-off-by: Christoffer Dall <cdall@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/kvm/mmu.c |   12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/arch/arm/kvm/mmu.c
++++ b/arch/arm/kvm/mmu.c
+@@ -300,6 +300,14 @@ static void unmap_range(struct kvm *kvm,
+ 		next = kvm_pgd_addr_end(addr, end);
+ 		if (!pgd_none(*pgd))
+ 			unmap_puds(kvm, pgd, addr, next);
++		/*
++		 * If we are dealing with a large range in
++		 * stage2 table, release the kvm->mmu_lock
++		 * to prevent starvation and lockup detector
++		 * warnings.
++		 */
++		if (kvm && (next != end))
++			cond_resched_lock(&kvm->mmu_lock);
+ 	} while (pgd++, addr = next, addr != end);
+ }
+ 
+@@ -738,6 +746,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm
+  */
+ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+ {
++	assert_spin_locked(&kvm->mmu_lock);
+ 	unmap_range(kvm, kvm->arch.pgd, start, size);
+ }
+ 
+@@ -824,7 +833,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm
+ 	if (kvm->arch.pgd == NULL)
+ 		return;
+ 
++	spin_lock(&kvm->mmu_lock);
+ 	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
++	spin_unlock(&kvm->mmu_lock);
++
+ 	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
+ 	if (KVM_PREALLOC_LEVEL > 0)
+ 		kfree(kvm->arch.pgd);
diff --git a/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch b/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch
new file mode 100644
index 00000000000..0753f56bf19
--- /dev/null
+++ b/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch
@@ -0,0 +1,53 @@
+From 8f5f525d5b83f7d76a6baf9c4e94d4bf312ea7f6 Mon Sep 17 00:00:00 2001
+From: Oliver O'Halloran <oohall@gmail.com>
+Date: Mon, 3 Apr 2017 13:25:12 +1000
+Subject: powerpc/64: Fix flush_(d|i)cache_range() called from modules
+
+From: Oliver O'Halloran <oohall@gmail.com>
+
+commit 8f5f525d5b83f7d76a6baf9c4e94d4bf312ea7f6 upstream.
+
+When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does
+not include a global entry point. A function's global entry point is
+used when the function is called from a different TOC context and in the
+kernel this typically means a call from a module into the vmlinux (or
+vice-versa).
+
+There are a few exported asm functions declared with _GLOBAL() and
+calling them from a module will likely crash the kernel since any TOC
+relative load will yield garbage.
+
+flush_icache_range() and flush_dcache_range() are both exported to
+modules, and use the TOC, so must use _GLOBAL_TOC().
+
+Fixes: 721aeaa9fdf3 ("powerpc: Build little endian ppc64 kernel with ABIv2")
+Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ arch/powerpc/kernel/misc_64.S |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/misc_64.S
++++ b/arch/powerpc/kernel/misc_64.S
+@@ -67,6 +67,9 @@ PPC64_CACHES:
+  */
+ 
+ _KPROBE(flush_icache_range)
++0:	addis	r2,r12,(.TOC. - 0b)@ha
++	addi	r2, r2,(.TOC. - 0b)@l
++	.localentry flush_icache_range, . - flush_icache_range
+ BEGIN_FTR_SECTION
+ 	PURGE_PREFETCHED_INS
+ 	blr
+@@ -117,7 +120,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_I
+  *
+  *    flush all bytes from start to stop-1 inclusive
+  */
+-_GLOBAL(flush_dcache_range)
++_GLOBAL_TOC(flush_dcache_range)
+ 
+ /*
+  * Flush the data cache to memory 
diff --git a/queue-4.4/series b/queue-4.4/series
index 7454d40e577..785544672dd 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -16,3 +16,13 @@ mmc-sdhci-esdhc-imx-increase-the-pad-i-o-drive-strength-for-ddr50-card.patch
 mac80211-reject-tods-broadcast-data-frames.patch
 ubi-upd-always-flush-after-prepared-for-an-update.patch
 powerpc-kprobe-fix-oops-when-kprobed-on-stdu-instruction.patch
+x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch
+kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch
+powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch
+tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch
+drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch
+drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch
+hv-don-t-reset-hv_context.tsc_page-on-crash.patch
+x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch
+block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch
+tipc-fix-crash-during-node-removal.patch
diff --git a/queue-4.4/tipc-fix-crash-during-node-removal.patch b/queue-4.4/tipc-fix-crash-during-node-removal.patch
new file mode 100644
index 00000000000..f03e48ec7ba
--- /dev/null
+++ b/queue-4.4/tipc-fix-crash-during-node-removal.patch
@@ -0,0 +1,106 @@
+From d25a01257e422a4bdeb426f69529d57c73b235fe Mon Sep 17 00:00:00 2001
+From: Jon Paul Maloy <jon.maloy@ericsson.com>
+Date: Wed, 24 Feb 2016 11:10:48 -0500
+Subject: tipc: fix crash during node removal
+
+From: Jon Paul Maloy <jon.maloy@ericsson.com>
+
+commit d25a01257e422a4bdeb426f69529d57c73b235fe upstream.
+
+When the TIPC module is unloaded, we have identified a race condition
+that allows a node reference counter to go to zero and the node instance
+being freed before the node timer is finished with accessing it. This
+leads to occasional crashes, especially in multi-namespace environments.
+
+The scenario goes as follows:
+
+CPU0:(node_stop)                       CPU1:(node_timeout)  // ref == 2
+
+1:                                          if(!mod_timer())
+2: if (del_timer())
+3:   tipc_node_put()                                        // ref -> 1
+4: tipc_node_put()                                          // ref -> 0
+5:   kfree_rcu(node);
+6:                                               tipc_node_get(node)
+7:                                               // BOOM!
+
+We now clean up this functionality as follows:
+
+1) We remove the node pointer from the node lookup table before we
+   attempt deactivating the timer. This way, we reduce the risk that
+   tipc_node_find() may obtain a valid pointer to an instance marked
+   for deletion; a harmless but undesirable situation.
+
+2) We use del_timer_sync() instead of del_timer() to safely deactivate
+   the node timer without any risk that it might be reactivated by the
+   timeout handler. There is no risk of deadlock here, since the two
+   functions never touch the same spinlocks.
+
+3) We remove a pointless tipc_node_get() + tipc_node_put() from the
+   timeout handler.
+
+Reported-by: Zhijiang Hu <huzhijiang@gmail.com>
+Acked-by: Ying Xue <ying.xue@windriver.com>
+Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/tipc/node.c |   24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/net/tipc/node.c
++++ b/net/tipc/node.c
+@@ -102,9 +102,10 @@ static unsigned int tipc_hashfn(u32 addr
+ 
+ static void tipc_node_kref_release(struct kref *kref)
+ {
+-	struct tipc_node *node = container_of(kref, struct tipc_node, kref);
++	struct tipc_node *n = container_of(kref, struct tipc_node, kref);
+ 
+-	tipc_node_delete(node);
++	kfree(n->bc_entry.link);
++	kfree_rcu(n, rcu);
+ }
+ 
+ void tipc_node_put(struct tipc_node *node)
+@@ -216,21 +217,20 @@ static void tipc_node_delete(struct tipc
+ {
+ 	list_del_rcu(&node->list);
+ 	hlist_del_rcu(&node->hash);
+-	kfree(node->bc_entry.link);
+-	kfree_rcu(node, rcu);
++	tipc_node_put(node);
++
++	del_timer_sync(&node->timer);
++	tipc_node_put(node);
+ }
+ 
+ void tipc_node_stop(struct net *net)
+ {
+-	struct tipc_net *tn = net_generic(net, tipc_net_id);
++	struct tipc_net *tn = tipc_net(net);
+ 	struct tipc_node *node, *t_node;
+ 
+ 	spin_lock_bh(&tn->node_list_lock);
+-	list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
+-		if (del_timer(&node->timer))
+-			tipc_node_put(node);
+-		tipc_node_put(node);
+-	}
++	list_for_each_entry_safe(node, t_node, &tn->node_list, list)
++		tipc_node_delete(node);
+ 	spin_unlock_bh(&tn->node_list_lock);
+ }
+ 
+@@ -313,9 +313,7 @@ static void tipc_node_timeout(unsigned l
+ 		if (rc & TIPC_LINK_DOWN_EVT)
+ 			tipc_node_link_down(n, bearer_id, false);
+ 	}
+-	if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+-		tipc_node_get(n);
+-	tipc_node_put(n);
++	mod_timer(&n->timer, jiffies + n->keepalive_intv);
+ }
+ 
+ /**
diff --git a/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch b/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch
new file mode 100644
index 00000000000..843c01355b8
--- /dev/null
+++ b/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch
@@ -0,0 +1,34 @@
+From 26840437cbd6d3625ea6ab34e17cd34bb810c861 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Wed, 6 Jul 2016 18:24:10 -0700
+Subject: Tools: hv: kvp: ensure kvp device fd is closed on exec
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 26840437cbd6d3625ea6ab34e17cd34bb810c861 upstream.
+
+KVP daemon does fork()/exec() (with popen()) so we need to close our fds
+to avoid sharing them with child processes. The immediate implication of
+not doing so I see is SELinux complaining about 'ip' trying to access
+'/dev/vmbus/hv_kvp'.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/hv/hv_kvp_daemon.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/hv/hv_kvp_daemon.c
++++ b/tools/hv/hv_kvp_daemon.c
+@@ -1433,7 +1433,7 @@ int main(int argc, char *argv[])
+ 	openlog("KVP", 0, LOG_USER);
+ 	syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
+ 
+-	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR);
++	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
+ 
+ 	if (kvp_fd < 0) {
+ 		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
diff --git a/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch b/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch
new file mode 100644
index 00000000000..33ea3616f12
--- /dev/null
+++ b/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch
@@ -0,0 +1,53 @@
+From 29f72ce3e4d18066ec75c79c857bee0618a3504b Mon Sep 17 00:00:00 2001
+From: Yazen Ghannam <yazen.ghannam@amd.com>
+Date: Thu, 30 Mar 2017 13:17:14 +0200
+Subject: x86/mce/AMD: Give a name to MCA bank 3 when accessed with legacy MSRs
+
+From: Yazen Ghannam <yazen.ghannam@amd.com>
+
+commit 29f72ce3e4d18066ec75c79c857bee0618a3504b upstream.
+
+MCA bank 3 is reserved on systems pre-Fam17h, so it didn't have a name.
+However, MCA bank 3 is defined on Fam17h systems and can be accessed
+using legacy MSRs. Without a name we get a stack trace on Fam17h systems
+when trying to register sysfs files for bank 3 on kernels that don't
+recognize Scalable MCA.
+
+Call MCA bank 3 "decode_unit" since this is what it represents on
+Fam17h. This will allow kernels without SMCA support to see this bank on
+Fam17h+ and prevent the stack trace. This will not affect older systems
+since this bank is reserved on them, i.e. it'll be ignored.
+
+Tested on AMD Fam15h and Fam17h systems.
+
+  WARNING: CPU: 26 PID: 1 at lib/kobject.c:210 kobject_add_internal
+  kobject: (ffff88085bb256c0): attempted to be registered with empty name!
+  ...
+  Call Trace:
+   kobject_add_internal
+   kobject_add
+   kobject_create_and_add
+   threshold_create_device
+   threshold_init_device
+
+Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Link: http://lkml.kernel.org/r/1490102285-3659-1-git-send-email-Yazen.Ghannam@amd.com
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/mcheck/mce_amd.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
+@@ -53,7 +53,7 @@ static const char * const th_names[] = {
+ 	"load_store",
+ 	"insn_fetch",
+ 	"combined_unit",
+-	"",
++	"decode_unit",
+ 	"northbridge",
+ 	"execution_unit",
+ };
diff --git a/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch b/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch
new file mode 100644
index 00000000000..a2ba054eb7e
--- /dev/null
+++ b/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch
@@ -0,0 +1,109 @@
+From 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 6 Apr 2017 09:04:31 -0700
+Subject: x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd upstream.
+
+Before we rework the "pmem api" to stop abusing __copy_user_nocache()
+for memcpy_to_pmem() we need to fix cases where we may strand dirty data
+in the cpu cache. The problem occurs when copy_from_iter_pmem() is used
+for arbitrary data transfers from userspace. There is no guarantee that
+these transfers, performed by dax_iomap_actor(), will have aligned
+destinations or aligned transfer lengths. Backstop the usage of
+__copy_user_nocache() with explicit cache management in these unaligned
+cases.
+
+Yes, copy_from_iter_pmem() is now too big for an inline, but addressing
+that is saved for a later patch that moves the entirety of the "pmem
+api" into the pmem driver directly.
+
+Fixes: 5de490daec8b ("pmem: add copy_from_iter_pmem() and clear_pmem()")
+Cc: <x86@kernel.org>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Jeff Moyer <jmoyer@redhat.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Matthew Wilcox <mawilcox@microsoft.com>
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/pmem.h |   45 +++++++++++++++++++++++++++++++-------------
+ 1 file changed, 32 insertions(+), 13 deletions(-)
+
+--- a/arch/x86/include/asm/pmem.h
++++ b/arch/x86/include/asm/pmem.h
+@@ -72,8 +72,8 @@ static inline void arch_wmb_pmem(void)
+  * @size:	number of bytes to write back
+  *
+  * Write back a cache range using the CLWB (cache line write back)
+- * instruction.  This function requires explicit ordering with an
+- * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
++ * instruction. Note that @size is internally rounded up to be cache
++ * line size aligned.
+  */
+ static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+ {
+@@ -87,15 +87,6 @@ static inline void __arch_wb_cache_pmem(
+ 		clwb(p);
+ }
+ 
+-/*
+- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
+- * iterators, so for other types (bvec & kvec) we must do a cache write-back.
+- */
+-static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+-{
+-	return iter_is_iovec(i) == false;
+-}
+-
+ /**
+  * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+  * @addr:	PMEM destination address
+@@ -114,8 +105,36 @@ static inline size_t arch_copy_from_iter
+ 	/* TODO: skip the write-back by always using non-temporal stores */
+ 	len = copy_from_iter_nocache(vaddr, bytes, i);
+ 
+-	if (__iter_needs_pmem_wb(i))
+-		__arch_wb_cache_pmem(vaddr, bytes);
++	/*
++	 * In the iovec case on x86_64 copy_from_iter_nocache() uses
++	 * non-temporal stores for the bulk of the transfer, but we need
++	 * to manually flush if the transfer is unaligned. A cached
++	 * memory copy is used when destination or size is not naturally
++	 * aligned. That is:
++	 *   - Require 8-byte alignment when size is 8 bytes or larger.
++	 *   - Require 4-byte alignment when size is 4 bytes.
++	 *
++	 * In the non-iovec case the entire destination needs to be
++	 * flushed.
++	 */
++	if (iter_is_iovec(i)) {
++		unsigned long flushed, dest = (unsigned long) addr;
++
++		if (bytes < 8) {
++			if (!IS_ALIGNED(dest, 4) || (bytes != 4))
++				__arch_wb_cache_pmem(addr, 1);
++		} else {
++			if (!IS_ALIGNED(dest, 8)) {
++				dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
++				__arch_wb_cache_pmem(addr, 1);
++			}
++
++			flushed = dest - (unsigned long) addr;
++			if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
++				__arch_wb_cache_pmem(addr + bytes - 1, 1);
++		}
++	} else
++		__arch_wb_cache_pmem(addr, bytes);
+ 
+ 	return len;
+ }