From: Greg Kroah-Hartman Date: Tue, 25 Apr 2017 12:20:46 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v4.4.64~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b39dbc5b6dee5bc9e28b1b8bf50e00afebb976cc;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch hv-don-t-reset-hv_context.tsc_page-on-crash.patch kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch tipc-fix-crash-during-node-removal.patch tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch --- diff --git a/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch b/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch new file mode 100644 index 00000000000..77cd86a1c07 --- /dev/null +++ b/queue-4.4/block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch @@ -0,0 +1,66 @@ +From ac34f15e0c6d2fd58480052b6985f6991fb53bcc Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Tue, 29 Dec 2015 14:02:29 -0800 +Subject: block: fix del_gendisk() vs blkdev_ioctl crash + +From: Dan Williams + +commit ac34f15e0c6d2fd58480052b6985f6991fb53bcc upstream. + +When tearing down a block device early in its lifetime, userspace may +still be performing discovery actions like blkdev_ioctl() to re-read +partitions. + +The nvdimm_revalidate_disk() implementation depends on +disk->driverfs_dev to be valid at entry. However, it is set to NULL in +del_gendisk() and fatally this is happening *before* the disk device is +deleted from userspace view. + +There's no reason for del_gendisk() to clear ->driverfs_dev. That +device is the parent of the disk. It is guaranteed to not be freed +until the disk, as a child, drops its ->parent reference. + +We could also fix this issue locally in nvdimm_revalidate_disk() by +using disk_to_dev(disk)->parent, but lets fix it globally since +->driverfs_dev follows the lifetime of the parent. Longer term we +should probably just add a @parent parameter to add_disk(), and stop +carrying this pointer in the gendisk. + + BUG: unable to handle kernel NULL pointer dereference at (null) + IP: [] nvdimm_revalidate_disk+0x18/0x90 [libnvdimm] + CPU: 2 PID: 538 Comm: systemd-udevd Tainted: G O 4.4.0-rc5 #2257 + [..] + Call Trace: + [] rescan_partitions+0x87/0x2c0 + [] ? __lock_is_held+0x49/0x70 + [] __blkdev_reread_part+0x72/0xb0 + [] blkdev_reread_part+0x25/0x40 + [] blkdev_ioctl+0x4fd/0x9c0 + [] ? current_kernel_time64+0x69/0xd0 + [] block_ioctl+0x3d/0x50 + [] do_vfs_ioctl+0x308/0x560 + [] ? __audit_syscall_entry+0xb1/0x100 + [] ? do_audit_syscall_entry+0x66/0x70 + [] SyS_ioctl+0x79/0x90 + [] entry_SYSCALL_64_fastpath+0x12/0x76 + +Cc: Jan Kara +Cc: Jens Axboe +Reported-by: Robert Hu +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman + +--- + block/genhd.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -664,7 +664,6 @@ void del_gendisk(struct gendisk *disk) + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); +- disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); diff --git a/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch b/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch new file mode 100644 index 00000000000..ed728fd7e3e --- /dev/null +++ b/queue-4.4/drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch @@ -0,0 +1,271 @@ +From cb7a5724c7e1bfb5766ad1c3beba14cc715991cf Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Wed, 24 Aug 2016 16:23:10 -0700 +Subject: Drivers: hv: balloon: account for gaps in hot add regions + +From: Vitaly Kuznetsov + +commit cb7a5724c7e1bfb5766ad1c3beba14cc715991cf upstream. + +I'm observing the following hot add requests from the WS2012 host: + +hot_add_req: start_pfn = 0x108200 count = 330752 +hot_add_req: start_pfn = 0x158e00 count = 193536 +hot_add_req: start_pfn = 0x188400 count = 239616 + +As the host doesn't specify hot add regions we're trying to create +128Mb-aligned region covering the first request, we create the 0x108000 - +0x160000 region and we add 0x108000 - 0x158e00 memory. The second request +passes the pfn_covered() check, we enlarge the region to 0x108000 - +0x190000 and add 0x158e00 - 0x188200 memory. The problem emerges with the +third request as it starts at 0x188400 so there is a 0x200 gap which is +not covered. As the end of our region is 0x190000 now it again passes the +pfn_covered() check were we just adjust the covered_end_pfn and make it +0x188400 instead of 0x188200 which means that we'll try to online +0x188200-0x188400 pages but these pages were never assigned to us and we +crash. + +We can't react to such requests by creating new hot add regions as it may +happen that the whole suggested range falls into the previously identified +128Mb-aligned area so we'll end up adding nothing or create intersecting +regions and our current logic doesn't allow that. Instead, create a list of +such 'gaps' and check for them in the page online callback. + +Signed-off-by: Vitaly Kuznetsov +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Sumit Semwal +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hv/hv_balloon.c | 131 ++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 94 insertions(+), 37 deletions(-) + +--- a/drivers/hv/hv_balloon.c ++++ b/drivers/hv/hv_balloon.c +@@ -441,6 +441,16 @@ struct hv_hotadd_state { + unsigned long covered_end_pfn; + unsigned long ha_end_pfn; + unsigned long end_pfn; ++ /* ++ * A list of gaps. ++ */ ++ struct list_head gap_list; ++}; ++ ++struct hv_hotadd_gap { ++ struct list_head list; ++ unsigned long start_pfn; ++ unsigned long end_pfn; + }; + + struct balloon_state { +@@ -596,18 +606,46 @@ static struct notifier_block hv_memory_n + .priority = 0 + }; + ++/* Check if the particular page is backed and can be onlined and online it. */ ++static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) ++{ ++ unsigned long cur_start_pgp; ++ unsigned long cur_end_pgp; ++ struct hv_hotadd_gap *gap; ++ ++ cur_start_pgp = (unsigned long)pfn_to_page(has->covered_start_pfn); ++ cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); ++ ++ /* The page is not backed. */ ++ if (((unsigned long)pg < cur_start_pgp) || ++ ((unsigned long)pg >= cur_end_pgp)) ++ return; ++ ++ /* Check for gaps. */ ++ list_for_each_entry(gap, &has->gap_list, list) { ++ cur_start_pgp = (unsigned long) ++ pfn_to_page(gap->start_pfn); ++ cur_end_pgp = (unsigned long) ++ pfn_to_page(gap->end_pfn); ++ if (((unsigned long)pg >= cur_start_pgp) && ++ ((unsigned long)pg < cur_end_pgp)) { ++ return; ++ } ++ } + +-static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size) ++ /* This frame is currently backed; online the page. */ ++ __online_page_set_limits(pg); ++ __online_page_increment_counters(pg); ++ __online_page_free(pg); ++} ++ ++static void hv_bring_pgs_online(struct hv_hotadd_state *has, ++ unsigned long start_pfn, unsigned long size) + { + int i; + +- for (i = 0; i < size; i++) { +- struct page *pg; +- pg = pfn_to_page(start_pfn + i); +- __online_page_set_limits(pg); +- __online_page_increment_counters(pg); +- __online_page_free(pg); +- } ++ for (i = 0; i < size; i++) ++ hv_page_online_one(has, pfn_to_page(start_pfn + i)); + } + + static void hv_mem_hot_add(unsigned long start, unsigned long size, +@@ -684,26 +722,24 @@ static void hv_online_page(struct page * + list_for_each(cur, &dm_device.ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); + cur_start_pgp = (unsigned long) +- pfn_to_page(has->covered_start_pfn); +- cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); ++ pfn_to_page(has->start_pfn); ++ cur_end_pgp = (unsigned long)pfn_to_page(has->end_pfn); + +- if (((unsigned long)pg >= cur_start_pgp) && +- ((unsigned long)pg < cur_end_pgp)) { +- /* +- * This frame is currently backed; online the +- * page. +- */ +- __online_page_set_limits(pg); +- __online_page_increment_counters(pg); +- __online_page_free(pg); +- } ++ /* The page belongs to a different HAS. */ ++ if (((unsigned long)pg < cur_start_pgp) || ++ ((unsigned long)pg >= cur_end_pgp)) ++ continue; ++ ++ hv_page_online_one(has, pg); ++ break; + } + } + +-static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) ++static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) + { + struct list_head *cur; + struct hv_hotadd_state *has; ++ struct hv_hotadd_gap *gap; + unsigned long residual, new_inc; + + if (list_empty(&dm_device.ha_region_list)) +@@ -718,6 +754,24 @@ static bool pfn_covered(unsigned long st + */ + if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) + continue; ++ ++ /* ++ * If the current start pfn is not where the covered_end ++ * is, create a gap and update covered_end_pfn. ++ */ ++ if (has->covered_end_pfn != start_pfn) { ++ gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); ++ if (!gap) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&gap->list); ++ gap->start_pfn = has->covered_end_pfn; ++ gap->end_pfn = start_pfn; ++ list_add_tail(&gap->list, &has->gap_list); ++ ++ has->covered_end_pfn = start_pfn; ++ } ++ + /* + * If the current hot add-request extends beyond + * our current limit; extend it. +@@ -734,19 +788,10 @@ static bool pfn_covered(unsigned long st + has->end_pfn += new_inc; + } + +- /* +- * If the current start pfn is not where the covered_end +- * is, update it. +- */ +- +- if (has->covered_end_pfn != start_pfn) +- has->covered_end_pfn = start_pfn; +- +- return true; +- ++ return 1; + } + +- return false; ++ return 0; + } + + static unsigned long handle_pg_range(unsigned long pg_start, +@@ -785,6 +830,8 @@ static unsigned long handle_pg_range(uns + if (pgs_ol > pfn_cnt) + pgs_ol = pfn_cnt; + ++ has->covered_end_pfn += pgs_ol; ++ pfn_cnt -= pgs_ol; + /* + * Check if the corresponding memory block is already + * online by checking its last previously backed page. +@@ -793,10 +840,8 @@ static unsigned long handle_pg_range(uns + */ + if (start_pfn > has->start_pfn && + !PageReserved(pfn_to_page(start_pfn - 1))) +- hv_bring_pgs_online(start_pfn, pgs_ol); ++ hv_bring_pgs_online(has, start_pfn, pgs_ol); + +- has->covered_end_pfn += pgs_ol; +- pfn_cnt -= pgs_ol; + } + + if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { +@@ -834,13 +879,19 @@ static unsigned long process_hot_add(uns + unsigned long rg_size) + { + struct hv_hotadd_state *ha_region = NULL; ++ int covered; + + if (pfn_cnt == 0) + return 0; + +- if (!dm_device.host_specified_ha_region) +- if (pfn_covered(pg_start, pfn_cnt)) ++ if (!dm_device.host_specified_ha_region) { ++ covered = pfn_covered(pg_start, pfn_cnt); ++ if (covered < 0) ++ return 0; ++ ++ if (covered) + goto do_pg_range; ++ } + + /* + * If the host has specified a hot-add range; deal with it first. +@@ -852,6 +903,7 @@ static unsigned long process_hot_add(uns + return 0; + + INIT_LIST_HEAD(&ha_region->list); ++ INIT_LIST_HEAD(&ha_region->gap_list); + + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + ha_region->start_pfn = rg_start; +@@ -1584,6 +1636,7 @@ static int balloon_remove(struct hv_devi + struct hv_dynmem_device *dm = hv_get_drvdata(dev); + struct list_head *cur, *tmp; + struct hv_hotadd_state *has; ++ struct hv_hotadd_gap *gap, *tmp_gap; + + if (dm->num_pages_ballooned != 0) + pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); +@@ -1600,6 +1653,10 @@ static int balloon_remove(struct hv_devi + #endif + list_for_each_safe(cur, tmp, &dm->ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); ++ list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { ++ list_del(&gap->list); ++ kfree(gap); ++ } + list_del(&has->list); + kfree(has); + } diff --git a/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch b/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch new file mode 100644 index 00000000000..a011a9edf2a --- /dev/null +++ b/queue-4.4/drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch @@ -0,0 +1,67 @@ +From 7cf3b79ec85ee1a5bbaaf936bb1d050dc652983b Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Wed, 24 Aug 2016 16:23:09 -0700 +Subject: Drivers: hv: balloon: keep track of where ha_region starts + +From: Vitaly Kuznetsov + +commit 7cf3b79ec85ee1a5bbaaf936bb1d050dc652983b upstream. + +Windows 2012 (non-R2) does not specify hot add region in hot add requests +and the logic in hot_add_req() is trying to find a 128Mb-aligned region +covering the request. It may also happen that host's requests are not 128Mb +aligned and the created ha_region will start before the first specified +PFN. We can't online these non-present pages but we don't remember the real +start of the region. + +This is a regression introduced by the commit 5abbbb75d733 ("Drivers: hv: +hv_balloon: don't lose memory when onlining order is not natural"). While +the idea of keeping the 'moving window' was wrong (as there is no guarantee +that hot add requests come ordered) we should still keep track of +covered_start_pfn. This is not a revert, the logic is different. + +Signed-off-by: Vitaly Kuznetsov +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Sumit Semwal +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hv/hv_balloon.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/hv/hv_balloon.c ++++ b/drivers/hv/hv_balloon.c +@@ -430,13 +430,14 @@ struct dm_info_msg { + * currently hot added. We hot add in multiples of 128M + * chunks; it is possible that we may not be able to bring + * online all the pages in the region. The range +- * covered_end_pfn defines the pages that can ++ * covered_start_pfn:covered_end_pfn defines the pages that can + * be brough online. + */ + + struct hv_hotadd_state { + struct list_head list; + unsigned long start_pfn; ++ unsigned long covered_start_pfn; + unsigned long covered_end_pfn; + unsigned long ha_end_pfn; + unsigned long end_pfn; +@@ -682,7 +683,8 @@ static void hv_online_page(struct page * + + list_for_each(cur, &dm_device.ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); +- cur_start_pgp = (unsigned long)pfn_to_page(has->start_pfn); ++ cur_start_pgp = (unsigned long) ++ pfn_to_page(has->covered_start_pfn); + cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); + + if (((unsigned long)pg >= cur_start_pgp) && +@@ -854,6 +856,7 @@ static unsigned long process_hot_add(uns + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + ha_region->start_pfn = rg_start; + ha_region->ha_end_pfn = rg_start; ++ ha_region->covered_start_pfn = pg_start; + ha_region->covered_end_pfn = pg_start; + ha_region->end_pfn = rg_start + rg_size; + } diff --git a/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch b/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch new file mode 100644 index 00000000000..f6225c3ac79 --- /dev/null +++ b/queue-4.4/hv-don-t-reset-hv_context.tsc_page-on-crash.patch @@ -0,0 +1,38 @@ +From 56ef6718a1d8d77745033c5291e025ce18504159 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Wed, 7 Dec 2016 01:16:27 -0800 +Subject: hv: don't reset hv_context.tsc_page on crash + +From: Vitaly Kuznetsov + +commit 56ef6718a1d8d77745033c5291e025ce18504159 upstream. + +It may happen that secondary CPUs are still alive and resetting +hv_context.tsc_page will cause a consequent crash in read_hv_clock_tsc() +as we don't check for it being not NULL there. It is safe as we're not +freeing this page anyways. + +Signed-off-by: Vitaly Kuznetsov +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Sumit Semwal +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hv/hv.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/hv/hv.c ++++ b/drivers/hv/hv.c +@@ -305,9 +305,10 @@ void hv_cleanup(bool crash) + + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); +- if (!crash) ++ if (!crash) { + vfree(hv_context.tsc_page); +- hv_context.tsc_page = NULL; ++ hv_context.tsc_page = NULL; ++ } + } + #endif + } diff --git a/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch b/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch new file mode 100644 index 00000000000..ef4d2addc85 --- /dev/null +++ b/queue-4.4/kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch @@ -0,0 +1,68 @@ +From 8b3405e345b5a098101b0c31b264c812bba045d9 Mon Sep 17 00:00:00 2001 +From: Suzuki K Poulose +Date: Mon, 3 Apr 2017 15:12:43 +0100 +Subject: kvm: arm/arm64: Fix locking for kvm_free_stage2_pgd + +From: Suzuki K Poulose + +commit 8b3405e345b5a098101b0c31b264c812bba045d9 upstream. + +In kvm_free_stage2_pgd() we don't hold the kvm->mmu_lock while calling +unmap_stage2_range() on the entire memory range for the guest. This could +cause problems with other callers (e.g, munmap on a memslot) trying to +unmap a range. And since we have to unmap the entire Guest memory range +holding a spinlock, make sure we yield the lock if necessary, after we +unmap each PUD range. + +Fixes: commit d5d8184d35c9 ("KVM: ARM: Memory virtualization setup") +Cc: Paolo Bonzini +Cc: Marc Zyngier +Cc: Christoffer Dall +Cc: Mark Rutland +Signed-off-by: Suzuki K Poulose +[ Avoid vCPU starvation and lockup detector warnings ] +Signed-off-by: Marc Zyngier +Signed-off-by: Suzuki K Poulose +Signed-off-by: Christoffer Dall +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/kvm/mmu.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/arch/arm/kvm/mmu.c ++++ b/arch/arm/kvm/mmu.c +@@ -300,6 +300,14 @@ static void unmap_range(struct kvm *kvm, + next = kvm_pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_puds(kvm, pgd, addr, next); ++ /* ++ * If we are dealing with a large range in ++ * stage2 table, release the kvm->mmu_lock ++ * to prevent starvation and lockup detector ++ * warnings. ++ */ ++ if (kvm && (next != end)) ++ cond_resched_lock(&kvm->mmu_lock); + } while (pgd++, addr = next, addr != end); + } + +@@ -738,6 +746,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm + */ + static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) + { ++ assert_spin_locked(&kvm->mmu_lock); + unmap_range(kvm, kvm->arch.pgd, start, size); + } + +@@ -824,7 +833,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm + if (kvm->arch.pgd == NULL) + return; + ++ spin_lock(&kvm->mmu_lock); + unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); ++ spin_unlock(&kvm->mmu_lock); ++ + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); + if (KVM_PREALLOC_LEVEL > 0) + kfree(kvm->arch.pgd); diff --git a/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch b/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch new file mode 100644 index 00000000000..0753f56bf19 --- /dev/null +++ b/queue-4.4/powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch @@ -0,0 +1,53 @@ +From 8f5f525d5b83f7d76a6baf9c4e94d4bf312ea7f6 Mon Sep 17 00:00:00 2001 +From: Oliver O'Halloran +Date: Mon, 3 Apr 2017 13:25:12 +1000 +Subject: powerpc/64: Fix flush_(d|i)cache_range() called from modules + +From: Oliver O'Halloran + +commit 8f5f525d5b83f7d76a6baf9c4e94d4bf312ea7f6 upstream. + +When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does +not include a global entry point. A function's global entry point is +used when the function is called from a different TOC context and in the +kernel this typically means a call from a module into the vmlinux (or +vice-versa). + +There are a few exported asm functions declared with _GLOBAL() and +calling them from a module will likely crash the kernel since any TOC +relative load will yield garbage. + +flush_icache_range() and flush_dcache_range() are both exported to +modules, and use the TOC, so must use _GLOBAL_TOC(). + +Fixes: 721aeaa9fdf3 ("powerpc: Build little endian ppc64 kernel with ABIv2") +Signed-off-by: Oliver O'Halloran +Signed-off-by: Michael Ellerman +Signed-off-by: Greg Kroah-Hartman + + +--- + arch/powerpc/kernel/misc_64.S | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/arch/powerpc/kernel/misc_64.S ++++ b/arch/powerpc/kernel/misc_64.S +@@ -67,6 +67,9 @@ PPC64_CACHES: + */ + + _KPROBE(flush_icache_range) ++0: addis r2,r12,(.TOC. - 0b)@ha ++ addi r2, r2,(.TOC. - 0b)@l ++ .localentry flush_icache_range, . - flush_icache_range + BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS + blr +@@ -117,7 +120,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_I + * + * flush all bytes from start to stop-1 inclusive + */ +-_GLOBAL(flush_dcache_range) ++_GLOBAL_TOC(flush_dcache_range) + + /* + * Flush the data cache to memory diff --git a/queue-4.4/series b/queue-4.4/series index 7454d40e577..785544672dd 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -16,3 +16,13 @@ mmc-sdhci-esdhc-imx-increase-the-pad-i-o-drive-strength-for-ddr50-card.patch mac80211-reject-tods-broadcast-data-frames.patch ubi-upd-always-flush-after-prepared-for-an-update.patch powerpc-kprobe-fix-oops-when-kprobed-on-stdu-instruction.patch +x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch +kvm-arm-arm64-fix-locking-for-kvm_free_stage2_pgd.patch +powerpc-64-fix-flush_-d-i-cache_range-called-from-modules.patch +tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch +drivers-hv-balloon-keep-track-of-where-ha_region-starts.patch +drivers-hv-balloon-account-for-gaps-in-hot-add-regions.patch +hv-don-t-reset-hv_context.tsc_page-on-crash.patch +x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch +block-fix-del_gendisk-vs-blkdev_ioctl-crash.patch +tipc-fix-crash-during-node-removal.patch diff --git a/queue-4.4/tipc-fix-crash-during-node-removal.patch b/queue-4.4/tipc-fix-crash-during-node-removal.patch new file mode 100644 index 00000000000..f03e48ec7ba --- /dev/null +++ b/queue-4.4/tipc-fix-crash-during-node-removal.patch @@ -0,0 +1,106 @@ +From d25a01257e422a4bdeb426f69529d57c73b235fe Mon Sep 17 00:00:00 2001 +From: Jon Paul Maloy +Date: Wed, 24 Feb 2016 11:10:48 -0500 +Subject: tipc: fix crash during node removal + +From: Jon Paul Maloy + +commit d25a01257e422a4bdeb426f69529d57c73b235fe upstream. + +When the TIPC module is unloaded, we have identified a race condition +that allows a node reference counter to go to zero and the node instance +being freed before the node timer is finished with accessing it. This +leads to occasional crashes, especially in multi-namespace environments. + +The scenario goes as follows: + +CPU0:(node_stop) CPU1:(node_timeout) // ref == 2 + +1: if(!mod_timer()) +2: if (del_timer()) +3: tipc_node_put() // ref -> 1 +4: tipc_node_put() // ref -> 0 +5: kfree_rcu(node); +6: tipc_node_get(node) +7: // BOOM! + +We now clean up this functionality as follows: + +1) We remove the node pointer from the node lookup table before we + attempt deactivating the timer. This way, we reduce the risk that + tipc_node_find() may obtain a valid pointer to an instance marked + for deletion; a harmless but undesirable situation. + +2) We use del_timer_sync() instead of del_timer() to safely deactivate + the node timer without any risk that it might be reactivated by the + timeout handler. There is no risk of deadlock here, since the two + functions never touch the same spinlocks. + +3: We remove a pointless tipc_node_get() + tipc_node_put() from the + timeout handler. + +Reported-by: Zhijiang Hu +Acked-by: Ying Xue +Signed-off-by: Jon Maloy +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + net/tipc/node.c | 24 +++++++++++------------- + 1 file changed, 11 insertions(+), 13 deletions(-) + +--- a/net/tipc/node.c ++++ b/net/tipc/node.c +@@ -102,9 +102,10 @@ static unsigned int tipc_hashfn(u32 addr + + static void tipc_node_kref_release(struct kref *kref) + { +- struct tipc_node *node = container_of(kref, struct tipc_node, kref); ++ struct tipc_node *n = container_of(kref, struct tipc_node, kref); + +- tipc_node_delete(node); ++ kfree(n->bc_entry.link); ++ kfree_rcu(n, rcu); + } + + void tipc_node_put(struct tipc_node *node) +@@ -216,21 +217,20 @@ static void tipc_node_delete(struct tipc + { + list_del_rcu(&node->list); + hlist_del_rcu(&node->hash); +- kfree(node->bc_entry.link); +- kfree_rcu(node, rcu); ++ tipc_node_put(node); ++ ++ del_timer_sync(&node->timer); ++ tipc_node_put(node); + } + + void tipc_node_stop(struct net *net) + { +- struct tipc_net *tn = net_generic(net, tipc_net_id); ++ struct tipc_net *tn = tipc_net(net); + struct tipc_node *node, *t_node; + + spin_lock_bh(&tn->node_list_lock); +- list_for_each_entry_safe(node, t_node, &tn->node_list, list) { +- if (del_timer(&node->timer)) +- tipc_node_put(node); +- tipc_node_put(node); +- } ++ list_for_each_entry_safe(node, t_node, &tn->node_list, list) ++ tipc_node_delete(node); + spin_unlock_bh(&tn->node_list_lock); + } + +@@ -313,9 +313,7 @@ static void tipc_node_timeout(unsigned l + if (rc & TIPC_LINK_DOWN_EVT) + tipc_node_link_down(n, bearer_id, false); + } +- if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) +- tipc_node_get(n); +- tipc_node_put(n); ++ mod_timer(&n->timer, jiffies + n->keepalive_intv); + } + + /** diff --git a/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch b/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch new file mode 100644 index 00000000000..843c01355b8 --- /dev/null +++ b/queue-4.4/tools-hv-kvp-ensure-kvp-device-fd-is-closed-on-exec.patch @@ -0,0 +1,34 @@ +From 26840437cbd6d3625ea6ab34e17cd34bb810c861 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Wed, 6 Jul 2016 18:24:10 -0700 +Subject: Tools: hv: kvp: ensure kvp device fd is closed on exec + +From: Vitaly Kuznetsov + +commit 26840437cbd6d3625ea6ab34e17cd34bb810c861 upstream. + +KVP daemon does fork()/exec() (with popen()) so we need to close our fds +to avoid sharing them with child processes. The immediate implication of +not doing so I see is SELinux complaining about 'ip' trying to access +'/dev/vmbus/hv_kvp'. + +Signed-off-by: Vitaly Kuznetsov +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Sumit Semwal +Signed-off-by: Greg Kroah-Hartman + +--- + tools/hv/hv_kvp_daemon.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/tools/hv/hv_kvp_daemon.c ++++ b/tools/hv/hv_kvp_daemon.c +@@ -1433,7 +1433,7 @@ int main(int argc, char *argv[]) + openlog("KVP", 0, LOG_USER); + syslog(LOG_INFO, "KVP starting; pid is:%d", getpid()); + +- kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR); ++ kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC); + + if (kvp_fd < 0) { + syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s", diff --git a/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch b/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch new file mode 100644 index 00000000000..33ea3616f12 --- /dev/null +++ b/queue-4.4/x86-mce-amd-give-a-name-to-mca-bank-3-when-accessed-with-legacy-msrs.patch @@ -0,0 +1,53 @@ +From 29f72ce3e4d18066ec75c79c857bee0618a3504b Mon Sep 17 00:00:00 2001 +From: Yazen Ghannam +Date: Thu, 30 Mar 2017 13:17:14 +0200 +Subject: x86/mce/AMD: Give a name to MCA bank 3 when accessed with legacy MSRs + +From: Yazen Ghannam + +commit 29f72ce3e4d18066ec75c79c857bee0618a3504b upstream. + +MCA bank 3 is reserved on systems pre-Fam17h, so it didn't have a name. +However, MCA bank 3 is defined on Fam17h systems and can be accessed +using legacy MSRs. Without a name we get a stack trace on Fam17h systems +when trying to register sysfs files for bank 3 on kernels that don't +recognize Scalable MCA. + +Call MCA bank 3 "decode_unit" since this is what it represents on +Fam17h. This will allow kernels without SMCA support to see this bank on +Fam17h+ and prevent the stack trace. This will not affect older systems +since this bank is reserved on them, i.e. it'll be ignored. + +Tested on AMD Fam15h and Fam17h systems. + + WARNING: CPU: 26 PID: 1 at lib/kobject.c:210 kobject_add_internal + kobject: (ffff88085bb256c0): attempted to be registered with empty name! + ... + Call Trace: + kobject_add_internal + kobject_add + kobject_create_and_add + threshold_create_device + threshold_init_device + +Signed-off-by: Yazen Ghannam +Signed-off-by: Borislav Petkov +Link: http://lkml.kernel.org/r/1490102285-3659-1-git-send-email-Yazen.Ghannam@amd.com +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c ++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c +@@ -53,7 +53,7 @@ static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", +- "", ++ "decode_unit", + "northbridge", + "execution_unit", + }; diff --git a/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch b/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch new file mode 100644 index 00000000000..a2ba054eb7e --- /dev/null +++ b/queue-4.4/x86-pmem-fix-broken-__copy_user_nocache-cache-bypass-assumptions.patch @@ -0,0 +1,109 @@ +From 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Thu, 6 Apr 2017 09:04:31 -0700 +Subject: x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions + +From: Dan Williams + +commit 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd upstream. + +Before we rework the "pmem api" to stop abusing __copy_user_nocache() +for memcpy_to_pmem() we need to fix cases where we may strand dirty data +in the cpu cache. The problem occurs when copy_from_iter_pmem() is used +for arbitrary data transfers from userspace. There is no guarantee that +these transfers, performed by dax_iomap_actor(), will have aligned +destinations or aligned transfer lengths. Backstop the usage +__copy_user_nocache() with explicit cache management in these unaligned +cases. + +Yes, copy_from_iter_pmem() is now too big for an inline, but addressing +that is saved for a later patch that moves the entirety of the "pmem +api" into the pmem driver directly. + +Fixes: 5de490daec8b ("pmem: add copy_from_iter_pmem() and clear_pmem()") +Cc: +Cc: Jan Kara +Cc: Jeff Moyer +Cc: Ingo Molnar +Cc: Christoph Hellwig +Cc: "H. Peter Anvin" +Cc: Al Viro +Cc: Thomas Gleixner +Cc: Matthew Wilcox +Reviewed-by: Ross Zwisler +Signed-off-by: Toshi Kani +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/pmem.h | 45 +++++++++++++++++++++++++++++++------------- + 1 file changed, 32 insertions(+), 13 deletions(-) + +--- a/arch/x86/include/asm/pmem.h ++++ b/arch/x86/include/asm/pmem.h +@@ -72,8 +72,8 @@ static inline void arch_wmb_pmem(void) + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) +- * instruction. This function requires explicit ordering with an +- * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. ++ * instruction. Note that @size is internally rounded up to be cache ++ * line size aligned. + */ + static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) + { +@@ -87,15 +87,6 @@ static inline void __arch_wb_cache_pmem( + clwb(p); + } + +-/* +- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec +- * iterators, so for other types (bvec & kvec) we must do a cache write-back. +- */ +-static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +-{ +- return iter_is_iovec(i) == false; +-} +- + /** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address +@@ -114,8 +105,36 @@ static inline size_t arch_copy_from_iter + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + +- if (__iter_needs_pmem_wb(i)) +- __arch_wb_cache_pmem(vaddr, bytes); ++ /* ++ * In the iovec case on x86_64 copy_from_iter_nocache() uses ++ * non-temporal stores for the bulk of the transfer, but we need ++ * to manually flush if the transfer is unaligned. A cached ++ * memory copy is used when destination or size is not naturally ++ * aligned. That is: ++ * - Require 8-byte alignment when size is 8 bytes or larger. ++ * - Require 4-byte alignment when size is 4 bytes. ++ * ++ * In the non-iovec case the entire destination needs to be ++ * flushed. ++ */ ++ if (iter_is_iovec(i)) { ++ unsigned long flushed, dest = (unsigned long) addr; ++ ++ if (bytes < 8) { ++ if (!IS_ALIGNED(dest, 4) || (bytes != 4)) ++ __arch_wb_cache_pmem(addr, 1); ++ } else { ++ if (!IS_ALIGNED(dest, 8)) { ++ dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); ++ __arch_wb_cache_pmem(addr, 1); ++ } ++ ++ flushed = dest - (unsigned long) addr; ++ if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) ++ __arch_wb_cache_pmem(addr + bytes - 1, 1); ++ } ++ } else ++ __arch_wb_cache_pmem(addr, bytes); + + return len; + }