From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 26 Jul 2019 13:58:18 +0000 (+0200)
Subject: 5.1-stable patches
X-Git-Tag: v5.2.4~6
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=440b48b84b9c8ae81f901a23a19bf91944523730;p=thirdparty%2Fkernel%2Fstable-queue.git

5.1-stable patches

added patches:
	block-limit-zone-array-allocation-size.patch
	kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
	kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
	mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
	revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
	sd_zbc-fix-report-zones-buffer-allocation.patch
---

diff --git a/queue-5.1/block-limit-zone-array-allocation-size.patch b/queue-5.1/block-limit-zone-array-allocation-size.patch
new file mode 100644
index 00000000000..280d1ac24ac
--- /dev/null
+++ b/queue-5.1/block-limit-zone-array-allocation-size.patch
@@ -0,0 +1,136 @@
+From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:18 +0900
+Subject: block: Limit zone array allocation size
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit 26202928fafad8bda8b478edb7e62c885be623d7 upstream.
+
+Limit the size of the struct blk_zone array used in
+blk_revalidate_disk_zones() to avoid memory allocation failures leading
+to disk revalidation failure. Also further reduce the likelyhood of
+such failures by using kvcalloc() (that is vmalloc()) instead of
+allocating contiguous pages with alloc_pages().
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-zoned.c      |   46 ++++++++++++++++++++++++++++++----------------
+ include/linux/blkdev.h |    5 +++++
+ 2 files changed, 35 insertions(+), 16 deletions(-)
+
+--- a/block/blk-zoned.c
++++ b/block/blk-zoned.c
+@@ -13,6 +13,9 @@
+ #include <linux/rbtree.h>
+ #include <linux/blkdev.h>
+ #include <linux/blk-mq.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+ 
+ #include "blk.h"
+ 
+@@ -372,22 +375,25 @@ static inline unsigned long *blk_alloc_z
+  * Allocate an array of struct blk_zone to get nr_zones zone information.
+  * The allocated array may be smaller than nr_zones.
+  */
+-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
++static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
+ {
+-	size_t size = *nr_zones * sizeof(struct blk_zone);
+-	struct page *page;
+-	int order;
+-
+-	for (order = get_order(size); order >= 0; order--) {
+-		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
+-		if (page) {
+-			*nr_zones = min_t(unsigned int, *nr_zones,
+-				(PAGE_SIZE << order) / sizeof(struct blk_zone));
+-			return page_address(page);
+-		}
++	struct blk_zone *zones;
++	size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
++
++	/*
++	 * GFP_KERNEL here is meaningless as the caller task context has
++	 * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
++	 * with memalloc_noio_save().
++	 */
++	zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
++	if (!zones) {
++		*nr_zones = 0;
++		return NULL;
+ 	}
+ 
+-	return NULL;
++	*nr_zones = nrz;
++
++	return zones;
+ }
+ 
+ void blk_queue_free_zone_bitmaps(struct request_queue *q)
+@@ -414,6 +420,7 @@ int blk_revalidate_disk_zones(struct gen
+ 	unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+ 	unsigned int i, rep_nr_zones = 0, z = 0, nrz;
+ 	struct blk_zone *zones = NULL;
++	unsigned int noio_flag;
+ 	sector_t sector = 0;
+ 	int ret = 0;
+ 
+@@ -426,6 +433,12 @@ int blk_revalidate_disk_zones(struct gen
+ 		return 0;
+ 	}
+ 
++	/*
++	 * Ensure that all memory allocations in this context are done as
++	 * if GFP_NOIO was specified.
++	 */
++	noio_flag = memalloc_noio_save();
++
+ 	if (!blk_queue_is_zoned(q) || !nr_zones) {
+ 		nr_zones = 0;
+ 		goto update;
+@@ -442,7 +455,7 @@ int blk_revalidate_disk_zones(struct gen
+ 
+ 	/* Get zone information and initialize seq_zones_bitmap */
+ 	rep_nr_zones = nr_zones;
+-	zones = blk_alloc_zones(q->node, &rep_nr_zones);
++	zones = blk_alloc_zones(&rep_nr_zones);
+ 	if (!zones)
+ 		goto out;
+ 
+@@ -479,8 +492,9 @@ update:
+ 	blk_mq_unfreeze_queue(q);
+ 
+ out:
+-	free_pages((unsigned long)zones,
+-		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
++	memalloc_noio_restore(noio_flag);
++
++	kvfree(zones);
+ 	kfree(seq_zones_wlock);
+ 	kfree(seq_zones_bitmap);
+ 
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -344,6 +344,11 @@ struct queue_limits {
+ 
+ #ifdef CONFIG_BLK_DEV_ZONED
+ 
++/*
++ * Maximum number of zones to report with a single report zones command.
++ */
++#define BLK_ZONED_REPORT_MAX_ZONES	8192U
++
+ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
+ extern int blkdev_report_zones(struct block_device *bdev,
+ 			       sector_t sector, struct blk_zone *zones,
diff --git a/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch b/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
new file mode 100644
index 00000000000..9eaddb5df79
--- /dev/null
+++ b/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
@@ -0,0 +1,37 @@
+From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001
+From: Jan Kiszka <jan.kiszka@siemens.com>
+Date: Sun, 21 Jul 2019 13:52:18 +0200
+Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested
+
+From: Jan Kiszka <jan.kiszka@siemens.com>
+
+commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream.
+
+Letting this pend may cause nested_get_vmcs12_pages to run against an
+invalid state, corrupting the effective vmcs of L1.
+
+This was triggerable in QEMU after a guest corruption in L2, followed by
+a L1 reset.
+
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Reviewed-by: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -212,6 +212,8 @@ static void free_nested(struct kvm_vcpu
+ 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ 		return;
+ 
++	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
++
+ 	vmx->nested.vmxon = false;
+ 	vmx->nested.smm.vmxon = false;
+ 	free_vpid(vmx->nested.vpid02);
diff --git a/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch b/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
new file mode 100644
index 00000000000..cb802dd8178
--- /dev/null
+++ b/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
@@ -0,0 +1,65 @@
+From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 19 Jul 2019 18:41:10 +0200
+Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream.
+
+If a KVM guest is reset while running a nested guest, free_nested will
+disable the shadow VMCS execution control in the vmcs01.  However,
+on the next KVM_RUN vmx_vcpu_run would nevertheless try to sync
+the VMCS12 to the shadow VMCS which has since been freed.
+
+This causes a vmptrld of a NULL pointer on my machime, but Jan reports
+the host to hang altogether.  Let's see how much this trivial patch fixes.
+
+Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
+Cc: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -184,6 +184,7 @@ static void vmx_disable_shadow_vmcs(stru
+ {
+ 	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
++	vmx->nested.need_vmcs12_sync = false;
+ }
+ 
+ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+@@ -1328,6 +1329,9 @@ static void copy_shadow_to_vmcs12(struct
+ 	u64 field_value;
+ 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++	if (WARN_ON(!shadow_vmcs))
++		return;
++
+ 	preempt_disable();
+ 
+ 	vmcs_load(shadow_vmcs);
+@@ -1366,6 +1370,9 @@ static void copy_vmcs12_to_shadow(struct
+ 	u64 field_value = 0;
+ 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++	if (WARN_ON(!shadow_vmcs))
++		return;
++
+ 	vmcs_load(shadow_vmcs);
+ 
+ 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
+@@ -4336,7 +4343,6 @@ static inline void nested_release_vmcs12
+ 		/* copy to memory all shadowed fields in case
+ 		   they were modified */
+ 		copy_shadow_to_vmcs12(vmx);
+-		vmx->nested.need_vmcs12_sync = false;
+ 		vmx_disable_shadow_vmcs(vmx);
+ 	}
+ 	vmx->nested.posted_intr_nv = -1;
diff --git a/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch b/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
new file mode 100644
index 00000000000..207dbb72e08
--- /dev/null
+++ b/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
@@ -0,0 +1,241 @@
+From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+Date: Thu, 11 Jul 2019 20:52:04 -0700
+Subject: mm: vmscan: scan anonymous pages on file refaults
+
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+
+commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream.
+
+When file refaults are detected and there are many inactive file pages,
+the system never reclaim anonymous pages, the file pages are dropped
+aggressively when there are still a lot of cold anonymous pages and
+system thrashes.  This issue impacts the performance of applications
+with large executable, e.g.  chrome.
+
+With this patch, when file refault is detected, inactive_list_is_low()
+always returns true for file pages in get_scan_count() to enable
+scanning anonymous pages.
+
+The problem can be reproduced by the following test program.
+
+---8<---
+void fallocate_file(const char *filename, off_t size)
+{
+	struct stat st;
+	int fd;
+
+	if (!stat(filename, &st) && st.st_size >= size)
+		return;
+
+	fd = open(filename, O_WRONLY | O_CREAT, 0600);
+	if (fd < 0) {
+		perror("create file");
+		exit(1);
+	}
+	if (posix_fallocate(fd, 0, size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	close(fd);
+}
+
+long *alloc_anon(long size)
+{
+	long *start = malloc(size);
+	memset(start, 1, size);
+	return start;
+}
+
+long access_file(const char *filename, long size, long rounds)
+{
+	int fd, i;
+	volatile char *start1, *end1, *start2;
+	const int page_size = getpagesize();
+	long sum = 0;
+
+	fd = open(filename, O_RDONLY);
+	if (fd == -1) {
+		perror("open");
+		exit(1);
+	}
+
+	/*
+	 * Some applications, e.g. chrome, use a lot of executable file
+	 * pages, map some of the pages with PROT_EXEC flag to simulate
+	 * the behavior.
+	 */
+	start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
+		      fd, 0);
+	if (start1 == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	end1 = start1 + size / 2;
+
+	start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
+	if (start2 == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	for (i = 0; i < rounds; ++i) {
+		struct timeval before, after;
+		volatile char *ptr1 = start1, *ptr2 = start2;
+		gettimeofday(&before, NULL);
+		for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
+			sum += *ptr1 + *ptr2;
+		gettimeofday(&after, NULL);
+		printf("File access time, round %d: %f (sec)
+", i,
+		       (after.tv_sec - before.tv_sec) +
+		       (after.tv_usec - before.tv_usec) / 1000000.0);
+	}
+	return sum;
+}
+
+int main(int argc, char *argv[])
+{
+	const long MB = 1024 * 1024;
+	long anon_mb, file_mb, file_rounds;
+	const char filename[] = "large";
+	long *ret1;
+	long ret2;
+
+	if (argc != 4) {
+		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS
+");
+		exit(0);
+	}
+	anon_mb = atoi(argv[1]);
+	file_mb = atoi(argv[2]);
+	file_rounds = atoi(argv[3]);
+
+	fallocate_file(filename, file_mb * MB);
+	printf("Allocate %ld MB anonymous pages
+", anon_mb);
+	ret1 = alloc_anon(anon_mb * MB);
+	printf("Access %ld MB file pages
+", file_mb);
+	ret2 = access_file(filename, file_mb * MB, file_rounds);
+	printf("Print result to prevent optimization: %ld
+",
+	       *ret1 + ret2);
+	return 0;
+}
+---8<---
+
+Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the program
+fills ram with 2048 MB memory, access a 200 MB file for 10 times.  Without
+this patch, the file cache is dropped aggresively and every access to the
+file is from disk.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.489316 (sec)
+  File access time, round 1: 2.581277 (sec)
+  File access time, round 2: 2.487624 (sec)
+  File access time, round 3: 2.449100 (sec)
+  File access time, round 4: 2.420423 (sec)
+  File access time, round 5: 2.343411 (sec)
+  File access time, round 6: 2.454833 (sec)
+  File access time, round 7: 2.483398 (sec)
+  File access time, round 8: 2.572701 (sec)
+  File access time, round 9: 2.493014 (sec)
+
+With this patch, these file pages can be cached.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.475189 (sec)
+  File access time, round 1: 2.440777 (sec)
+  File access time, round 2: 2.411671 (sec)
+  File access time, round 3: 1.955267 (sec)
+  File access time, round 4: 0.029924 (sec)
+  File access time, round 5: 0.000808 (sec)
+  File access time, round 6: 0.000771 (sec)
+  File access time, round 7: 0.000746 (sec)
+  File access time, round 8: 0.000738 (sec)
+  File access time, round 9: 0.000747 (sec)
+
+Checked the swap out stats during the test [1], 19006 pages swapped out
+with this patch, 3418 pages swapped out without this patch. There are
+more swap out, but I think it's within reasonable range when file backed
+data set doesn't fit into the memory.
+
+$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS
+PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644,
+inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
+pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages
+Access 2100 MB regular file pages File access time, round 0: 12.165,
+(sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896,
+inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec)
+active_anon: 1430576, inactive_anon: 477144, active_file: 25440,
+inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec)
+active_anon: 1427436, inactive_anon: 476060, active_file: 21112,
+inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec)
+active_anon: 1420444, inactive_anon: 473632, active_file: 23216,
+inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec)
+active_anon: 1413964, inactive_anon: 471460, active_file: 31728,
+inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366
+(+ 11309120)
+
+With 4 processes accessing non-overlapping parts of a large file, 30316
+pages swapped out with this patch, 5152 pages swapped out without this
+patch.  The swapout number is small comparing to pgpgin.
+
+[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
+
+Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
+Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
+Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sonny Rao <sonnyrao@chromium.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: <stable@vger.kernel.org>	[4.12+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[backported to 4.14.y, 4.19.y, 5.1.y: adjust context]
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2176,7 +2176,7 @@ static void shrink_active_list(unsigned
+  *   10TB     320        32GB
+  */
+ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+-				 struct scan_control *sc, bool actual_reclaim)
++				 struct scan_control *sc, bool trace)
+ {
+ 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+@@ -2202,7 +2202,7 @@ static bool inactive_list_is_low(struct
+ 	 * rid of the stale workingset quickly.
+ 	 */
+ 	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+-	if (file && actual_reclaim && lruvec->refaults != refaults) {
++	if (file && lruvec->refaults != refaults) {
+ 		inactive_ratio = 0;
+ 	} else {
+ 		gb = (inactive + active) >> (30 - PAGE_SHIFT);
+@@ -2212,7 +2212,7 @@ static bool inactive_list_is_low(struct
+ 			inactive_ratio = 1;
+ 	}
+ 
+-	if (actual_reclaim)
++	if (trace)
+ 		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ 			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ 			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
diff --git a/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch b/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
new file mode 100644
index 00000000000..797cf9dcc14
--- /dev/null
+++ b/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
@@ -0,0 +1,66 @@
+From ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Mon, 22 Jul 2019 13:31:27 +0200
+Subject: Revert "kvm: x86: Use task structs fpu field for user"
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe upstream.
+
+This reverts commit 240c35a3783ab9b3a0afaba0dde7291295680a6b
+("kvm: x86: Use task structs fpu field for user", 2018-11-06).
+The commit is broken and causes QEMU's FPU state to be destroyed
+when KVM_RUN is preempted.
+
+Fixes: 240c35a3783a ("kvm: x86: Use task structs fpu field for user")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h |    7 ++++---
+ arch/x86/kvm/x86.c              |    4 ++--
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -609,15 +609,16 @@ struct kvm_vcpu_arch {
+ 
+ 	/*
+ 	 * QEMU userspace and the guest each have their own FPU state.
+-	 * In vcpu_run, we switch between the user, maintained in the
+-	 * task_struct struct, and guest FPU contexts. While running a VCPU,
+-	 * the VCPU thread will have the guest FPU context.
++	 * In vcpu_run, we switch between the user and guest FPU contexts.
++	 * While running a VCPU, the VCPU thread will have the guest FPU
++	 * context.
+ 	 *
+ 	 * Note that while the PKRU state lives inside the fpu registers,
+ 	 * it is switched out separately at VMENTER and VMEXIT time. The
+ 	 * "guest_fpu" state here contains the guest FPU context, with the
+ 	 * host PRKU bits.
+ 	 */
++	struct fpu user_fpu;
+ 	struct fpu *guest_fpu;
+ 
+ 	u64 xcr0;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8172,7 +8172,7 @@ static int complete_emulated_mmio(struct
+ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+ {
+ 	preempt_disable();
+-	copy_fpregs_to_fpstate(&current->thread.fpu);
++	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ 	/* PKRU is separately restored in kvm_x86_ops->run.  */
+ 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ 				~XFEATURE_MASK_PKRU);
+@@ -8185,7 +8185,7 @@ static void kvm_put_guest_fpu(struct kvm
+ {
+ 	preempt_disable();
+ 	copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+-	copy_kernel_to_fpregs(&current->thread.fpu.state);
++	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ 	preempt_enable();
+ 	++vcpu->stat.fpu_reload;
+ 	trace_kvm_fpu(0);
diff --git a/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch b/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch
new file mode 100644
index 00000000000..f7848d7367e
--- /dev/null
+++ b/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch
@@ -0,0 +1,256 @@
+From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:17 +0900
+Subject: sd_zbc: Fix report zones buffer allocation
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit b091ac616846a1da75b1f2566b41255ce7f0e0a6 upstream.
+
+During disk scan and revalidation done with sd_revalidate(), the zones
+of a zoned disk are checked using the helper function
+blk_revalidate_disk_zones() if a configuration change is detected
+(change in the number of zones or zone size). The function
+blk_revalidate_disk_zones() issues report_zones calls that are very
+large, that is, to obtain zone information for all zones of the disk
+with a single command. The size of the report zones command buffer
+necessary for such large request generally is lower than the disk
+max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
+memory fragmentation), but often fail at run time (e.g. hot-plug
+event). This causes the disk revalidation to fail and the disk
+capacity to be changed to 0.
+
+This problem can be avoided by using vmalloc() instead of kmalloc() for
+the buffer allocation. To limit the amount of memory to be allocated,
+this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
+maximum number of zones to report with a single report zones command.
+This limit may be lowered further to satisfy the disk max_hw_sectors
+limit. Finally, to ensure that the vmalloc-ed buffer can always be
+mapped in a request, the buffer size is further limited to at most
+queue_max_segments() pages, allowing successful mapping of the buffer
+even in the worst case scenario where none of the buffer pages are
+contiguous.
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ drivers/scsi/sd_zbc.c |  104 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 75 insertions(+), 29 deletions(-)
+
+--- a/drivers/scsi/sd_zbc.c
++++ b/drivers/scsi/sd_zbc.c
+@@ -23,6 +23,8 @@
+  */
+ 
+ #include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+ 
+ #include <asm/unaligned.h>
+ 
+@@ -64,7 +66,7 @@ static void sd_zbc_parse_report(struct s
+ /**
+  * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
+  * @sdkp: The target disk
+- * @buf: Buffer to use for the reply
++ * @buf: vmalloc-ed buffer to use for the reply
+  * @buflen: the buffer size
+  * @lba: Start LBA of the report
+  * @partial: Do partial report
+@@ -93,7 +95,6 @@ static int sd_zbc_do_report_zones(struct
+ 	put_unaligned_be32(buflen, &cmd[10]);
+ 	if (partial)
+ 		cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
+-	memset(buf, 0, buflen);
+ 
+ 	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
+ 				  buf, buflen, &sshdr,
+@@ -117,6 +118,53 @@ static int sd_zbc_do_report_zones(struct
+ 	return 0;
+ }
+ 
++/*
++ * Maximum number of zones to get with one report zones command.
++ */
++#define SD_ZBC_REPORT_MAX_ZONES		8192U
++
++/**
++ * Allocate a buffer for report zones reply.
++ * @sdkp: The target disk
++ * @nr_zones: Maximum number of zones to report
++ * @buflen: Size of the buffer allocated
++ *
++ * Try to allocate a reply buffer for the number of requested zones.
++ * The size of the buffer allocated may be smaller than requested to
++ * satify the device constraint (max_hw_sectors, max_segments, etc).
++ *
++ * Return the address of the allocated buffer and update @buflen with
++ * the size of the allocated buffer.
++ */
++static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
++					unsigned int nr_zones, size_t *buflen)
++{
++	struct request_queue *q = sdkp->disk->queue;
++	size_t bufsize;
++	void *buf;
++
++	/*
++	 * Report zone buffer size should be at most 64B times the number of
++	 * zones requested plus the 64B reply header, but should be at least
++	 * SECTOR_SIZE for ATA devices.
++	 * Make sure that this size does not exceed the hardware capabilities.
++	 * Furthermore, since the report zone command cannot be split, make
++	 * sure that the allocated buffer can always be mapped by limiting the
++	 * number of pages allocated to the HBA max segments limit.
++	 */
++	nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
++	bufsize = roundup((nr_zones + 1) * 64, 512);
++	bufsize = min_t(size_t, bufsize,
++			queue_max_hw_sectors(q) << SECTOR_SHIFT);
++	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
++
++	buf = vzalloc(bufsize);
++	if (buf)
++		*buflen = bufsize;
++
++	return buf;
++}
++
+ /**
+  * sd_zbc_report_zones - Disk report zones operation.
+  * @disk: The target disk
+@@ -132,30 +180,23 @@ int sd_zbc_report_zones(struct gendisk *
+ 			gfp_t gfp_mask)
+ {
+ 	struct scsi_disk *sdkp = scsi_disk(disk);
+-	unsigned int i, buflen, nrz = *nr_zones;
++	unsigned int i, nrz = *nr_zones;
+ 	unsigned char *buf;
+-	size_t offset = 0;
++	size_t buflen = 0, offset = 0;
+ 	int ret = 0;
+ 
+ 	if (!sd_is_zoned(sdkp))
+ 		/* Not a zoned device */
+ 		return -EOPNOTSUPP;
+ 
+-	/*
+-	 * Get a reply buffer for the number of requested zones plus a header,
+-	 * without exceeding the device maximum command size. For ATA disks,
+-	 * buffers must be aligned to 512B.
+-	 */
+-	buflen = min(queue_max_hw_sectors(disk->queue) << 9,
+-		     roundup((nrz + 1) * 64, 512));
+-	buf = kmalloc(buflen, gfp_mask);
++	buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
+ 	if (!buf)
+ 		return -ENOMEM;
+ 
+ 	ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
+ 			sectors_to_logical(sdkp->device, sector), true);
+ 	if (ret)
+-		goto out_free_buf;
++		goto out;
+ 
+ 	nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
+ 	for (i = 0; i < nrz; i++) {
+@@ -166,8 +207,8 @@ int sd_zbc_report_zones(struct gendisk *
+ 
+ 	*nr_zones = nrz;
+ 
+-out_free_buf:
+-	kfree(buf);
++out:
++	kvfree(buf);
+ 
+ 	return ret;
+ }
+@@ -301,8 +342,6 @@ static int sd_zbc_check_zoned_characteri
+ 	return 0;
+ }
+ 
+-#define SD_ZBC_BUF_SIZE 131072U
+-
+ /**
+  * sd_zbc_check_zones - Check the device capacity and zone sizes
+  * @sdkp: Target disk
+@@ -318,22 +357,28 @@ static int sd_zbc_check_zoned_characteri
+  */
+ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
+ {
++	size_t bufsize, buflen;
++	unsigned int noio_flag;
+ 	u64 zone_blocks = 0;
+ 	sector_t max_lba, block = 0;
+ 	unsigned char *buf;
+ 	unsigned char *rec;
+-	unsigned int buf_len;
+-	unsigned int list_length;
+ 	int ret;
+ 	u8 same;
+ 
++	/* Do all memory allocations as if GFP_NOIO was specified */
++	noio_flag = memalloc_noio_save();
++
+ 	/* Get a buffer */
+-	buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+-	if (!buf)
+-		return -ENOMEM;
++	buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
++					 &bufsize);
++	if (!buf) {
++		ret = -ENOMEM;
++		goto out;
++	}
+ 
+ 	/* Do a report zone to get max_lba and the same field */
+-	ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
++	ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
+ 	if (ret)
+ 		goto out_free;
+ 
+@@ -369,12 +414,12 @@ static int sd_zbc_check_zones(struct scs
+ 	do {
+ 
+ 		/* Parse REPORT ZONES header */
+-		list_length = get_unaligned_be32(&buf[0]) + 64;
++		buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
++			       bufsize);
+ 		rec = buf + 64;
+-		buf_len = min(list_length, SD_ZBC_BUF_SIZE);
+ 
+ 		/* Parse zone descriptors */
+-		while (rec < buf + buf_len) {
++		while (rec < buf + buflen) {
+ 			u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
+ 
+ 			if (zone_blocks == 0) {
+@@ -390,8 +435,8 @@ static int sd_zbc_check_zones(struct scs
+ 		}
+ 
+ 		if (block < sdkp->capacity) {
+-			ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+-						     block, true);
++			ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
++						     true);
+ 			if (ret)
+ 				goto out_free;
+ 		}
+@@ -422,7 +467,8 @@ out:
+ 	}
+ 
+ out_free:
+-	kfree(buf);
++	memalloc_noio_restore(noio_flag);
++	kvfree(buf);
+ 
+ 	return ret;
+ }
diff --git a/queue-5.1/series b/queue-5.1/series
index c119dd336c8..bb3620f15e0 100644
--- a/queue-5.1/series
+++ b/queue-5.1/series
@@ -54,3 +54,9 @@ mm-add-filemap_fdatawait_range_keep_errors.patch
 jbd2-introduce-jbd2_inode-dirty-range-scoping.patch
 ext4-use-jbd2_inode-dirty-range-scoping.patch
 ext4-allow-directory-holes.patch
+kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
+kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
+revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
+sd_zbc-fix-report-zones-buffer-allocation.patch
+block-limit-zone-array-allocation-size.patch
+mm-vmscan-scan-anonymous-pages-on-file-refaults.patch