--- /dev/null
+From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:18 +0900
+Subject: block: Limit zone array allocation size
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit 26202928fafad8bda8b478edb7e62c885be623d7 upstream.
+
+Limit the size of the struct blk_zone array used in
+blk_revalidate_disk_zones() to avoid memory allocation failures leading
+to disk revalidation failure. Also further reduce the likelihood of
+such failures by using kvcalloc() (that is, a vmalloc()-backed
+allocation) instead of allocating contiguous pages with alloc_pages().
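+
+For scale, assuming the 64-byte struct blk_zone and a hypothetical drive
+exposing ~55000 zones, the previous alloc_pages() path would first attempt
+an order-10 (4 MiB) contiguous allocation, whereas capping the array at
+BLK_ZONED_REPORT_MAX_ZONES (8192) bounds each kvcalloc() allocation to
+512 KiB, which can also be satisfied from vmalloc space. A userspace
+sketch of that arithmetic (illustrative values only):
+
+	#include <stdio.h>
+
+	#define BLK_ZONE_DESC_SIZE	64U	/* assumed sizeof(struct blk_zone) */
+	#define REPORT_MAX_ZONES	8192U	/* mirrors BLK_ZONED_REPORT_MAX_ZONES */
+
+	int main(void)
+	{
+		unsigned int disk_zones = 55000;	/* hypothetical drive */
+		unsigned int nrz = disk_zones < REPORT_MAX_ZONES ?
+				   disk_zones : REPORT_MAX_ZONES;
+
+		printf("uncapped zone array: %u KiB\n",
+		       disk_zones * BLK_ZONE_DESC_SIZE / 1024);
+		printf("capped zone array:   %u KiB per report\n",
+		       nrz * BLK_ZONE_DESC_SIZE / 1024);
+		return 0;
+	}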
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-zoned.c | 46 ++++++++++++++++++++++++++++++----------------
+ include/linux/blkdev.h | 5 +++++
+ 2 files changed, 35 insertions(+), 16 deletions(-)
+
+--- a/block/blk-zoned.c
++++ b/block/blk-zoned.c
+@@ -13,6 +13,9 @@
+ #include <linux/rbtree.h>
+ #include <linux/blkdev.h>
+ #include <linux/blk-mq.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+
+ #include "blk.h"
+
+@@ -372,22 +375,25 @@ static inline unsigned long *blk_alloc_z
+ * Allocate an array of struct blk_zone to get nr_zones zone information.
+ * The allocated array may be smaller than nr_zones.
+ */
+-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
++static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
+ {
+- size_t size = *nr_zones * sizeof(struct blk_zone);
+- struct page *page;
+- int order;
+-
+- for (order = get_order(size); order >= 0; order--) {
+- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
+- if (page) {
+- *nr_zones = min_t(unsigned int, *nr_zones,
+- (PAGE_SIZE << order) / sizeof(struct blk_zone));
+- return page_address(page);
+- }
++ struct blk_zone *zones;
++ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
++
++ /*
++ * GFP_KERNEL here is meaningless as the caller task context has
++ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
++ * with memalloc_noio_save().
++ */
++ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
++ if (!zones) {
++ *nr_zones = 0;
++ return NULL;
+ }
+
+- return NULL;
++ *nr_zones = nrz;
++
++ return zones;
+ }
+
+ void blk_queue_free_zone_bitmaps(struct request_queue *q)
+@@ -414,6 +420,7 @@ int blk_revalidate_disk_zones(struct gen
+ unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+ unsigned int i, rep_nr_zones = 0, z = 0, nrz;
+ struct blk_zone *zones = NULL;
++ unsigned int noio_flag;
+ sector_t sector = 0;
+ int ret = 0;
+
+@@ -426,6 +433,12 @@ int blk_revalidate_disk_zones(struct gen
+ return 0;
+ }
+
++ /*
++ * Ensure that all memory allocations in this context are done as
++ * if GFP_NOIO was specified.
++ */
++ noio_flag = memalloc_noio_save();
++
+ if (!blk_queue_is_zoned(q) || !nr_zones) {
+ nr_zones = 0;
+ goto update;
+@@ -442,7 +455,7 @@ int blk_revalidate_disk_zones(struct gen
+
+ /* Get zone information and initialize seq_zones_bitmap */
+ rep_nr_zones = nr_zones;
+- zones = blk_alloc_zones(q->node, &rep_nr_zones);
++ zones = blk_alloc_zones(&rep_nr_zones);
+ if (!zones)
+ goto out;
+
+@@ -479,8 +492,9 @@ update:
+ blk_mq_unfreeze_queue(q);
+
+ out:
+- free_pages((unsigned long)zones,
+- get_order(rep_nr_zones * sizeof(struct blk_zone)));
++ memalloc_noio_restore(noio_flag);
++
++ kvfree(zones);
+ kfree(seq_zones_wlock);
+ kfree(seq_zones_bitmap);
+
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -344,6 +344,11 @@ struct queue_limits {
+
+ #ifdef CONFIG_BLK_DEV_ZONED
+
++/*
++ * Maximum number of zones to report with a single report zones command.
++ */
++#define BLK_ZONED_REPORT_MAX_ZONES 8192U
++
+ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
+ extern int blkdev_report_zones(struct block_device *bdev,
+ sector_t sector, struct blk_zone *zones,
--- /dev/null
+From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001
+From: Jan Kiszka <jan.kiszka@siemens.com>
+Date: Sun, 21 Jul 2019 13:52:18 +0200
+Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested
+
+From: Jan Kiszka <jan.kiszka@siemens.com>
+
+commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream.
+
+Letting this pend may cause nested_get_vmcs12_pages to run against an
+invalid state, corrupting the effective vmcs of L1.
+
+This was triggerable in QEMU after a guest corruption in L2, followed by
+an L1 reset.
+
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Reviewed-by: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -212,6 +212,8 @@ static void free_nested(struct kvm_vcpu
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
++ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
++
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
--- /dev/null
+From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 19 Jul 2019 18:41:10 +0200
+Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream.
+
+If a KVM guest is reset while running a nested guest, free_nested will
+disable the shadow VMCS execution control in the vmcs01. However,
+on the next KVM_RUN, vmx_vcpu_run would nevertheless try to sync
+the VMCS12 to the shadow VMCS, which has since been freed.
+
+This causes a vmptrld of a NULL pointer on my machine, but Jan reports
+that the host hangs altogether. Let's see how much this trivial patch fixes.
+
+Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
+Cc: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -184,6 +184,7 @@ static void vmx_disable_shadow_vmcs(stru
+ {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
++ vmx->nested.need_vmcs12_sync = false;
+ }
+
+ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+@@ -1328,6 +1329,9 @@ static void copy_shadow_to_vmcs12(struct
+ u64 field_value;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
++ if (WARN_ON(!shadow_vmcs))
++ return;
++
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+@@ -1366,6 +1370,9 @@ static void copy_vmcs12_to_shadow(struct
+ u64 field_value = 0;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
++ if (WARN_ON(!shadow_vmcs))
++ return;
++
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+@@ -4336,7 +4343,6 @@ static inline void nested_release_vmcs12
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+- vmx->nested.need_vmcs12_sync = false;
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
--- /dev/null
+From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+Date: Thu, 11 Jul 2019 20:52:04 -0700
+Subject: mm: vmscan: scan anonymous pages on file refaults
+
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+
+commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream.
+
+When file refaults are detected and there are many inactive file pages,
+the system never reclaims anonymous pages; the file pages are dropped
+aggressively even when there are still a lot of cold anonymous pages,
+and the system thrashes. This issue impacts the performance of
+applications with large executables, e.g. chrome.
+
+With this patch, when file refault is detected, inactive_list_is_low()
+always returns true for file pages in get_scan_count() to enable
+scanning anonymous pages.
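+
+The effect on the file-LRU check of inactive_list_is_low() can be modeled
+with a small userspace sketch (hypothetical numbers; the in-kernel
+heuristic also handles memcg accounting and tracing, omitted here):
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	static unsigned long int_sqrt_ul(unsigned long x)
+	{
+		unsigned long r = 0;
+
+		while ((r + 1) * (r + 1) <= x)
+			r++;
+		return r;
+	}
+
+	static bool file_inactive_is_low(unsigned long inactive, unsigned long active,
+					 unsigned long refaults_now,
+					 unsigned long refaults_snapshot)
+	{
+		unsigned long gb, ratio;
+
+		/* Refaults since the last snapshot: treat the inactive file
+		 * list as low so get_scan_count() also scans anonymous pages. */
+		if (refaults_now != refaults_snapshot)
+			return true;
+
+		/* Otherwise keep the usual size-based heuristic. */
+		gb = (inactive + active) >> (30 - 12);	/* 4 KiB pages -> GiB */
+		ratio = gb ? int_sqrt_ul(10 * gb) : 1;
+		return inactive * ratio < active;
+	}
+
+	int main(void)
+	{
+		/* ~1 GiB of file pages, mostly inactive */
+		printf("refault case: %d\n",
+		       file_inactive_is_low(200000, 62144, 1200, 1000));
+		printf("no refault:   %d\n",
+		       file_inactive_is_low(200000, 62144, 1000, 1000));
+		return 0;
+	}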
+
+The problem can be reproduced by the following test program.
+
+---8<---
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+void fallocate_file(const char *filename, off_t size)
+{
+ struct stat st;
+ int fd;
+
+ if (!stat(filename, &st) && st.st_size >= size)
+ return;
+
+ fd = open(filename, O_WRONLY | O_CREAT, 0600);
+ if (fd < 0) {
+ perror("create file");
+ exit(1);
+ }
+ if (posix_fallocate(fd, 0, size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ close(fd);
+}
+
+long *alloc_anon(long size)
+{
+ long *start = malloc(size);
+ memset(start, 1, size);
+ return start;
+}
+
+long access_file(const char *filename, long size, long rounds)
+{
+ int fd, i;
+ volatile char *start1, *end1, *start2;
+ const int page_size = getpagesize();
+ long sum = 0;
+
+ fd = open(filename, O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+
+ /*
+ * Some applications, e.g. chrome, use a lot of executable file
+ * pages, map some of the pages with PROT_EXEC flag to simulate
+ * the behavior.
+ */
+ start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
+ fd, 0);
+ if (start1 == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ end1 = start1 + size / 2;
+
+ start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
+ if (start2 == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ for (i = 0; i < rounds; ++i) {
+ struct timeval before, after;
+ volatile char *ptr1 = start1, *ptr2 = start2;
+ gettimeofday(&before, NULL);
+ for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
+ sum += *ptr1 + *ptr2;
+ gettimeofday(&after, NULL);
+		printf("File access time, round %d: %f (sec)\n", i,
+ (after.tv_sec - before.tv_sec) +
+ (after.tv_usec - before.tv_usec) / 1000000.0);
+ }
+ return sum;
+}
+
+int main(int argc, char *argv[])
+{
+ const long MB = 1024 * 1024;
+ long anon_mb, file_mb, file_rounds;
+ const char filename[] = "large";
+ long *ret1;
+ long ret2;
+
+ if (argc != 4) {
+		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
+ exit(0);
+ }
+ anon_mb = atoi(argv[1]);
+ file_mb = atoi(argv[2]);
+ file_rounds = atoi(argv[3]);
+
+ fallocate_file(filename, file_mb * MB);
+	printf("Allocate %ld MB anonymous pages\n", anon_mb);
+ ret1 = alloc_anon(anon_mb * MB);
+	printf("Access %ld MB file pages\n", file_mb);
+ ret2 = access_file(filename, file_mb * MB, file_rounds);
+	printf("Print result to prevent optimization: %ld\n",
+ *ret1 + ret2);
+ return 0;
+}
+---8<---
+
+Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
+program fills RAM with 2048 MB of anonymous memory and accesses a 200 MB
+file 10 times. Without this patch, the file cache is dropped aggressively
+and every access to the file is served from disk.
+
+ $ ./thrash 2048 200 10
+ Allocate 2048 MB anonymous pages
+ Access 200 MB file pages
+ File access time, round 0: 2.489316 (sec)
+ File access time, round 1: 2.581277 (sec)
+ File access time, round 2: 2.487624 (sec)
+ File access time, round 3: 2.449100 (sec)
+ File access time, round 4: 2.420423 (sec)
+ File access time, round 5: 2.343411 (sec)
+ File access time, round 6: 2.454833 (sec)
+ File access time, round 7: 2.483398 (sec)
+ File access time, round 8: 2.572701 (sec)
+ File access time, round 9: 2.493014 (sec)
+
+With this patch, these file pages can be cached.
+
+ $ ./thrash 2048 200 10
+ Allocate 2048 MB anonymous pages
+ Access 200 MB file pages
+ File access time, round 0: 2.475189 (sec)
+ File access time, round 1: 2.440777 (sec)
+ File access time, round 2: 2.411671 (sec)
+ File access time, round 3: 1.955267 (sec)
+ File access time, round 4: 0.029924 (sec)
+ File access time, round 5: 0.000808 (sec)
+ File access time, round 6: 0.000771 (sec)
+ File access time, round 7: 0.000746 (sec)
+ File access time, round 8: 0.000738 (sec)
+ File access time, round 9: 0.000747 (sec)
+
+Checking the swap-out stats during the test [1]: 19006 pages were swapped
+out with this patch, 3418 pages without it. There is more swap-out, but I
+think it is within a reasonable range when the file-backed data set does
+not fit into memory.
+
+ $ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES
+ Allocate 2000 MB anonymous pages
+ active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
+ pswpout: 7972443, pgpgin: 478615246
+ Access 100 MB executable file pages
+ Access 2100 MB regular file pages
+ File access time, round 0: 12.165, (sec)
+ active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB)
+ File access time, round 1: 11.493, (sec)
+ active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB)
+ File access time, round 2: 11.455, (sec)
+ active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB)
+ File access time, round 3: 11.454, (sec)
+ active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB)
+ File access time, round 4: 11.479, (sec)
+ active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB)
+ pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120)
+
+With 4 processes accessing non-overlapping parts of a large file, 30316
+pages were swapped out with this patch, 5152 pages without it. The
+swap-out number is small compared to pgpgin.
+
+[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
+
+Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
+Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
+Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sonny Rao <sonnyrao@chromium.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: <stable@vger.kernel.org> [4.12+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[backported to 4.14.y, 4.19.y, 5.1.y: adjust context]
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2176,7 +2176,7 @@ static void shrink_active_list(unsigned
+ * 10TB 320 32GB
+ */
+ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+- struct scan_control *sc, bool actual_reclaim)
++ struct scan_control *sc, bool trace)
+ {
+ enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+@@ -2202,7 +2202,7 @@ static bool inactive_list_is_low(struct
+ * rid of the stale workingset quickly.
+ */
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+- if (file && actual_reclaim && lruvec->refaults != refaults) {
++ if (file && lruvec->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+@@ -2212,7 +2212,7 @@ static bool inactive_list_is_low(struct
+ inactive_ratio = 1;
+ }
+
+- if (actual_reclaim)
++ if (trace)
+ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
--- /dev/null
+From ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Mon, 22 Jul 2019 13:31:27 +0200
+Subject: Revert "kvm: x86: Use task structs fpu field for user"
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe upstream.
+
+This reverts commit 240c35a3783ab9b3a0afaba0dde7291295680a6b
+("kvm: x86: Use task structs fpu field for user", 2018-11-06).
+The commit is broken and causes QEMU's FPU state to be destroyed
+when KVM_RUN is preempted.
+
+Fixes: 240c35a3783a ("kvm: x86: Use task structs fpu field for user")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h | 7 ++++---
+ arch/x86/kvm/x86.c | 4 ++--
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -609,15 +609,16 @@ struct kvm_vcpu_arch {
+
+ /*
+ * QEMU userspace and the guest each have their own FPU state.
+- * In vcpu_run, we switch between the user, maintained in the
+- * task_struct struct, and guest FPU contexts. While running a VCPU,
+- * the VCPU thread will have the guest FPU context.
++ * In vcpu_run, we switch between the user and guest FPU contexts.
++ * While running a VCPU, the VCPU thread will have the guest FPU
++ * context.
+ *
+ * Note that while the PKRU state lives inside the fpu registers,
+ * it is switched out separately at VMENTER and VMEXIT time. The
+ * "guest_fpu" state here contains the guest FPU context, with the
+ * host PRKU bits.
+ */
++ struct fpu user_fpu;
+ struct fpu *guest_fpu;
+
+ u64 xcr0;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8172,7 +8172,7 @@ static int complete_emulated_mmio(struct
+ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+ {
+ preempt_disable();
+- copy_fpregs_to_fpstate(¤t->thread.fpu);
++ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ ~XFEATURE_MASK_PKRU);
+@@ -8185,7 +8185,7 @@ static void kvm_put_guest_fpu(struct kvm
+ {
+ preempt_disable();
+ copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+- copy_kernel_to_fpregs(¤t->thread.fpu.state);
++ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ preempt_enable();
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
--- /dev/null
+From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:17 +0900
+Subject: sd_zbc: Fix report zones buffer allocation
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit b091ac616846a1da75b1f2566b41255ce7f0e0a6 upstream.
+
+During disk scan and revalidation done with sd_revalidate(), the zones
+of a zoned disk are checked using the helper function
+blk_revalidate_disk_zones() if a configuration change is detected
+(change in the number of zones or zone size). The function
+blk_revalidate_disk_zones() issues report_zones calls that are very
+large, that is, requesting zone information for all zones of the disk
+with a single command. The report zones command buffer needed for such
+a large request is generally smaller than the disk max_hw_sectors and
+KMALLOC_MAX_SIZE (4MB), so its allocation succeeds at boot time (no
+memory fragmentation), but it often fails at run time (e.g. after a
+hot-plug event). This causes the disk revalidation to fail and the disk
+capacity to be changed to 0.
+
+This problem can be avoided by using vmalloc() instead of kmalloc() for
+the buffer allocation. To limit the amount of memory to be allocated,
+this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
+maximum number of zones to report with a single report zones command.
+This limit may be lowered further to satisfy the disk max_hw_sectors
+limit. Finally, to ensure that the vmalloc-ed buffer can always be
+mapped in a request, the buffer size is further limited to at most
+queue_max_segments() pages, allowing successful mapping of the buffer
+even in the worst case scenario where none of the buffer pages are
+contiguous.
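+
+As a concrete illustration, with hypothetical queue limits of
+max_hw_sectors = 1024 (512 KiB) and max_segments = 64 (4 KiB pages), a
+request for the full 8192 zones is clamped to 4095 zones per command. A
+userspace sketch of the same size calculation (illustrative values only):
+
+	#include <stdio.h>
+
+	#define SECTOR_SHIFT	9
+	#define PAGE_SHIFT	12	/* assuming 4 KiB pages */
+
+	static size_t roundup_512(size_t v) { return (v + 511) & ~(size_t)511; }
+	static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }
+
+	int main(void)
+	{
+		size_t nr_zones = 8192;		/* SD_ZBC_REPORT_MAX_ZONES */
+		size_t max_hw_sectors = 1024;	/* hypothetical queue limit */
+		size_t max_segments = 64;	/* hypothetical queue limit */
+
+		/* 64 B per zone descriptor plus a 64 B header, 512 B aligned */
+		size_t bufsize = roundup_512((nr_zones + 1) * 64);
+
+		bufsize = min_sz(bufsize, max_hw_sectors << SECTOR_SHIFT);
+		bufsize = min_sz(bufsize, max_segments << PAGE_SHIFT);
+
+		printf("report buffer: %zu KiB, %zu zones per command\n",
+		       bufsize / 1024, bufsize / 64 - 1);
+		return 0;
+	}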
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/sd_zbc.c | 104 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 75 insertions(+), 29 deletions(-)
+
+--- a/drivers/scsi/sd_zbc.c
++++ b/drivers/scsi/sd_zbc.c
+@@ -23,6 +23,8 @@
+ */
+
+ #include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+
+ #include <asm/unaligned.h>
+
+@@ -64,7 +66,7 @@ static void sd_zbc_parse_report(struct s
+ /**
+ * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
+ * @sdkp: The target disk
+- * @buf: Buffer to use for the reply
++ * @buf: vmalloc-ed buffer to use for the reply
+ * @buflen: the buffer size
+ * @lba: Start LBA of the report
+ * @partial: Do partial report
+@@ -93,7 +95,6 @@ static int sd_zbc_do_report_zones(struct
+ put_unaligned_be32(buflen, &cmd[10]);
+ if (partial)
+ cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
+- memset(buf, 0, buflen);
+
+ result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
+ buf, buflen, &sshdr,
+@@ -117,6 +118,53 @@ static int sd_zbc_do_report_zones(struct
+ return 0;
+ }
+
++/*
++ * Maximum number of zones to get with one report zones command.
++ */
++#define SD_ZBC_REPORT_MAX_ZONES 8192U
++
++/**
++ * Allocate a buffer for report zones reply.
++ * @sdkp: The target disk
++ * @nr_zones: Maximum number of zones to report
++ * @buflen: Size of the buffer allocated
++ *
++ * Try to allocate a reply buffer for the number of requested zones.
++ * The size of the buffer allocated may be smaller than requested to
++ * satify the device constraint (max_hw_sectors, max_segments, etc).
++ *
++ * Return the address of the allocated buffer and update @buflen with
++ * the size of the allocated buffer.
++ */
++static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
++ unsigned int nr_zones, size_t *buflen)
++{
++ struct request_queue *q = sdkp->disk->queue;
++ size_t bufsize;
++ void *buf;
++
++ /*
++ * Report zone buffer size should be at most 64B times the number of
++ * zones requested plus the 64B reply header, but should be at least
++ * SECTOR_SIZE for ATA devices.
++ * Make sure that this size does not exceed the hardware capabilities.
++ * Furthermore, since the report zone command cannot be split, make
++ * sure that the allocated buffer can always be mapped by limiting the
++ * number of pages allocated to the HBA max segments limit.
++ */
++ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
++ bufsize = roundup((nr_zones + 1) * 64, 512);
++ bufsize = min_t(size_t, bufsize,
++ queue_max_hw_sectors(q) << SECTOR_SHIFT);
++ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
++
++ buf = vzalloc(bufsize);
++ if (buf)
++ *buflen = bufsize;
++
++ return buf;
++}
++
+ /**
+ * sd_zbc_report_zones - Disk report zones operation.
+ * @disk: The target disk
+@@ -132,30 +180,23 @@ int sd_zbc_report_zones(struct gendisk *
+ gfp_t gfp_mask)
+ {
+ struct scsi_disk *sdkp = scsi_disk(disk);
+- unsigned int i, buflen, nrz = *nr_zones;
++ unsigned int i, nrz = *nr_zones;
+ unsigned char *buf;
+- size_t offset = 0;
++ size_t buflen = 0, offset = 0;
+ int ret = 0;
+
+ if (!sd_is_zoned(sdkp))
+ /* Not a zoned device */
+ return -EOPNOTSUPP;
+
+- /*
+- * Get a reply buffer for the number of requested zones plus a header,
+- * without exceeding the device maximum command size. For ATA disks,
+- * buffers must be aligned to 512B.
+- */
+- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
+- roundup((nrz + 1) * 64, 512));
+- buf = kmalloc(buflen, gfp_mask);
++ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
+ if (!buf)
+ return -ENOMEM;
+
+ ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
+ sectors_to_logical(sdkp->device, sector), true);
+ if (ret)
+- goto out_free_buf;
++ goto out;
+
+ nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
+ for (i = 0; i < nrz; i++) {
+@@ -166,8 +207,8 @@ int sd_zbc_report_zones(struct gendisk *
+
+ *nr_zones = nrz;
+
+-out_free_buf:
+- kfree(buf);
++out:
++ kvfree(buf);
+
+ return ret;
+ }
+@@ -301,8 +342,6 @@ static int sd_zbc_check_zoned_characteri
+ return 0;
+ }
+
+-#define SD_ZBC_BUF_SIZE 131072U
+-
+ /**
+ * sd_zbc_check_zones - Check the device capacity and zone sizes
+ * @sdkp: Target disk
+@@ -318,22 +357,28 @@ static int sd_zbc_check_zoned_characteri
+ */
+ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
+ {
++ size_t bufsize, buflen;
++ unsigned int noio_flag;
+ u64 zone_blocks = 0;
+ sector_t max_lba, block = 0;
+ unsigned char *buf;
+ unsigned char *rec;
+- unsigned int buf_len;
+- unsigned int list_length;
+ int ret;
+ u8 same;
+
++ /* Do all memory allocations as if GFP_NOIO was specified */
++ noio_flag = memalloc_noio_save();
++
+ /* Get a buffer */
+- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+- if (!buf)
+- return -ENOMEM;
++ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
++ &bufsize);
++ if (!buf) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
+ /* Do a report zone to get max_lba and the same field */
+- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
+ if (ret)
+ goto out_free;
+
+@@ -369,12 +414,12 @@ static int sd_zbc_check_zones(struct scs
+ do {
+
+ /* Parse REPORT ZONES header */
+- list_length = get_unaligned_be32(&buf[0]) + 64;
++ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
++ bufsize);
+ rec = buf + 64;
+- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
+
+ /* Parse zone descriptors */
+- while (rec < buf + buf_len) {
++ while (rec < buf + buflen) {
+ u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
+
+ if (zone_blocks == 0) {
+@@ -390,8 +435,8 @@ static int sd_zbc_check_zones(struct scs
+ }
+
+ if (block < sdkp->capacity) {
+- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+- block, true);
++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
++ true);
+ if (ret)
+ goto out_free;
+ }
+@@ -422,7 +467,8 @@ out:
+ }
+
+ out_free:
+- kfree(buf);
++ memalloc_noio_restore(noio_flag);
++ kvfree(buf);
+
+ return ret;
+ }
jbd2-introduce-jbd2_inode-dirty-range-scoping.patch
ext4-use-jbd2_inode-dirty-range-scoping.patch
ext4-allow-directory-holes.patch
+kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
+kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
+revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
+sd_zbc-fix-report-zones-buffer-allocation.patch
+block-limit-zone-array-allocation-size.patch
+mm-vmscan-scan-anonymous-pages-on-file-refaults.patch