From: Greg Kroah-Hartman Date: Fri, 26 Jul 2019 13:58:18 +0000 (+0200) Subject: 5.1-stable patches X-Git-Tag: v5.2.4~6 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=440b48b84b9c8ae81f901a23a19bf91944523730;p=thirdparty%2Fkernel%2Fstable-queue.git 5.1-stable patches added patches: block-limit-zone-array-allocation-size.patch kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch mm-vmscan-scan-anonymous-pages-on-file-refaults.patch revert-kvm-x86-use-task-structs-fpu-field-for-user.patch sd_zbc-fix-report-zones-buffer-allocation.patch --- diff --git a/queue-5.1/block-limit-zone-array-allocation-size.patch b/queue-5.1/block-limit-zone-array-allocation-size.patch new file mode 100644 index 00000000000..280d1ac24ac --- /dev/null +++ b/queue-5.1/block-limit-zone-array-allocation-size.patch @@ -0,0 +1,136 @@ +From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Mon, 1 Jul 2019 14:09:18 +0900 +Subject: block: Limit zone array allocation size + +From: Damien Le Moal + +commit 26202928fafad8bda8b478edb7e62c885be623d7 upstream. + +Limit the size of the struct blk_zone array used in +blk_revalidate_disk_zones() to avoid memory allocation failures leading +to disk revalidation failure. Also further reduce the likelihood of +such failures by using kvcalloc() (that is vmalloc()) instead of +allocating contiguous pages with alloc_pages(). + +Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") +Fixes: e76239a3748c ("block: add a report_zones method") +Cc: stable@vger.kernel.org +Reviewed-by: Christoph Hellwig +Reviewed-by: Martin K. 
Petersen +Signed-off-by: Damien Le Moal +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/blk-zoned.c | 46 ++++++++++++++++++++++++++++++---------------- + include/linux/blkdev.h | 5 +++++ + 2 files changed, 35 insertions(+), 16 deletions(-) + +--- a/block/blk-zoned.c ++++ b/block/blk-zoned.c +@@ -13,6 +13,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include "blk.h" + +@@ -372,22 +375,25 @@ static inline unsigned long *blk_alloc_z + * Allocate an array of struct blk_zone to get nr_zones zone information. + * The allocated array may be smaller than nr_zones. + */ +-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) ++static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) + { +- size_t size = *nr_zones * sizeof(struct blk_zone); +- struct page *page; +- int order; +- +- for (order = get_order(size); order >= 0; order--) { +- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); +- if (page) { +- *nr_zones = min_t(unsigned int, *nr_zones, +- (PAGE_SIZE << order) / sizeof(struct blk_zone)); +- return page_address(page); +- } ++ struct blk_zone *zones; ++ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); ++ ++ /* ++ * GFP_KERNEL here is meaningless as the caller task context has ++ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() ++ * with memalloc_noio_save(). 
++ */ ++ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); ++ if (!zones) { ++ *nr_zones = 0; ++ return NULL; + } + +- return NULL; ++ *nr_zones = nrz; ++ ++ return zones; + } + + void blk_queue_free_zone_bitmaps(struct request_queue *q) +@@ -414,6 +420,7 @@ int blk_revalidate_disk_zones(struct gen + unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; + unsigned int i, rep_nr_zones = 0, z = 0, nrz; + struct blk_zone *zones = NULL; ++ unsigned int noio_flag; + sector_t sector = 0; + int ret = 0; + +@@ -426,6 +433,12 @@ int blk_revalidate_disk_zones(struct gen + return 0; + } + ++ /* ++ * Ensure that all memory allocations in this context are done as ++ * if GFP_NOIO was specified. ++ */ ++ noio_flag = memalloc_noio_save(); ++ + if (!blk_queue_is_zoned(q) || !nr_zones) { + nr_zones = 0; + goto update; +@@ -442,7 +455,7 @@ int blk_revalidate_disk_zones(struct gen + + /* Get zone information and initialize seq_zones_bitmap */ + rep_nr_zones = nr_zones; +- zones = blk_alloc_zones(q->node, &rep_nr_zones); ++ zones = blk_alloc_zones(&rep_nr_zones); + if (!zones) + goto out; + +@@ -479,8 +492,9 @@ update: + blk_mq_unfreeze_queue(q); + + out: +- free_pages((unsigned long)zones, +- get_order(rep_nr_zones * sizeof(struct blk_zone))); ++ memalloc_noio_restore(noio_flag); ++ ++ kvfree(zones); + kfree(seq_zones_wlock); + kfree(seq_zones_bitmap); + +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -344,6 +344,11 @@ struct queue_limits { + + #ifdef CONFIG_BLK_DEV_ZONED + ++/* ++ * Maximum number of zones to report with a single report zones command. 
++ */ ++#define BLK_ZONED_REPORT_MAX_ZONES 8192U ++ + extern unsigned int blkdev_nr_zones(struct block_device *bdev); + extern int blkdev_report_zones(struct block_device *bdev, + sector_t sector, struct blk_zone *zones, diff --git a/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch b/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch new file mode 100644 index 00000000000..9eaddb5df79 --- /dev/null +++ b/queue-5.1/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch @@ -0,0 +1,37 @@ +From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001 +From: Jan Kiszka +Date: Sun, 21 Jul 2019 13:52:18 +0200 +Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested + +From: Jan Kiszka + +commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream. + +Letting this pend may cause nested_get_vmcs12_pages to run against an +invalid state, corrupting the effective vmcs of L1. + +This was triggerable in QEMU after a guest corruption in L2, followed by +a L1 reset. 
+ +Signed-off-by: Jan Kiszka +Reviewed-by: Liran Alon +Cc: stable@vger.kernel.org +Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM") +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/nested.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -212,6 +212,8 @@ static void free_nested(struct kvm_vcpu + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) + return; + ++ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); ++ + vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; + free_vpid(vmx->nested.vpid02); diff --git a/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch b/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch new file mode 100644 index 00000000000..cb802dd8178 --- /dev/null +++ b/queue-5.1/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch @@ -0,0 +1,65 @@ +From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 19 Jul 2019 18:41:10 +0200 +Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset + +From: Paolo Bonzini + +commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream. + +If a KVM guest is reset while running a nested guest, free_nested will +disable the shadow VMCS execution control in the vmcs01. However, +on the next KVM_RUN vmx_vcpu_run would nevertheless try to sync +the VMCS12 to the shadow VMCS which has since been freed. + +This causes a vmptrld of a NULL pointer on my machine, but Jan reports +the host to hang altogether. Let's see how much this trivial patch fixes.
+ +Reported-by: Jan Kiszka +Cc: Liran Alon +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/nested.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -184,6 +184,7 @@ static void vmx_disable_shadow_vmcs(stru + { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); ++ vmx->nested.need_vmcs12_sync = false; + } + + static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +@@ -1328,6 +1329,9 @@ static void copy_shadow_to_vmcs12(struct + u64 field_value; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + ++ if (WARN_ON(!shadow_vmcs)) ++ return; ++ + preempt_disable(); + + vmcs_load(shadow_vmcs); +@@ -1366,6 +1370,9 @@ static void copy_vmcs12_to_shadow(struct + u64 field_value = 0; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + ++ if (WARN_ON(!shadow_vmcs)) ++ return; ++ + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { +@@ -4336,7 +4343,6 @@ static inline void nested_release_vmcs12 + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); +- vmx->nested.need_vmcs12_sync = false; + vmx_disable_shadow_vmcs(vmx); + } + vmx->nested.posted_intr_nv = -1; diff --git a/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch b/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch new file mode 100644 index 00000000000..207dbb72e08 --- /dev/null +++ b/queue-5.1/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch @@ -0,0 +1,241 @@ +From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001 +From: Kuo-Hsin Yang +Date: Thu, 11 Jul 2019 20:52:04 -0700 +Subject: mm: vmscan: scan anonymous pages on file refaults + +From: Kuo-Hsin Yang + +commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream. 
+ +When file refaults are detected and there are many inactive file pages, +the system never reclaim anonymous pages, the file pages are dropped +aggressively when there are still a lot of cold anonymous pages and +system thrashes. This issue impacts the performance of applications +with large executable, e.g. chrome. + +With this patch, when file refault is detected, inactive_list_is_low() +always returns true for file pages in get_scan_count() to enable +scanning anonymous pages. + +The problem can be reproduced by the following test program. + +---8<--- +void fallocate_file(const char *filename, off_t size) +{ + struct stat st; + int fd; + + if (!stat(filename, &st) && st.st_size >= size) + return; + + fd = open(filename, O_WRONLY | O_CREAT, 0600); + if (fd < 0) { + perror("create file"); + exit(1); + } + if (posix_fallocate(fd, 0, size)) { + perror("fallocate"); + exit(1); + } + close(fd); +} + +long *alloc_anon(long size) +{ + long *start = malloc(size); + memset(start, 1, size); + return start; +} + +long access_file(const char *filename, long size, long rounds) +{ + int fd, i; + volatile char *start1, *end1, *start2; + const int page_size = getpagesize(); + long sum = 0; + + fd = open(filename, O_RDONLY); + if (fd == -1) { + perror("open"); + exit(1); + } + + /* + * Some applications, e.g. chrome, use a lot of executable file + * pages, map some of the pages with PROT_EXEC flag to simulate + * the behavior. 
+ */ + start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED, + fd, 0); + if (start1 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + end1 = start1 + size / 2; + + start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2); + if (start2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + for (i = 0; i < rounds; ++i) { + struct timeval before, after; + volatile char *ptr1 = start1, *ptr2 = start2; + gettimeofday(&before, NULL); + for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size) + sum += *ptr1 + *ptr2; + gettimeofday(&after, NULL); + printf("File access time, round %d: %f (sec) +", i, + (after.tv_sec - before.tv_sec) + + (after.tv_usec - before.tv_usec) / 1000000.0); + } + return sum; +} + +int main(int argc, char *argv[]) +{ + const long MB = 1024 * 1024; + long anon_mb, file_mb, file_rounds; + const char filename[] = "large"; + long *ret1; + long ret2; + + if (argc != 4) { + printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS +"); + exit(0); + } + anon_mb = atoi(argv[1]); + file_mb = atoi(argv[2]); + file_rounds = atoi(argv[3]); + + fallocate_file(filename, file_mb * MB); + printf("Allocate %ld MB anonymous pages +", anon_mb); + ret1 = alloc_anon(anon_mb * MB); + printf("Access %ld MB file pages +", file_mb); + ret2 = access_file(filename, file_mb * MB, file_rounds); + printf("Print result to prevent optimization: %ld +", + *ret1 + ret2); + return 0; +} +---8<--- + +Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the program +fills ram with 2048 MB memory, accesses a 200 MB file 10 times. Without +this patch, the file cache is dropped aggressively and every access to the +file is from disk.
+ + $ ./thrash 2048 200 10 + Allocate 2048 MB anonymous pages + Access 200 MB file pages + File access time, round 0: 2.489316 (sec) + File access time, round 1: 2.581277 (sec) + File access time, round 2: 2.487624 (sec) + File access time, round 3: 2.449100 (sec) + File access time, round 4: 2.420423 (sec) + File access time, round 5: 2.343411 (sec) + File access time, round 6: 2.454833 (sec) + File access time, round 7: 2.483398 (sec) + File access time, round 8: 2.572701 (sec) + File access time, round 9: 2.493014 (sec) + +With this patch, these file pages can be cached. + + $ ./thrash 2048 200 10 + Allocate 2048 MB anonymous pages + Access 200 MB file pages + File access time, round 0: 2.475189 (sec) + File access time, round 1: 2.440777 (sec) + File access time, round 2: 2.411671 (sec) + File access time, round 3: 1.955267 (sec) + File access time, round 4: 0.029924 (sec) + File access time, round 5: 0.000808 (sec) + File access time, round 6: 0.000771 (sec) + File access time, round 7: 0.000746 (sec) + File access time, round 8: 0.000738 (sec) + File access time, round 9: 0.000747 (sec) + +Checked the swap out stats during the test [1], 19006 pages swapped out +with this patch, 3418 pages swapped out without this patch. There are +more swap out, but I think it's within reasonable range when file backed +data set doesn't fit into the memory. 
+ +$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS +PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644, +inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB) +pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages +Access 2100 MB regular file pages File access time, round 0: 12.165, +(sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896, +inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec) +active_anon: 1430576, inactive_anon: 477144, active_file: 25440, +inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec) +active_anon: 1427436, inactive_anon: 476060, active_file: 21112, +inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec) +active_anon: 1420444, inactive_anon: 473632, active_file: 23216, +inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec) +active_anon: 1413964, inactive_anon: 471460, active_file: 31728, +inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366 +(+ 11309120) + +With 4 processes accessing non-overlapping parts of a large file, 30316 +pages swapped out with this patch, 5152 pages swapped out without this +patch. The swapout number is small compared to pgpgin.
+ +[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c + +Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com +Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty") +Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty") +Signed-off-by: Kuo-Hsin Yang +Acked-by: Johannes Weiner +Cc: Michal Hocko +Cc: Sonny Rao +Cc: Mel Gorman +Cc: Rik van Riel +Cc: Vladimir Davydov +Cc: Minchan Kim +Cc: [4.12+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[backported to 4.14.y, 4.19.y, 5.1.y: adjust context] +Signed-off-by: Kuo-Hsin Yang +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2176,7 +2176,7 @@ static void shrink_active_list(unsigned + * 10TB 320 32GB + */ + static bool inactive_list_is_low(struct lruvec *lruvec, bool file, +- struct scan_control *sc, bool actual_reclaim) ++ struct scan_control *sc, bool trace) + { + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); +@@ -2202,7 +2202,7 @@ static bool inactive_list_is_low(struct + * rid of the stale workingset quickly. 
+ */ + refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); +- if (file && actual_reclaim && lruvec->refaults != refaults) { ++ if (file && lruvec->refaults != refaults) { + inactive_ratio = 0; + } else { + gb = (inactive + active) >> (30 - PAGE_SHIFT); +@@ -2212,7 +2212,7 @@ static bool inactive_list_is_low(struct + inactive_ratio = 1; + } + +- if (actual_reclaim) ++ if (trace) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, diff --git a/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch b/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch new file mode 100644 index 00000000000..797cf9dcc14 --- /dev/null +++ b/queue-5.1/revert-kvm-x86-use-task-structs-fpu-field-for-user.patch @@ -0,0 +1,66 @@ +From ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 22 Jul 2019 13:31:27 +0200 +Subject: Revert "kvm: x86: Use task structs fpu field for user" + +From: Paolo Bonzini + +commit ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe upstream. + +This reverts commit 240c35a3783ab9b3a0afaba0dde7291295680a6b +("kvm: x86: Use task structs fpu field for user", 2018-11-06). +The commit is broken and causes QEMU's FPU state to be destroyed +when KVM_RUN is preempted. + +Fixes: 240c35a3783a ("kvm: x86: Use task structs fpu field for user") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 7 ++++--- + arch/x86/kvm/x86.c | 4 ++-- + 2 files changed, 6 insertions(+), 5 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -609,15 +609,16 @@ struct kvm_vcpu_arch { + + /* + * QEMU userspace and the guest each have their own FPU state. 
+- * In vcpu_run, we switch between the user, maintained in the +- * task_struct struct, and guest FPU contexts. While running a VCPU, +- * the VCPU thread will have the guest FPU context. ++ * In vcpu_run, we switch between the user and guest FPU contexts. ++ * While running a VCPU, the VCPU thread will have the guest FPU ++ * context. + * + * Note that while the PKRU state lives inside the fpu registers, + * it is switched out separately at VMENTER and VMEXIT time. The + * "guest_fpu" state here contains the guest FPU context, with the + * host PRKU bits. + */ ++ struct fpu user_fpu; + struct fpu *guest_fpu; + + u64 xcr0; +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8172,7 +8172,7 @@ static int complete_emulated_mmio(struct + static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) + { + preempt_disable(); +- copy_fpregs_to_fpstate(&current->thread.fpu); ++ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + ~XFEATURE_MASK_PKRU); +@@ -8185,7 +8185,7 @@ static void kvm_put_guest_fpu(struct kvm + { + preempt_disable(); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); +- copy_kernel_to_fpregs(&current->thread.fpu.state); ++ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); diff --git a/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch b/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch new file mode 100644 index 00000000000..f7848d7367e --- /dev/null +++ b/queue-5.1/sd_zbc-fix-report-zones-buffer-allocation.patch @@ -0,0 +1,256 @@ +From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Mon, 1 Jul 2019 14:09:17 +0900 +Subject: sd_zbc: Fix report zones buffer allocation + +From: Damien Le Moal + +commit b091ac616846a1da75b1f2566b41255ce7f0e0a6 upstream. 
+ +During disk scan and revalidation done with sd_revalidate(), the zones +of a zoned disk are checked using the helper function +blk_revalidate_disk_zones() if a configuration change is detected +(change in the number of zones or zone size). The function +blk_revalidate_disk_zones() issues report_zones calls that are very +large, that is, to obtain zone information for all zones of the disk +with a single command. The size of the report zones command buffer +necessary for such large request generally is lower than the disk +max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no +memory fragmentation), but often fail at run time (e.g. hot-plug +event). This causes the disk revalidation to fail and the disk +capacity to be changed to 0. + +This problem can be avoided by using vmalloc() instead of kmalloc() for +the buffer allocation. To limit the amount of memory to be allocated, +this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES +maximum number of zones to report with a single report zones command. +This limit may be lowered further to satisfy the disk max_hw_sectors +limit. Finally, to ensure that the vmalloc-ed buffer can always be +mapped in a request, the buffer size is further limited to at most +queue_max_segments() pages, allowing successful mapping of the buffer +even in the worst case scenario where none of the buffer pages are +contiguous. + +Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") +Fixes: e76239a3748c ("block: add a report_zones method") +Cc: stable@vger.kernel.org +Reviewed-by: Christoph Hellwig +Reviewed-by: Martin K. 
Petersen +Signed-off-by: Damien Le Moal +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + + +--- + drivers/scsi/sd_zbc.c | 104 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 75 insertions(+), 29 deletions(-) + +--- a/drivers/scsi/sd_zbc.c ++++ b/drivers/scsi/sd_zbc.c +@@ -23,6 +23,8 @@ + */ + + #include ++#include ++#include + + #include + +@@ -64,7 +66,7 @@ static void sd_zbc_parse_report(struct s + /** + * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command. + * @sdkp: The target disk +- * @buf: Buffer to use for the reply ++ * @buf: vmalloc-ed buffer to use for the reply + * @buflen: the buffer size + * @lba: Start LBA of the report + * @partial: Do partial report +@@ -93,7 +95,6 @@ static int sd_zbc_do_report_zones(struct + put_unaligned_be32(buflen, &cmd[10]); + if (partial) + cmd[14] = ZBC_REPORT_ZONE_PARTIAL; +- memset(buf, 0, buflen); + + result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, + buf, buflen, &sshdr, +@@ -117,6 +118,53 @@ static int sd_zbc_do_report_zones(struct + return 0; + } + ++/* ++ * Maximum number of zones to get with one report zones command. ++ */ ++#define SD_ZBC_REPORT_MAX_ZONES 8192U ++ ++/** ++ * Allocate a buffer for report zones reply. ++ * @sdkp: The target disk ++ * @nr_zones: Maximum number of zones to report ++ * @buflen: Size of the buffer allocated ++ * ++ * Try to allocate a reply buffer for the number of requested zones. ++ * The size of the buffer allocated may be smaller than requested to ++ * satify the device constraint (max_hw_sectors, max_segments, etc). ++ * ++ * Return the address of the allocated buffer and update @buflen with ++ * the size of the allocated buffer. 
++ */ ++static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, ++ unsigned int nr_zones, size_t *buflen) ++{ ++ struct request_queue *q = sdkp->disk->queue; ++ size_t bufsize; ++ void *buf; ++ ++ /* ++ * Report zone buffer size should be at most 64B times the number of ++ * zones requested plus the 64B reply header, but should be at least ++ * SECTOR_SIZE for ATA devices. ++ * Make sure that this size does not exceed the hardware capabilities. ++ * Furthermore, since the report zone command cannot be split, make ++ * sure that the allocated buffer can always be mapped by limiting the ++ * number of pages allocated to the HBA max segments limit. ++ */ ++ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES); ++ bufsize = roundup((nr_zones + 1) * 64, 512); ++ bufsize = min_t(size_t, bufsize, ++ queue_max_hw_sectors(q) << SECTOR_SHIFT); ++ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); ++ ++ buf = vzalloc(bufsize); ++ if (buf) ++ *buflen = bufsize; ++ ++ return buf; ++} ++ + /** + * sd_zbc_report_zones - Disk report zones operation. + * @disk: The target disk +@@ -132,30 +180,23 @@ int sd_zbc_report_zones(struct gendisk * + gfp_t gfp_mask) + { + struct scsi_disk *sdkp = scsi_disk(disk); +- unsigned int i, buflen, nrz = *nr_zones; ++ unsigned int i, nrz = *nr_zones; + unsigned char *buf; +- size_t offset = 0; ++ size_t buflen = 0, offset = 0; + int ret = 0; + + if (!sd_is_zoned(sdkp)) + /* Not a zoned device */ + return -EOPNOTSUPP; + +- /* +- * Get a reply buffer for the number of requested zones plus a header, +- * without exceeding the device maximum command size. For ATA disks, +- * buffers must be aligned to 512B. 
+- */ +- buflen = min(queue_max_hw_sectors(disk->queue) << 9, +- roundup((nrz + 1) * 64, 512)); +- buf = kmalloc(buflen, gfp_mask); ++ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen); + if (!buf) + return -ENOMEM; + + ret = sd_zbc_do_report_zones(sdkp, buf, buflen, + sectors_to_logical(sdkp->device, sector), true); + if (ret) +- goto out_free_buf; ++ goto out; + + nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64); + for (i = 0; i < nrz; i++) { +@@ -166,8 +207,8 @@ int sd_zbc_report_zones(struct gendisk * + + *nr_zones = nrz; + +-out_free_buf: +- kfree(buf); ++out: ++ kvfree(buf); + + return ret; + } +@@ -301,8 +342,6 @@ static int sd_zbc_check_zoned_characteri + return 0; + } + +-#define SD_ZBC_BUF_SIZE 131072U +- + /** + * sd_zbc_check_zones - Check the device capacity and zone sizes + * @sdkp: Target disk +@@ -318,22 +357,28 @@ static int sd_zbc_check_zoned_characteri + */ + static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) + { ++ size_t bufsize, buflen; ++ unsigned int noio_flag; + u64 zone_blocks = 0; + sector_t max_lba, block = 0; + unsigned char *buf; + unsigned char *rec; +- unsigned int buf_len; +- unsigned int list_length; + int ret; + u8 same; + ++ /* Do all memory allocations as if GFP_NOIO was specified */ ++ noio_flag = memalloc_noio_save(); ++ + /* Get a buffer */ +- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); +- if (!buf) +- return -ENOMEM; ++ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES, ++ &bufsize); ++ if (!buf) { ++ ret = -ENOMEM; ++ goto out; ++ } + + /* Do a report zone to get max_lba and the same field */ +- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false); ++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false); + if (ret) + goto out_free; + +@@ -369,12 +414,12 @@ static int sd_zbc_check_zones(struct scs + do { + + /* Parse REPORT ZONES header */ +- list_length = get_unaligned_be32(&buf[0]) + 64; ++ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64, ++ bufsize); + rec = buf 
+ 64; +- buf_len = min(list_length, SD_ZBC_BUF_SIZE); + + /* Parse zone descriptors */ +- while (rec < buf + buf_len) { ++ while (rec < buf + buflen) { + u64 this_zone_blocks = get_unaligned_be64(&rec[8]); + + if (zone_blocks == 0) { +@@ -390,8 +435,8 @@ static int sd_zbc_check_zones(struct scs + } + + if (block < sdkp->capacity) { +- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, +- block, true); ++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block, ++ true); + if (ret) + goto out_free; + } +@@ -422,7 +467,8 @@ out: + } + + out_free: +- kfree(buf); ++ memalloc_noio_restore(noio_flag); ++ kvfree(buf); + + return ret; + } diff --git a/queue-5.1/series b/queue-5.1/series index c119dd336c8..bb3620f15e0 100644 --- a/queue-5.1/series +++ b/queue-5.1/series @@ -54,3 +54,9 @@ mm-add-filemap_fdatawait_range_keep_errors.patch jbd2-introduce-jbd2_inode-dirty-range-scoping.patch ext4-use-jbd2_inode-dirty-range-scoping.patch ext4-allow-directory-holes.patch +kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch +kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch +revert-kvm-x86-use-task-structs-fpu-field-for-user.patch +sd_zbc-fix-report-zones-buffer-allocation.patch +block-limit-zone-array-allocation-size.patch +mm-vmscan-scan-anonymous-pages-on-file-refaults.patch