--- /dev/null
+From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:18 +0900
+Subject: block: Limit zone array allocation size
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit 26202928fafad8bda8b478edb7e62c885be623d7 upstream.
+
+Limit the size of the struct blk_zone array used in
+blk_revalidate_disk_zones() to avoid memory allocation failures leading
+to disk revalidation failure. Also further reduce the likelihood of
+such failures by using kvcalloc() (that is, a vmalloc()-backed
+allocation) instead of allocating contiguous pages with alloc_pages().
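+
+For scale, assuming the 64-byte struct blk_zone and a hypothetical drive
+exposing ~55000 zones, the previous alloc_pages() path would first attempt
+an order-10 (4 MiB) contiguous allocation, whereas capping the array at
+BLK_ZONED_REPORT_MAX_ZONES (8192) bounds each kvcalloc() allocation to
+512 KiB, which can also be satisfied from vmalloc space. A userspace
+sketch of that arithmetic (illustrative values only):
+
+	#include <stdio.h>
+
+	#define BLK_ZONE_DESC_SIZE	64U	/* assumed sizeof(struct blk_zone) */
+	#define REPORT_MAX_ZONES	8192U	/* mirrors BLK_ZONED_REPORT_MAX_ZONES */
+
+	int main(void)
+	{
+		unsigned int disk_zones = 55000;	/* hypothetical drive */
+		unsigned int nrz = disk_zones < REPORT_MAX_ZONES ?
+				   disk_zones : REPORT_MAX_ZONES;
+
+		printf("uncapped zone array: %u KiB\n",
+		       disk_zones * BLK_ZONE_DESC_SIZE / 1024);
+		printf("capped zone array:   %u KiB per report\n",
+		       nrz * BLK_ZONE_DESC_SIZE / 1024);
+		return 0;
+	}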
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-zoned.c | 46 ++++++++++++++++++++++++++++++----------------
+ include/linux/blkdev.h | 5 +++++
+ 2 files changed, 35 insertions(+), 16 deletions(-)
+
+--- a/block/blk-zoned.c
++++ b/block/blk-zoned.c
+@@ -13,6 +13,9 @@
+ #include <linux/rbtree.h>
+ #include <linux/blkdev.h>
+ #include <linux/blk-mq.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+
+ #include "blk.h"
+
+@@ -372,22 +375,25 @@ static inline unsigned long *blk_alloc_z
+ * Allocate an array of struct blk_zone to get nr_zones zone information.
+ * The allocated array may be smaller than nr_zones.
+ */
+-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
++static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
+ {
+- size_t size = *nr_zones * sizeof(struct blk_zone);
+- struct page *page;
+- int order;
+-
+- for (order = get_order(size); order >= 0; order--) {
+- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
+- if (page) {
+- *nr_zones = min_t(unsigned int, *nr_zones,
+- (PAGE_SIZE << order) / sizeof(struct blk_zone));
+- return page_address(page);
+- }
++ struct blk_zone *zones;
++ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
++
++ /*
++ * GFP_KERNEL here is meaningless as the caller task context has
++ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
++ * with memalloc_noio_save().
++ */
++ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
++ if (!zones) {
++ *nr_zones = 0;
++ return NULL;
+ }
+
+- return NULL;
++ *nr_zones = nrz;
++
++ return zones;
+ }
+
+ void blk_queue_free_zone_bitmaps(struct request_queue *q)
+@@ -414,6 +420,7 @@ int blk_revalidate_disk_zones(struct gen
+ unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+ unsigned int i, rep_nr_zones = 0, z = 0, nrz;
+ struct blk_zone *zones = NULL;
++ unsigned int noio_flag;
+ sector_t sector = 0;
+ int ret = 0;
+
+@@ -426,6 +433,12 @@ int blk_revalidate_disk_zones(struct gen
+ return 0;
+ }
+
++ /*
++ * Ensure that all memory allocations in this context are done as
++ * if GFP_NOIO was specified.
++ */
++ noio_flag = memalloc_noio_save();
++
+ if (!blk_queue_is_zoned(q) || !nr_zones) {
+ nr_zones = 0;
+ goto update;
+@@ -442,7 +455,7 @@ int blk_revalidate_disk_zones(struct gen
+
+ /* Get zone information and initialize seq_zones_bitmap */
+ rep_nr_zones = nr_zones;
+- zones = blk_alloc_zones(q->node, &rep_nr_zones);
++ zones = blk_alloc_zones(&rep_nr_zones);
+ if (!zones)
+ goto out;
+
+@@ -479,8 +492,9 @@ update:
+ blk_mq_unfreeze_queue(q);
+
+ out:
+- free_pages((unsigned long)zones,
+- get_order(rep_nr_zones * sizeof(struct blk_zone)));
++ memalloc_noio_restore(noio_flag);
++
++ kvfree(zones);
+ kfree(seq_zones_wlock);
+ kfree(seq_zones_bitmap);
+
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -344,6 +344,11 @@ struct queue_limits {
+
+ #ifdef CONFIG_BLK_DEV_ZONED
+
++/*
++ * Maximum number of zones to report with a single report zones command.
++ */
++#define BLK_ZONED_REPORT_MAX_ZONES 8192U
++
+ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
+ extern int blkdev_report_zones(struct block_device *bdev,
+ sector_t sector, struct blk_zone *zones,
--- /dev/null
+From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001
+From: Jan Kiszka <jan.kiszka@siemens.com>
+Date: Sun, 21 Jul 2019 13:52:18 +0200
+Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested
+
+From: Jan Kiszka <jan.kiszka@siemens.com>
+
+commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream.
+
+Letting this pend may cause nested_get_vmcs12_pages to run against an
+invalid state, corrupting the effective vmcs of L1.
+
+This was triggerable in QEMU after a guest corruption in L2, followed by
+an L1 reset.
+
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Reviewed-by: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -212,6 +212,8 @@ static void free_nested(struct kvm_vcpu
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
++ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
++
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
--- /dev/null
+From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 19 Jul 2019 18:41:10 +0200
+Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream.
+
+If a KVM guest is reset while running a nested guest, free_nested will
+disable the shadow VMCS execution control in the vmcs01. However,
+on the next KVM_RUN, vmx_vcpu_run would nevertheless try to sync
+the VMCS12 to the shadow VMCS, which has since been freed.
+
+This causes a vmptrld of a NULL pointer on my machine, but Jan reports
+that the host hangs altogether. Let's see how much this trivial patch fixes.
+
+Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
+Cc: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/nested.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -184,6 +184,7 @@ static void vmx_disable_shadow_vmcs(stru
+ {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
++ vmx->nested.need_vmcs12_sync = false;
+ }
+
+ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+@@ -1328,6 +1329,9 @@ static void copy_shadow_to_vmcs12(struct
+ u64 field_value;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
++ if (WARN_ON(!shadow_vmcs))
++ return;
++
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+@@ -1366,6 +1370,9 @@ static void copy_vmcs12_to_shadow(struct
+ u64 field_value = 0;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
++ if (WARN_ON(!shadow_vmcs))
++ return;
++
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+@@ -4336,7 +4343,6 @@ static inline void nested_release_vmcs12
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+- vmx->nested.need_vmcs12_sync = false;
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
--- /dev/null
+From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+Date: Thu, 11 Jul 2019 20:52:04 -0700
+Subject: mm: vmscan: scan anonymous pages on file refaults
+
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+
+commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream.
+
+When file refaults are detected and there are many inactive file pages,
+the system never reclaims anonymous pages; the file pages are dropped
+aggressively even when there are still a lot of cold anonymous pages,
+and the system thrashes. This issue impacts the performance of
+applications with large executables, e.g. chrome.
+
+With this patch, when file refault is detected, inactive_list_is_low()
+always returns true for file pages in get_scan_count() to enable
+scanning anonymous pages.
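+
+The effect on the file-LRU check of inactive_list_is_low() can be modeled
+with a small userspace sketch (hypothetical numbers; the in-kernel
+heuristic also handles memcg accounting and tracing, omitted here):
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	static unsigned long int_sqrt_ul(unsigned long x)
+	{
+		unsigned long r = 0;
+
+		while ((r + 1) * (r + 1) <= x)
+			r++;
+		return r;
+	}
+
+	static bool file_inactive_is_low(unsigned long inactive, unsigned long active,
+					 unsigned long refaults_now,
+					 unsigned long refaults_snapshot)
+	{
+		unsigned long gb, ratio;
+
+		/* Refaults since the last snapshot: treat the inactive file
+		 * list as low so get_scan_count() also scans anonymous pages. */
+		if (refaults_now != refaults_snapshot)
+			return true;
+
+		/* Otherwise keep the usual size-based heuristic. */
+		gb = (inactive + active) >> (30 - 12);	/* 4 KiB pages -> GiB */
+		ratio = gb ? int_sqrt_ul(10 * gb) : 1;
+		return inactive * ratio < active;
+	}
+
+	int main(void)
+	{
+		/* ~1 GiB of file pages, mostly inactive */
+		printf("refault case: %d\n",
+		       file_inactive_is_low(200000, 62144, 1200, 1000));
+		printf("no refault:   %d\n",
+		       file_inactive_is_low(200000, 62144, 1000, 1000));
+		return 0;
+	}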
+
+The problem can be reproduced by the following test program.
+
+---8<---
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+void fallocate_file(const char *filename, off_t size)
+{
+ struct stat st;
+ int fd;
+
+ if (!stat(filename, &st) && st.st_size >= size)
+ return;
+
+ fd = open(filename, O_WRONLY | O_CREAT, 0600);
+ if (fd < 0) {
+ perror("create file");
+ exit(1);
+ }
+ if (posix_fallocate(fd, 0, size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ close(fd);
+}
+
+long *alloc_anon(long size)
+{
+ long *start = malloc(size);
+ memset(start, 1, size);
+ return start;
+}
+
+long access_file(const char *filename, long size, long rounds)
+{
+ int fd, i;
+ volatile char *start1, *end1, *start2;
+ const int page_size = getpagesize();
+ long sum = 0;
+
+ fd = open(filename, O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+
+ /*
+ * Some applications, e.g. chrome, use a lot of executable file
+ * pages, map some of the pages with PROT_EXEC flag to simulate
+ * the behavior.
+ */
+ start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
+ fd, 0);
+ if (start1 == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ end1 = start1 + size / 2;
+
+ start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
+ if (start2 == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ for (i = 0; i < rounds; ++i) {
+ struct timeval before, after;
+ volatile char *ptr1 = start1, *ptr2 = start2;
+ gettimeofday(&before, NULL);
+ for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
+ sum += *ptr1 + *ptr2;
+ gettimeofday(&after, NULL);
+		printf("File access time, round %d: %f (sec)\n", i,
+ (after.tv_sec - before.tv_sec) +
+ (after.tv_usec - before.tv_usec) / 1000000.0);
+ }
+ return sum;
+}
+
+int main(int argc, char *argv[])
+{
+ const long MB = 1024 * 1024;
+ long anon_mb, file_mb, file_rounds;
+ const char filename[] = "large";
+ long *ret1;
+ long ret2;
+
+ if (argc != 4) {
+		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
+ exit(0);
+ }
+ anon_mb = atoi(argv[1]);
+ file_mb = atoi(argv[2]);
+ file_rounds = atoi(argv[3]);
+
+ fallocate_file(filename, file_mb * MB);
+	printf("Allocate %ld MB anonymous pages\n", anon_mb);
+ ret1 = alloc_anon(anon_mb * MB);
+	printf("Access %ld MB file pages\n", file_mb);
+ ret2 = access_file(filename, file_mb * MB, file_rounds);
+	printf("Print result to prevent optimization: %ld\n",
+ *ret1 + ret2);
+ return 0;
+}
+---8<---
+
+Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
+program fills RAM with 2048 MB of anonymous memory and accesses a 200 MB
+file 10 times. Without this patch, the file cache is dropped aggressively
+and every access to the file is served from disk.
+
+ $ ./thrash 2048 200 10
+ Allocate 2048 MB anonymous pages
+ Access 200 MB file pages
+ File access time, round 0: 2.489316 (sec)
+ File access time, round 1: 2.581277 (sec)
+ File access time, round 2: 2.487624 (sec)
+ File access time, round 3: 2.449100 (sec)
+ File access time, round 4: 2.420423 (sec)
+ File access time, round 5: 2.343411 (sec)
+ File access time, round 6: 2.454833 (sec)
+ File access time, round 7: 2.483398 (sec)
+ File access time, round 8: 2.572701 (sec)
+ File access time, round 9: 2.493014 (sec)
+
+With this patch, these file pages can be cached.
+
+ $ ./thrash 2048 200 10
+ Allocate 2048 MB anonymous pages
+ Access 200 MB file pages
+ File access time, round 0: 2.475189 (sec)
+ File access time, round 1: 2.440777 (sec)
+ File access time, round 2: 2.411671 (sec)
+ File access time, round 3: 1.955267 (sec)
+ File access time, round 4: 0.029924 (sec)
+ File access time, round 5: 0.000808 (sec)
+ File access time, round 6: 0.000771 (sec)
+ File access time, round 7: 0.000746 (sec)
+ File access time, round 8: 0.000738 (sec)
+ File access time, round 9: 0.000747 (sec)
+
+Checking the swap-out stats during the test [1]: 19006 pages were swapped
+out with this patch, 3418 pages without it. There is more swap-out, but I
+think it is within a reasonable range when the file-backed data set does
+not fit into memory.
+
+ $ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES
+ Allocate 2000 MB anonymous pages
+ active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
+ pswpout: 7972443, pgpgin: 478615246
+ Access 100 MB executable file pages
+ Access 2100 MB regular file pages
+ File access time, round 0: 12.165, (sec)
+ active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB)
+ File access time, round 1: 11.493, (sec)
+ active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB)
+ File access time, round 2: 11.455, (sec)
+ active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB)
+ File access time, round 3: 11.454, (sec)
+ active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB)
+ File access time, round 4: 11.479, (sec)
+ active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB)
+ pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120)
+
+With 4 processes accessing non-overlapping parts of a large file, 30316
+pages were swapped out with this patch, 5152 pages without it. The
+swap-out number is small compared to pgpgin.
+
+[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
+
+Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
+Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
+Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sonny Rao <sonnyrao@chromium.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: <stable@vger.kernel.org> [4.12+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[backported to 4.14.y, 4.19.y, 5.1.y: adjust context]
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2176,7 +2176,7 @@ static void shrink_active_list(unsigned
+ * 10TB 320 32GB
+ */
+ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+- struct scan_control *sc, bool actual_reclaim)
++ struct scan_control *sc, bool trace)
+ {
+ enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+@@ -2202,7 +2202,7 @@ static bool inactive_list_is_low(struct
+ * rid of the stale workingset quickly.
+ */
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+- if (file && actual_reclaim && lruvec->refaults != refaults) {
++ if (file && lruvec->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+@@ -2212,7 +2212,7 @@ static bool inactive_list_is_low(struct
+ inactive_ratio = 1;
+ }
+
+- if (actual_reclaim)
++ if (trace)
+ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
--- /dev/null
+From ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Mon, 22 Jul 2019 13:31:27 +0200
+Subject: Revert "kvm: x86: Use task structs fpu field for user"
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe upstream.
+
+This reverts commit 240c35a3783ab9b3a0afaba0dde7291295680a6b
+("kvm: x86: Use task structs fpu field for user", 2018-11-06).
+The commit is broken and causes QEMU's FPU state to be destroyed
+when KVM_RUN is preempted.
+
+Fixes: 240c35a3783a ("kvm: x86: Use task structs fpu field for user")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h | 7 ++++---
+ arch/x86/kvm/x86.c | 4 ++--
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -609,15 +609,16 @@ struct kvm_vcpu_arch {
+
+ /*
+ * QEMU userspace and the guest each have their own FPU state.
+- * In vcpu_run, we switch between the user, maintained in the
+- * task_struct struct, and guest FPU contexts. While running a VCPU,
+- * the VCPU thread will have the guest FPU context.
++ * In vcpu_run, we switch between the user and guest FPU contexts.
++ * While running a VCPU, the VCPU thread will have the guest FPU
++ * context.
+ *
+ * Note that while the PKRU state lives inside the fpu registers,
+ * it is switched out separately at VMENTER and VMEXIT time. The
+ * "guest_fpu" state here contains the guest FPU context, with the
+ * host PRKU bits.
+ */
++ struct fpu user_fpu;
+ struct fpu *guest_fpu;
+
+ u64 xcr0;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8172,7 +8172,7 @@ static int complete_emulated_mmio(struct
+ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+ {
+ preempt_disable();
+- copy_fpregs_to_fpstate(¤t->thread.fpu);
++ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ ~XFEATURE_MASK_PKRU);
+@@ -8185,7 +8185,7 @@ static void kvm_put_guest_fpu(struct kvm
+ {
+ preempt_disable();
+ copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+- copy_kernel_to_fpregs(¤t->thread.fpu.state);
++ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ preempt_enable();
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
--- /dev/null
+From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Mon, 1 Jul 2019 14:09:17 +0900
+Subject: sd_zbc: Fix report zones buffer allocation
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit b091ac616846a1da75b1f2566b41255ce7f0e0a6 upstream.
+
+During disk scan and revalidation done with sd_revalidate(), the zones
+of a zoned disk are checked using the helper function
+blk_revalidate_disk_zones() if a configuration change is detected
+(change in the number of zones or zone size). The function
+blk_revalidate_disk_zones() issues report_zones calls that are very
+large, that is, requesting zone information for all zones of the disk
+with a single command. The report zones command buffer needed for such
+a large request is generally smaller than the disk max_hw_sectors and
+KMALLOC_MAX_SIZE (4MB), so its allocation succeeds at boot time (no
+memory fragmentation), but it often fails at run time (e.g. after a
+hot-plug event). This causes the disk revalidation to fail and the disk
+capacity to be changed to 0.
+
+This problem can be avoided by using vmalloc() instead of kmalloc() for
+the buffer allocation. To limit the amount of memory to be allocated,
+this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
+maximum number of zones to report with a single report zones command.
+This limit may be lowered further to satisfy the disk max_hw_sectors
+limit. Finally, to ensure that the vmalloc-ed buffer can always be
+mapped in a request, the buffer size is further limited to at most
+queue_max_segments() pages, allowing successful mapping of the buffer
+even in the worst case scenario where none of the buffer pages are
+contiguous.
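+
+As a concrete illustration, with hypothetical queue limits of
+max_hw_sectors = 1024 (512 KiB) and max_segments = 64 (4 KiB pages), a
+request for the full 8192 zones is clamped to 4095 zones per command. A
+userspace sketch of the same size calculation (illustrative values only):
+
+	#include <stdio.h>
+
+	#define SECTOR_SHIFT	9
+	#define PAGE_SHIFT	12	/* assuming 4 KiB pages */
+
+	static size_t roundup_512(size_t v) { return (v + 511) & ~(size_t)511; }
+	static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }
+
+	int main(void)
+	{
+		size_t nr_zones = 8192;		/* SD_ZBC_REPORT_MAX_ZONES */
+		size_t max_hw_sectors = 1024;	/* hypothetical queue limit */
+		size_t max_segments = 64;	/* hypothetical queue limit */
+
+		/* 64 B per zone descriptor plus a 64 B header, 512 B aligned */
+		size_t bufsize = roundup_512((nr_zones + 1) * 64);
+
+		bufsize = min_sz(bufsize, max_hw_sectors << SECTOR_SHIFT);
+		bufsize = min_sz(bufsize, max_segments << PAGE_SHIFT);
+
+		printf("report buffer: %zu KiB, %zu zones per command\n",
+		       bufsize / 1024, bufsize / 64 - 1);
+		return 0;
+	}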
+
+Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
+Fixes: e76239a3748c ("block: add a report_zones method")
+Cc: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/sd_zbc.c | 104 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 75 insertions(+), 29 deletions(-)
+
+--- a/drivers/scsi/sd_zbc.c
++++ b/drivers/scsi/sd_zbc.c
+@@ -23,6 +23,8 @@
+ */
+
+ #include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/sched/mm.h>
+
+ #include <asm/unaligned.h>
+
+@@ -64,7 +66,7 @@ static void sd_zbc_parse_report(struct s
+ /**
+ * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
+ * @sdkp: The target disk
+- * @buf: Buffer to use for the reply
++ * @buf: vmalloc-ed buffer to use for the reply
+ * @buflen: the buffer size
+ * @lba: Start LBA of the report
+ * @partial: Do partial report
+@@ -93,7 +95,6 @@ static int sd_zbc_do_report_zones(struct
+ put_unaligned_be32(buflen, &cmd[10]);
+ if (partial)
+ cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
+- memset(buf, 0, buflen);
+
+ result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
+ buf, buflen, &sshdr,
+@@ -117,6 +118,53 @@ static int sd_zbc_do_report_zones(struct
+ return 0;
+ }
+
++/*
++ * Maximum number of zones to get with one report zones command.
++ */
++#define SD_ZBC_REPORT_MAX_ZONES 8192U
++
++/**
++ * Allocate a buffer for report zones reply.
++ * @sdkp: The target disk
++ * @nr_zones: Maximum number of zones to report
++ * @buflen: Size of the buffer allocated
++ *
++ * Try to allocate a reply buffer for the number of requested zones.
++ * The size of the buffer allocated may be smaller than requested to
++ * satify the device constraint (max_hw_sectors, max_segments, etc).
++ *
++ * Return the address of the allocated buffer and update @buflen with
++ * the size of the allocated buffer.
++ */
++static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
++ unsigned int nr_zones, size_t *buflen)
++{
++ struct request_queue *q = sdkp->disk->queue;
++ size_t bufsize;
++ void *buf;
++
++ /*
++ * Report zone buffer size should be at most 64B times the number of
++ * zones requested plus the 64B reply header, but should be at least
++ * SECTOR_SIZE for ATA devices.
++ * Make sure that this size does not exceed the hardware capabilities.
++ * Furthermore, since the report zone command cannot be split, make
++ * sure that the allocated buffer can always be mapped by limiting the
++ * number of pages allocated to the HBA max segments limit.
++ */
++ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
++ bufsize = roundup((nr_zones + 1) * 64, 512);
++ bufsize = min_t(size_t, bufsize,
++ queue_max_hw_sectors(q) << SECTOR_SHIFT);
++ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
++
++ buf = vzalloc(bufsize);
++ if (buf)
++ *buflen = bufsize;
++
++ return buf;
++}
++
+ /**
+ * sd_zbc_report_zones - Disk report zones operation.
+ * @disk: The target disk
+@@ -132,30 +180,23 @@ int sd_zbc_report_zones(struct gendisk *
+ gfp_t gfp_mask)
+ {
+ struct scsi_disk *sdkp = scsi_disk(disk);
+- unsigned int i, buflen, nrz = *nr_zones;
++ unsigned int i, nrz = *nr_zones;
+ unsigned char *buf;
+- size_t offset = 0;
++ size_t buflen = 0, offset = 0;
+ int ret = 0;
+
+ if (!sd_is_zoned(sdkp))
+ /* Not a zoned device */
+ return -EOPNOTSUPP;
+
+- /*
+- * Get a reply buffer for the number of requested zones plus a header,
+- * without exceeding the device maximum command size. For ATA disks,
+- * buffers must be aligned to 512B.
+- */
+- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
+- roundup((nrz + 1) * 64, 512));
+- buf = kmalloc(buflen, gfp_mask);
++ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
+ if (!buf)
+ return -ENOMEM;
+
+ ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
+ sectors_to_logical(sdkp->device, sector), true);
+ if (ret)
+- goto out_free_buf;
++ goto out;
+
+ nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
+ for (i = 0; i < nrz; i++) {
+@@ -166,8 +207,8 @@ int sd_zbc_report_zones(struct gendisk *
+
+ *nr_zones = nrz;
+
+-out_free_buf:
+- kfree(buf);
++out:
++ kvfree(buf);
+
+ return ret;
+ }
+@@ -301,8 +342,6 @@ static int sd_zbc_check_zoned_characteri
+ return 0;
+ }
+
+-#define SD_ZBC_BUF_SIZE 131072U
+-
+ /**
+ * sd_zbc_check_zones - Check the device capacity and zone sizes
+ * @sdkp: Target disk
+@@ -318,22 +357,28 @@ static int sd_zbc_check_zoned_characteri
+ */
+ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
+ {
++ size_t bufsize, buflen;
++ unsigned int noio_flag;
+ u64 zone_blocks = 0;
+ sector_t max_lba, block = 0;
+ unsigned char *buf;
+ unsigned char *rec;
+- unsigned int buf_len;
+- unsigned int list_length;
+ int ret;
+ u8 same;
+
++ /* Do all memory allocations as if GFP_NOIO was specified */
++ noio_flag = memalloc_noio_save();
++
+ /* Get a buffer */
+- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+- if (!buf)
+- return -ENOMEM;
++ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
++ &bufsize);
++ if (!buf) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
+ /* Do a report zone to get max_lba and the same field */
+- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
+ if (ret)
+ goto out_free;
+
+@@ -369,12 +414,12 @@ static int sd_zbc_check_zones(struct scs
+ do {
+
+ /* Parse REPORT ZONES header */
+- list_length = get_unaligned_be32(&buf[0]) + 64;
++ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
++ bufsize);
+ rec = buf + 64;
+- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
+
+ /* Parse zone descriptors */
+- while (rec < buf + buf_len) {
++ while (rec < buf + buflen) {
+ u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
+
+ if (zone_blocks == 0) {
+@@ -390,8 +435,8 @@ static int sd_zbc_check_zones(struct scs
+ }
+
+ if (block < sdkp->capacity) {
+- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+- block, true);
++ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
++ true);
+ if (ret)
+ goto out_free;
+ }
+@@ -422,7 +467,8 @@ out:
+ }
+
+ out_free:
+- kfree(buf);
++ memalloc_noio_restore(noio_flag);
++ kvfree(buf);
+
+ return ret;
+ }
jbd2-introduce-jbd2_inode-dirty-range-scoping.patch
ext4-use-jbd2_inode-dirty-range-scoping.patch
ext4-allow-directory-holes.patch
+kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
+kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
+revert-kvm-x86-use-task-structs-fpu-field-for-user.patch
+sd_zbc-fix-report-zones-buffer-allocation.patch
+block-limit-zone-array-allocation-size.patch
+mm-vmscan-scan-anonymous-pages-on-file-refaults.patch