4.19-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)
diff --git a/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch b/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch

new file mode 100644 (file)

index 0000000..7d36294
--- /dev/null
+++ b/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
@@ -0,0 +1,37 @@
+From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001
+From: Jan Kiszka <jan.kiszka@siemens.com>
+Date: Sun, 21 Jul 2019 13:52:18 +0200
+Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested
+
+From: Jan Kiszka <jan.kiszka@siemens.com>
+
+commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream.
+
+Letting this pend may cause nested_get_vmcs12_pages to run against an
+invalid state, corrupting the effective vmcs of L1.
+
+This was triggerable in QEMU after a guest corruption in L2, followed by
+a L1 reset.
+
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Reviewed-by: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -8490,6 +8490,8 @@ static void free_nested(struct vcpu_vmx
+       if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+               return;
+ 
++      kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu);
++
+       hrtimer_cancel(&vmx->nested.preemption_timer);
+       vmx->nested.vmxon = false;
+       vmx->nested.smm.vmxon = false;
diff --git a/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch b/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch

new file mode 100644 (file)

index 0000000..82cce6e
--- /dev/null
+++ b/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
@@ -0,0 +1,66 @@
+From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 19 Jul 2019 18:41:10 +0200
+Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream.
+
+If a KVM guest is reset while running a nested guest, free_nested will
+disable the shadow VMCS execution control in the vmcs01.  However,
+on the next KVM_RUN vmx_vcpu_run would nevertheless try to sync
+the VMCS12 to the shadow VMCS which has since been freed.
+
+This causes a vmptrld of a NULL pointer on my machine, but Jan reports
+the host to hang altogether.  Let's see how much this trivial patch fixes.
+
+Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
+Cc: Liran Alon <liran.alon@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ arch/x86/kvm/vmx.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -8457,6 +8457,7 @@ static void vmx_disable_shadow_vmcs(stru
+ {
+       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
++      vmx->nested.sync_shadow_vmcs = false;
+ }
+ 
+ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+@@ -8468,7 +8469,6 @@ static inline void nested_release_vmcs12
+               /* copy to memory all shadowed fields in case
+                  they were modified */
+               copy_shadow_to_vmcs12(vmx);
+-              vmx->nested.sync_shadow_vmcs = false;
+               vmx_disable_shadow_vmcs(vmx);
+       }
+       vmx->nested.posted_intr_nv = -1;
+@@ -8668,6 +8668,9 @@ static void copy_shadow_to_vmcs12(struct
+       u64 field_value;
+       struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++      if (WARN_ON(!shadow_vmcs))
++              return;
++
+       preempt_disable();
+ 
+       vmcs_load(shadow_vmcs);
+@@ -8706,6 +8709,9 @@ static void copy_vmcs12_to_shadow(struct
+       u64 field_value = 0;
+       struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++      if (WARN_ON(!shadow_vmcs))
++              return;
++
+       vmcs_load(shadow_vmcs);
+ 
+       for (q = 0; q < ARRAY_SIZE(fields); q++) {
diff --git a/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch b/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch

new file mode 100644 (file)

index 0000000..2e3822d
--- /dev/null
+++ b/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
@@ -0,0 +1,241 @@
+From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+Date: Thu, 11 Jul 2019 20:52:04 -0700
+Subject: mm: vmscan: scan anonymous pages on file refaults
+
+From: Kuo-Hsin Yang <vovoy@chromium.org>
+
+commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream.
+
+When file refaults are detected and there are many inactive file pages,
+the system never reclaims anonymous pages; file pages are dropped
+aggressively while there are still a lot of cold anonymous pages, and the
+system thrashes.  This issue impacts the performance of applications
+with large executables, e.g. chrome.
+
+With this patch, when file refault is detected, inactive_list_is_low()
+always returns true for file pages in get_scan_count() to enable
+scanning anonymous pages.
+
+The problem can be reproduced by the following test program.
+
+---8<---
+void fallocate_file(const char *filename, off_t size)
+{
+       struct stat st;
+       int fd;
+
+       if (!stat(filename, &st) && st.st_size >= size)
+               return;
+
+       fd = open(filename, O_WRONLY | O_CREAT, 0600);
+       if (fd < 0) {
+               perror("create file");
+               exit(1);
+       }
+       if (posix_fallocate(fd, 0, size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       close(fd);
+}
+
+long *alloc_anon(long size)
+{
+       long *start = malloc(size);
+       memset(start, 1, size);
+       return start;
+}
+
+long access_file(const char *filename, long size, long rounds)
+{
+       int fd, i;
+       volatile char *start1, *end1, *start2;
+       const int page_size = getpagesize();
+       long sum = 0;
+
+       fd = open(filename, O_RDONLY);
+       if (fd == -1) {
+               perror("open");
+               exit(1);
+       }
+
+       /*
+        * Some applications, e.g. chrome, use a lot of executable file
+        * pages, map some of the pages with PROT_EXEC flag to simulate
+        * the behavior.
+        */
+       start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
+                     fd, 0);
+       if (start1 == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       end1 = start1 + size / 2;
+
+       start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
+       if (start2 == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       for (i = 0; i < rounds; ++i) {
+               struct timeval before, after;
+               volatile char *ptr1 = start1, *ptr2 = start2;
+               gettimeofday(&before, NULL);
+               for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
+                       sum += *ptr1 + *ptr2;
+               gettimeofday(&after, NULL);
+               printf("File access time, round %d: %f (sec)\n",
+                      i,
+                      (after.tv_sec - before.tv_sec) +
+                      (after.tv_usec - before.tv_usec) / 1000000.0);
+       }
+       return sum;
+}
+
+int main(int argc, char *argv[])
+{
+       const long MB = 1024 * 1024;
+       long anon_mb, file_mb, file_rounds;
+       const char filename[] = "large";
+       long *ret1;
+       long ret2;
+
+       if (argc != 4) {
+               printf("usage: thrash ANON_MB FILE_MB "
+                      "FILE_ROUNDS\n");
+               exit(0);
+       }
+       anon_mb = atoi(argv[1]);
+       file_mb = atoi(argv[2]);
+       file_rounds = atoi(argv[3]);
+
+       fallocate_file(filename, file_mb * MB);
+       printf("Allocate %ld MB anonymous pages\n",
+              anon_mb);
+       ret1 = alloc_anon(anon_mb * MB);
+       printf("Access %ld MB file pages\n",
+              file_mb);
+       ret2 = access_file(filename, file_mb * MB, file_rounds);
+       printf("Print result to prevent optimization: "
+              "%ld\n",
+              *ret1 + ret2);
+       return 0;
+}
+---8<---
+
+Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
+program fills RAM with 2048 MB of memory and accesses a 200 MB file 10
+times.  Without this patch, the file cache is dropped aggressively and
+every access to the file is from disk.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.489316 (sec)
+  File access time, round 1: 2.581277 (sec)
+  File access time, round 2: 2.487624 (sec)
+  File access time, round 3: 2.449100 (sec)
+  File access time, round 4: 2.420423 (sec)
+  File access time, round 5: 2.343411 (sec)
+  File access time, round 6: 2.454833 (sec)
+  File access time, round 7: 2.483398 (sec)
+  File access time, round 8: 2.572701 (sec)
+  File access time, round 9: 2.493014 (sec)
+
+With this patch, these file pages can be cached.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.475189 (sec)
+  File access time, round 1: 2.440777 (sec)
+  File access time, round 2: 2.411671 (sec)
+  File access time, round 3: 1.955267 (sec)
+  File access time, round 4: 0.029924 (sec)
+  File access time, round 5: 0.000808 (sec)
+  File access time, round 6: 0.000771 (sec)
+  File access time, round 7: 0.000746 (sec)
+  File access time, round 8: 0.000738 (sec)
+  File access time, round 9: 0.000747 (sec)
+
+I checked the swap-out stats during the test [1]: 19006 pages were swapped
+out with this patch and 3418 pages without it.  There is more swap-out,
+but I think it is within a reasonable range when the file-backed data set
+doesn't fit into memory.
+
+$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS
+PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644,
+inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
+pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages
+Access 2100 MB regular file pages File access time, round 0: 12.165,
+(sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896,
+inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec)
+active_anon: 1430576, inactive_anon: 477144, active_file: 25440,
+inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec)
+active_anon: 1427436, inactive_anon: 476060, active_file: 21112,
+inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec)
+active_anon: 1420444, inactive_anon: 473632, active_file: 23216,
+inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec)
+active_anon: 1413964, inactive_anon: 471460, active_file: 31728,
+inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366
+(+ 11309120)
+
+With 4 processes accessing non-overlapping parts of a large file, 30316
+pages were swapped out with this patch and 5152 pages without this
+patch.  The swapout number is small compared to pgpgin.
+
+[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
+
+Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
+Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
+Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Sonny Rao <sonnyrao@chromium.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: <stable@vger.kernel.org>   [4.12+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[backported to 4.14.y, 4.19.y, 5.1.y: adjust context]
+Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2190,7 +2190,7 @@ static void shrink_active_list(unsigned
+  *   10TB     320        32GB
+  */
+ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+-                               struct scan_control *sc, bool actual_reclaim)
++                               struct scan_control *sc, bool trace)
+ {
+       enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+@@ -2216,7 +2216,7 @@ static bool inactive_list_is_low(struct
+        * rid of the stale workingset quickly.
+        */
+       refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+-      if (file && actual_reclaim && lruvec->refaults != refaults) {
++      if (file && lruvec->refaults != refaults) {
+               inactive_ratio = 0;
+       } else {
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+@@ -2226,7 +2226,7 @@ static bool inactive_list_is_low(struct
+                       inactive_ratio = 1;
+       }
+ 
+-      if (actual_reclaim)
++      if (trace)
+               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
diff --git a/queue-4.19/series b/queue-4.19/series

index c85d0fddb26b76e3e6cb529cfdda9dfdbd66ebce..c6a71f690a5bbab72ac7e210b931220e6a24ffd5 100644 (file)
--- a/queue-4.19/series
+++ b/queue-4.19/series
@@ -48,3 +48,6 @@ mm-add-filemap_fdatawait_range_keep_errors.patch
  jbd2-introduce-jbd2_inode-dirty-range-scoping.patch
  ext4-use-jbd2_inode-dirty-range-scoping.patch
  ext4-allow-directory-holes.patch
+kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
+kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
+mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 26 Jul 2019 13:58:00 +0000 (15:58 +0200)
queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch	[new file with mode: 0644]	patch \| blob
queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch	[new file with mode: 0644]	patch \| blob
queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch	[new file with mode: 0644]	patch \| blob
queue-4.19/series		patch \| blob \| blame \| history