From: Greg Kroah-Hartman
Date: Fri, 26 Jul 2019 13:58:00 +0000 (+0200)
Subject: 4.19-stable patches
X-Git-Tag: v5.2.4~7
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f26a830b68c4b1d8d322d3c58a752032d2deb487;p=thirdparty%2Fkernel%2Fstable-queue.git

4.19-stable patches

added patches:
	kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
	kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
	mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
---

diff --git a/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch b/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
new file mode 100644
index 00000000000..7d362943eea
--- /dev/null
+++ b/queue-4.19/kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
@@ -0,0 +1,37 @@
+From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001
+From: Jan Kiszka
+Date: Sun, 21 Jul 2019 13:52:18 +0200
+Subject: KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested
+
+From: Jan Kiszka
+
+commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream.
+
+Letting this request pend may cause nested_get_vmcs12_pages to run
+against an invalid state, corrupting the effective vmcs of L1.
+
+This was triggerable in QEMU after a guest corruption in L2, followed by
+an L1 reset.
+
+Signed-off-by: Jan Kiszka
+Reviewed-by: Liran Alon
+Cc: stable@vger.kernel.org
+Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM")
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kvm/vmx.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -8490,6 +8490,8 @@ static void free_nested(struct vcpu_vmx
+ 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ 		return;
+ 
++	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu);
++
+ 	hrtimer_cancel(&vmx->nested.preemption_timer);
+ 	vmx->nested.vmxon = false;
+ 	vmx->nested.smm.vmxon = false;
diff --git a/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch b/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
new file mode 100644
index 00000000000..82cce6e5df5
--- /dev/null
+++ b/queue-4.19/kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
@@ -0,0 +1,66 @@
+From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini
+Date: Fri, 19 Jul 2019 18:41:10 +0200
+Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset
+
+From: Paolo Bonzini
+
+commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream.
+
+If a KVM guest is reset while running a nested guest, free_nested will
+disable the shadow VMCS execution control in the vmcs01. However,
+on the next KVM_RUN vmx_vcpu_run would nevertheless try to sync
+the VMCS12 to the shadow VMCS which has since been freed.
+
+This causes a vmptrld of a NULL pointer on my machine, but Jan reports
+that the host hangs altogether. Let's see how much this trivial patch fixes.
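+
+Roughly, the failing sequence is (a sketch of the call path, using only
+function names from the message above and the diff below; not an exact
+trace):
+
+	free_nested()                 /* shadow VMCS freed on guest reset,
+	                                 sync_shadow_vmcs left set */
+	...
+	vmx_vcpu_run()                /* next KVM_RUN */
+	    copy_vmcs12_to_shadow()
+	        vmcs_load(shadow_vmcs)    /* NULL: vmptrld of a NULL pointer */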
+
+Reported-by: Jan Kiszka
+Cc: Liran Alon
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ arch/x86/kvm/vmx.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -8457,6 +8457,7 @@ static void vmx_disable_shadow_vmcs(stru
+ {
+ 	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
++	vmx->nested.sync_shadow_vmcs = false;
+ }
+ 
+ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+@@ -8468,7 +8469,6 @@ static inline void nested_release_vmcs12
+ 		/* copy to memory all shadowed fields in case
+ 		   they were modified */
+ 		copy_shadow_to_vmcs12(vmx);
+-		vmx->nested.sync_shadow_vmcs = false;
+ 		vmx_disable_shadow_vmcs(vmx);
+ 	}
+ 	vmx->nested.posted_intr_nv = -1;
+@@ -8668,6 +8668,9 @@ static void copy_shadow_to_vmcs12(struct
+ 	u64 field_value;
+ 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++	if (WARN_ON(!shadow_vmcs))
++		return;
++
+ 	preempt_disable();
+ 
+ 	vmcs_load(shadow_vmcs);
+@@ -8706,6 +8709,9 @@ static void copy_vmcs12_to_shadow(struct
+ 	u64 field_value = 0;
+ 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ 
++	if (WARN_ON(!shadow_vmcs))
++		return;
++
+ 	vmcs_load(shadow_vmcs);
+ 
+ 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
diff --git a/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch b/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
new file mode 100644
index 00000000000..2e3822dcec5
--- /dev/null
+++ b/queue-4.19/mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
@@ -0,0 +1,241 @@
+From 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa Mon Sep 17 00:00:00 2001
+From: Kuo-Hsin Yang
+Date: Thu, 11 Jul 2019 20:52:04 -0700
+Subject: mm: vmscan: scan anonymous pages on file refaults
+
+From: Kuo-Hsin Yang
+
+commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream.
+
+When file refaults are detected and there are many inactive file pages,
+the system never reclaims anonymous pages; file pages are dropped
+aggressively while there are still a lot of cold anonymous pages, and
+the system thrashes. This issue impacts the performance of applications
+with large executables, e.g. chrome.
+
+With this patch, when a file refault is detected, inactive_list_is_low()
+always returns true for file pages in get_scan_count() to enable
+scanning of anonymous pages.
+
+The problem can be reproduced by the following test program.
+
+---8<---
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+void fallocate_file(const char *filename, off_t size)
+{
+	struct stat st;
+	int fd;
+
+	if (!stat(filename, &st) && st.st_size >= size)
+		return;
+
+	fd = open(filename, O_WRONLY | O_CREAT, 0600);
+	if (fd < 0) {
+		perror("create file");
+		exit(1);
+	}
+	if (posix_fallocate(fd, 0, size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	close(fd);
+}
+
+long *alloc_anon(long size)
+{
+	long *start = malloc(size);
+	memset(start, 1, size);
+	return start;
+}
+
+long access_file(const char *filename, long size, long rounds)
+{
+	int fd, i;
+	volatile char *start1, *end1, *start2;
+	const int page_size = getpagesize();
+	long sum = 0;
+
+	fd = open(filename, O_RDONLY);
+	if (fd == -1) {
+		perror("open");
+		exit(1);
+	}
+
+	/*
+	 * Some applications, e.g. chrome, use a lot of executable file
+	 * pages; map some of the pages with the PROT_EXEC flag to
+	 * simulate the behavior.
+	 */
+	start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
+		      fd, 0);
+	if (start1 == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	end1 = start1 + size / 2;
+
+	start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
+	if (start2 == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	for (i = 0; i < rounds; ++i) {
+		struct timeval before, after;
+		volatile char *ptr1 = start1, *ptr2 = start2;
+		gettimeofday(&before, NULL);
+		for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
+			sum += *ptr1 + *ptr2;
+		gettimeofday(&after, NULL);
+		printf("File access time, round %d: %f (sec)\n", i,
+		       (after.tv_sec - before.tv_sec) +
+		       (after.tv_usec - before.tv_usec) / 1000000.0);
+	}
+	return sum;
+}
+
+int main(int argc, char *argv[])
+{
+	const long MB = 1024 * 1024;
+	long anon_mb, file_mb, file_rounds;
+	const char filename[] = "large";
+	long *ret1;
+	long ret2;
+
+	if (argc != 4) {
+		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
+		exit(0);
+	}
+	anon_mb = atoi(argv[1]);
+	file_mb = atoi(argv[2]);
+	file_rounds = atoi(argv[3]);
+
+	fallocate_file(filename, file_mb * MB);
+	printf("Allocate %ld MB anonymous pages\n", anon_mb);
+	ret1 = alloc_anon(anon_mb * MB);
+	printf("Access %ld MB file pages\n", file_mb);
+	ret2 = access_file(filename, file_mb * MB, file_rounds);
+	printf("Print result to prevent optimization: %ld\n",
+	       *ret1 + ret2);
+	return 0;
+}
+---8<---
+
+Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
+program fills RAM with 2048 MB of anonymous pages and accesses a 200 MB
+file 10 times. Without this patch, the file cache is dropped
+aggressively and every access to the file is served from disk.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.489316 (sec)
+  File access time, round 1: 2.581277 (sec)
+  File access time, round 2: 2.487624 (sec)
+  File access time, round 3: 2.449100 (sec)
+  File access time, round 4: 2.420423 (sec)
+  File access time, round 5: 2.343411 (sec)
+  File access time, round 6: 2.454833 (sec)
+  File access time, round 7: 2.483398 (sec)
+  File access time, round 8: 2.572701 (sec)
+  File access time, round 9: 2.493014 (sec)
+
+With this patch, these file pages can be cached.
+
+  $ ./thrash 2048 200 10
+  Allocate 2048 MB anonymous pages
+  Access 200 MB file pages
+  File access time, round 0: 2.475189 (sec)
+  File access time, round 1: 2.440777 (sec)
+  File access time, round 2: 2.411671 (sec)
+  File access time, round 3: 1.955267 (sec)
+  File access time, round 4: 0.029924 (sec)
+  File access time, round 5: 0.000808 (sec)
+  File access time, round 6: 0.000771 (sec)
+  File access time, round 7: 0.000746 (sec)
+  File access time, round 8: 0.000738 (sec)
+  File access time, round 9: 0.000747 (sec)
+
+Checking the swap-out stats during the test [1]: 19006 pages were
+swapped out with this patch, 3418 pages without it. There is more
+swap-out, but it is within a reasonable range when the file-backed data
+set doesn't fit into memory.
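+
+The counters above come from /proc/vmstat. A minimal sketch of how such
+a delta check can be done (illustrative only; read_vmstat is a
+hypothetical helper, not part of the test in [1], and needs <stdio.h>
+and <string.h>):
+
+	/* Return the value of "key" from /proc/vmstat, or -1 on error. */
+	static long read_vmstat(const char *key)
+	{
+		char name[64];
+		long val;
+		FILE *f = fopen("/proc/vmstat", "r");
+
+		if (!f)
+			return -1;
+		while (fscanf(f, "%63s %ld", name, &val) == 2) {
+			if (!strcmp(name, key)) {
+				fclose(f);
+				return val;
+			}
+		}
+		fclose(f);
+		return -1;
+	}
+
+Sampling pswpout before and after a run and printing the difference
+gives the pages-swapped-out numbers quoted here.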
+
+$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES
+Allocate 2000 MB anonymous pages
+active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
+pswpout: 7972443, pgpgin: 478615246
+Access 100 MB executable file pages
+Access 2100 MB regular file pages
+File access time, round 0: 12.165 (sec)
+active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB)
+File access time, round 1: 11.493 (sec)
+active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB)
+File access time, round 2: 11.455 (sec)
+active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB)
+File access time, round 3: 11.454 (sec)
+active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB)
+File access time, round 4: 11.479 (sec)
+active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB)
+pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120)
+
+With 4 processes accessing non-overlapping parts of a large file, 30316
+pages were swapped out with this patch, 5152 pages without it. The
+swap-out number is small compared with pgpgin.
+
+[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
+
+Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
+Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
+Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
+Signed-off-by: Kuo-Hsin Yang
+Acked-by: Johannes Weiner
+Cc: Michal Hocko
+Cc: Sonny Rao
+Cc: Mel Gorman
+Cc: Rik van Riel
+Cc: Vladimir Davydov
+Cc: Minchan Kim
+Cc: [4.12+]
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+[backported to 4.14.y, 4.19.y, 5.1.y: adjust context]
+Signed-off-by: Kuo-Hsin Yang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmscan.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2190,7 +2190,7 @@ static void shrink_active_list(unsigned
+  *    10TB     320        32GB
+  */
+ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+-				 struct scan_control *sc, bool actual_reclaim)
++				 struct scan_control *sc, bool trace)
+ {
+ 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+@@ -2216,7 +2216,7 @@ static bool inactive_list_is_low(struct
+ 	 * rid of the stale workingset quickly.
+ 	 */
+ 	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+-	if (file && actual_reclaim && lruvec->refaults != refaults) {
++	if (file && lruvec->refaults != refaults) {
+ 		inactive_ratio = 0;
+ 	} else {
+ 		gb = (inactive + active) >> (30 - PAGE_SHIFT);
+@@ -2226,7 +2226,7 @@ static bool inactive_list_is_low(struct
+ 		inactive_ratio = 1;
+ 	}
+ 
+-	if (actual_reclaim)
++	if (trace)
+ 		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ 			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ 			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
diff --git a/queue-4.19/series b/queue-4.19/series
index c85d0fddb26..c6a71f690a5 100644
--- a/queue-4.19/series
+++ b/queue-4.19/series
@@ -48,3 +48,6 @@ mm-add-filemap_fdatawait_range_keep_errors.patch
 jbd2-introduce-jbd2_inode-dirty-range-scoping.patch
 ext4-use-jbd2_inode-dirty-range-scoping.patch
 ext4-allow-directory-holes.patch
+kvm-nvmx-do-not-use-dangling-shadow-vmcs-after-guest-reset.patch
+kvm-nvmx-clear-pending-kvm_req_get_vmcs12_pages-when-leaving-nested.patch
+mm-vmscan-scan-anonymous-pages-on-file-refaults.patch