git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.11-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 15 Mar 2021 09:06:13 +0000 (10:06 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 15 Mar 2021 09:06:13 +0000 (10:06 +0100)
added patches:
binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch
kasan-fix-kasan_stack-dependency-for-hw_tags.patch
kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch
kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch
kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch
kvm-arm64-fix-exclusive-limit-for-ipa-size.patch
kvm-arm64-fix-range-alignment-when-walking-page-tables.patch
kvm-arm64-nvhe-save-the-spe-context-early.patch
kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch
kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch
kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch
linux-compiler-clang.h-define-have_builtin_bswap.patch
mm-highmem.c-fix-zero_user_segments-with-start-end.patch
mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch
mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch
mm-memcg-set-memcg-when-splitting-page.patch
mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch
powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch
powerpc-fix-inverted-set_full_regs-bitop.patch
powerpc-fix-missing-declaration-of-able_kernel_vsx.patch
sched-collate-affine_move_task-stoppers.patch
sched-fix-affine_move_task-self-concurrency.patch
sched-fix-migration_cpu_stop-requeueing.patch
sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch
sched-optimize-migration_cpu_stop.patch
sched-simplify-migration_cpu_stop.patch
sched-simplify-set_affinity_pending-refcounts.patch
x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch
x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch
x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch
x86-sev-es-introduce-ip_within_syscall_gap-helper.patch
x86-sev-es-use-__copy_from_user_inatomic.patch
x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch
zram-fix-broken-page-writeback.patch
zram-fix-return-value-on-writeback_store.patch

37 files changed:
queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch [new file with mode: 0644]
queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch [new file with mode: 0644]
queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch [new file with mode: 0644]
queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch [new file with mode: 0644]
queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch [new file with mode: 0644]
queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch [new file with mode: 0644]
queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch [new file with mode: 0644]
queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch [new file with mode: 0644]
queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch [new file with mode: 0644]
queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch [new file with mode: 0644]
queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch [new file with mode: 0644]
queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch [new file with mode: 0644]
queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch [new file with mode: 0644]
queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch [new file with mode: 0644]
queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch [new file with mode: 0644]
queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch [new file with mode: 0644]
queue-5.11/sched-collate-affine_move_task-stoppers.patch [new file with mode: 0644]
queue-5.11/sched-fix-affine_move_task-self-concurrency.patch [new file with mode: 0644]
queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch [new file with mode: 0644]
queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch [new file with mode: 0644]
queue-5.11/sched-optimize-migration_cpu_stop.patch [new file with mode: 0644]
queue-5.11/sched-simplify-migration_cpu_stop.patch [new file with mode: 0644]
queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch [new file with mode: 0644]
queue-5.11/series
queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch [new file with mode: 0644]
queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch [new file with mode: 0644]
queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch [new file with mode: 0644]
queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch [new file with mode: 0644]
queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch [new file with mode: 0644]
queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch [new file with mode: 0644]
queue-5.11/zram-fix-broken-page-writeback.patch [new file with mode: 0644]
queue-5.11/zram-fix-return-value-on-writeback_store.patch [new file with mode: 0644]

diff --git a/queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch b/queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
new file mode 100644 (file)
index 0000000..5b3017b
--- /dev/null
@@ -0,0 +1,118 @@
+From e7850f4d844e0acfac7e570af611d89deade3146 Mon Sep 17 00:00:00 2001
+From: Lior Ribak <liorribak@gmail.com>
+Date: Fri, 12 Mar 2021 21:07:41 -0800
+Subject: binfmt_misc: fix possible deadlock in bm_register_write
+
+From: Lior Ribak <liorribak@gmail.com>
+
+commit e7850f4d844e0acfac7e570af611d89deade3146 upstream.
+
+There is a deadlock in bm_register_write:
+
+First, at the beginning of the function, a lock is taken on the binfmt_misc
+root inode with inode_lock(d_inode(root)).
+
+Then, if the user used the MISC_FMT_OPEN_FILE flag, the function will call
+open_exec on the user-provided interpreter.
+
+open_exec will call a path lookup, and if the path lookup process includes
+the root of binfmt_misc, it will try to take a shared lock on its inode
+again, but it is already locked, and the code will get stuck in a deadlock.
+
+To reproduce the bug:
+$ echo ":iiiii:E::ii::/proc/sys/fs/binfmt_misc/bla:F" > /proc/sys/fs/binfmt_misc/register
+
+backtrace of where the lock occurs (#5):
+0  schedule () at ./arch/x86/include/asm/current.h:15
+1  0xffffffff81b51237 in rwsem_down_read_slowpath (sem=0xffff888003b202e0, count=<optimized out>, state=state@entry=2) at kernel/locking/rwsem.c:992
+2  0xffffffff81b5150a in __down_read_common (state=2, sem=<optimized out>) at kernel/locking/rwsem.c:1213
+3  __down_read (sem=<optimized out>) at kernel/locking/rwsem.c:1222
+4  down_read (sem=<optimized out>) at kernel/locking/rwsem.c:1355
+5  0xffffffff811ee22a in inode_lock_shared (inode=<optimized out>) at ./include/linux/fs.h:783
+6  open_last_lookups (op=0xffffc9000022fe34, file=0xffff888004098600, nd=0xffffc9000022fd10) at fs/namei.c:3177
+7  path_openat (nd=nd@entry=0xffffc9000022fd10, op=op@entry=0xffffc9000022fe34, flags=flags@entry=65) at fs/namei.c:3366
+8  0xffffffff811efe1c in do_filp_open (dfd=<optimized out>, pathname=pathname@entry=0xffff8880031b9000, op=op@entry=0xffffc9000022fe34) at fs/namei.c:3396
+9  0xffffffff811e493f in do_open_execat (fd=fd@entry=-100, name=name@entry=0xffff8880031b9000, flags=<optimized out>, flags@entry=0) at fs/exec.c:913
+10 0xffffffff811e4a92 in open_exec (name=<optimized out>) at fs/exec.c:948
+11 0xffffffff8124aa84 in bm_register_write (file=<optimized out>, buffer=<optimized out>, count=19, ppos=<optimized out>) at fs/binfmt_misc.c:682
+12 0xffffffff811decd2 in vfs_write (file=file@entry=0xffff888004098500, buf=buf@entry=0xa758d0 ":iiiii:E::ii::i:CF
+", count=count@entry=19, pos=pos@entry=0xffffc9000022ff10) at fs/read_write.c:603
+13 0xffffffff811defda in ksys_write (fd=<optimized out>, buf=0xa758d0 ":iiiii:E::ii::i:CF
+", count=19) at fs/read_write.c:658
+14 0xffffffff81b49813 in do_syscall_64 (nr=<optimized out>, regs=0xffffc9000022ff58) at arch/x86/entry/common.c:46
+15 0xffffffff81c0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
+
+To solve the issue, the open_exec call is moved to before the write
+lock is taken by bm_register_write.
+
+Link: https://lkml.kernel.org/r/20210228224414.95962-1-liorribak@gmail.com
+Fixes: 948b701a607f1 ("binfmt_misc: add persistent opened binary handler for containers")
+Signed-off-by: Lior Ribak <liorribak@gmail.com>
+Acked-by: Helge Deller <deller@gmx.de>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_misc.c |   29 ++++++++++++++---------------
+ 1 file changed, 14 insertions(+), 15 deletions(-)
+
+--- a/fs/binfmt_misc.c
++++ b/fs/binfmt_misc.c
+@@ -647,12 +647,24 @@ static ssize_t bm_register_write(struct
+       struct super_block *sb = file_inode(file)->i_sb;
+       struct dentry *root = sb->s_root, *dentry;
+       int err = 0;
++      struct file *f = NULL;
+       e = create_entry(buffer, count);
+       if (IS_ERR(e))
+               return PTR_ERR(e);
++      if (e->flags & MISC_FMT_OPEN_FILE) {
++              f = open_exec(e->interpreter);
++              if (IS_ERR(f)) {
++                      pr_notice("register: failed to install interpreter file %s\n",
++                               e->interpreter);
++                      kfree(e);
++                      return PTR_ERR(f);
++              }
++              e->interp_file = f;
++      }
++
+       inode_lock(d_inode(root));
+       dentry = lookup_one_len(e->name, root, strlen(e->name));
+       err = PTR_ERR(dentry);
+@@ -676,21 +688,6 @@ static ssize_t bm_register_write(struct
+               goto out2;
+       }
+-      if (e->flags & MISC_FMT_OPEN_FILE) {
+-              struct file *f;
+-
+-              f = open_exec(e->interpreter);
+-              if (IS_ERR(f)) {
+-                      err = PTR_ERR(f);
+-                      pr_notice("register: failed to install interpreter file %s\n", e->interpreter);
+-                      simple_release_fs(&bm_mnt, &entry_count);
+-                      iput(inode);
+-                      inode = NULL;
+-                      goto out2;
+-              }
+-              e->interp_file = f;
+-      }
+-
+       e->dentry = dget(dentry);
+       inode->i_private = e;
+       inode->i_fop = &bm_entry_operations;
+@@ -707,6 +704,8 @@ out:
+       inode_unlock(d_inode(root));
+       if (err) {
++              if (f)
++                      filp_close(f, NULL);
+               kfree(e);
+               return err;
+       }
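
A minimal userspace sketch of the reproducer quoted in the commit message above (the register string and /proc path are copied from it; running this requires root, and it is expected to hang on an unpatched kernel but fail cleanly once the fix is applied, since the interpreter path cannot be opened):

/* binfmt-deadlock-repro.c: hypothetical name, illustration only */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const char rule[] = ":iiiii:E::ii::/proc/sys/fs/binfmt_misc/bla:F";
	int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);

	if (fd < 0) {
		perror("open binfmt_misc register");
		return 1;
	}
	/* On an unpatched kernel this write deadlocks against the path lookup
	 * of the interpreter; on a patched kernel it returns an error because
	 * /proc/sys/fs/binfmt_misc/bla cannot be opened as an interpreter.
	 */
	if (write(fd, rule, strlen(rule)) < 0)
		perror("write");
	close(fd);
	return 0;
}
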
diff --git a/queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch b/queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch
new file mode 100644 (file)
index 0000000..a99106f
--- /dev/null
@@ -0,0 +1,59 @@
+From 9e9888a0fe97b9501a40f717225d2bef7100a2c1 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 5 Mar 2021 10:21:05 +0100
+Subject: efi: stub: omit SetVirtualAddressMap() if marked unsupported in RT_PROP table
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit 9e9888a0fe97b9501a40f717225d2bef7100a2c1 upstream.
+
+The EFI_RT_PROPERTIES_TABLE contains a mask of runtime services that are
+available after ExitBootServices(). This mostly does not concern the EFI
+stub at all, given that it runs before that. However, there is one call
+that is made at runtime, which is the call to SetVirtualAddressMap()
+(which is not even callable at boot time to begin with).
+
+So add the missing handling of the RT_PROP table to ensure that we only
+call SetVirtualAddressMap() if it is not being advertised as unsupported
+by the firmware.
+
+Cc: <stable@vger.kernel.org> # v5.10+
+Tested-by: Shawn Guo <shawn.guo@linaro.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/libstub/efi-stub.c |   16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/drivers/firmware/efi/libstub/efi-stub.c
++++ b/drivers/firmware/efi/libstub/efi-stub.c
+@@ -96,6 +96,18 @@ static void install_memreserve_table(voi
+               efi_err("Failed to install memreserve config table!\n");
+ }
++static u32 get_supported_rt_services(void)
++{
++      const efi_rt_properties_table_t *rt_prop_table;
++      u32 supported = EFI_RT_SUPPORTED_ALL;
++
++      rt_prop_table = get_efi_config_table(EFI_RT_PROPERTIES_TABLE_GUID);
++      if (rt_prop_table)
++              supported &= rt_prop_table->runtime_services_supported;
++
++      return supported;
++}
++
+ /*
+  * EFI entry point for the arm/arm64 EFI stubs.  This is the entrypoint
+  * that is described in the PE/COFF header.  Most of the code is the same
+@@ -250,6 +262,10 @@ efi_status_t __efiapi efi_pe_entry(efi_h
+                         (prop_tbl->memory_protection_attribute &
+                          EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA);
++      /* force efi_novamap if SetVirtualAddressMap() is unsupported */
++      efi_novamap |= !(get_supported_rt_services() &
++                       EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP);
++
+       /* hibernation expects the runtime regions to stay in the same place */
+       if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) {
+               /*
diff --git a/queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch b/queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch
new file mode 100644 (file)
index 0000000..f9f8b7d
--- /dev/null
@@ -0,0 +1,52 @@
+From d9b571c885a8974fbb7d4ee639dbc643fd000f9e Mon Sep 17 00:00:00 2001
+From: Andrey Konovalov <andreyknvl@google.com>
+Date: Fri, 12 Mar 2021 21:08:13 -0800
+Subject: kasan: fix KASAN_STACK dependency for HW_TAGS
+
+From: Andrey Konovalov <andreyknvl@google.com>
+
+commit d9b571c885a8974fbb7d4ee639dbc643fd000f9e upstream.
+
+There's a runtime failure when running a HW_TAGS-enabled kernel built with
+GCC on hardware that doesn't support MTE.  GCC-built kernels always have
+CONFIG_KASAN_STACK enabled, even though stack instrumentation isn't
+supported by HW_TAGS.  Having that config enabled causes KASAN to issue
+MTE-only instructions to unpoison kernel stacks, which causes the failure.
+
+Fix the issue by disallowing CONFIG_KASAN_STACK when HW_TAGS is used.
+
+(The commit that introduced CONFIG_KASAN_HW_TAGS specified proper
+ dependency for CONFIG_KASAN_STACK_ENABLE but not for CONFIG_KASAN_STACK.)
+
+Link: https://lkml.kernel.org/r/59e75426241dbb5611277758c8d4d6f5f9298dac.1615215441.git.andreyknvl@google.com
+Fixes: 6a63a63ff1ac ("kasan: introduce CONFIG_KASAN_HW_TAGS")
+Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
+Reported-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: <stable@vger.kernel.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Peter Collingbourne <pcc@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Branislav Rankov <Branislav.Rankov@arm.com>
+Cc: Kevin Brodsky <kevin.brodsky@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/Kconfig.kasan |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/lib/Kconfig.kasan
++++ b/lib/Kconfig.kasan
+@@ -156,6 +156,7 @@ config KASAN_STACK_ENABLE
+ config KASAN_STACK
+       int
++      depends on KASAN_GENERIC || KASAN_SW_TAGS
+       default 1 if KASAN_STACK_ENABLE || CC_IS_GCC
+       default 0
diff --git a/queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch b/queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch
new file mode 100644 (file)
index 0000000..857a828
--- /dev/null
@@ -0,0 +1,65 @@
+From f9d79e8dce4077d3c6ab739c808169dfa99af9ef Mon Sep 17 00:00:00 2001
+From: Andrey Konovalov <andreyknvl@google.com>
+Date: Fri, 12 Mar 2021 21:08:10 -0800
+Subject: kasan, mm: fix crash with HW_TAGS and DEBUG_PAGEALLOC
+
+From: Andrey Konovalov <andreyknvl@google.com>
+
+commit f9d79e8dce4077d3c6ab739c808169dfa99af9ef upstream.
+
+Currently, kasan_free_nondeferred_pages()->kasan_free_pages() is called
+after debug_pagealloc_unmap_pages(). This causes a crash when
+debug_pagealloc is enabled, as HW_TAGS KASAN can't set tags on an
+unmapped page.
+
+This patch puts kasan_free_nondeferred_pages() before
+debug_pagealloc_unmap_pages() and arch_free_page(), which can also make
+the page unavailable.
+
+Link: https://lkml.kernel.org/r/24cd7db274090f0e5bc3adcdc7399243668e3171.1614987311.git.andreyknvl@google.com
+Fixes: 94ab5b61ee16 ("kasan, arm64: enable CONFIG_KASAN_HW_TAGS")
+Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Peter Collingbourne <pcc@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Branislav Rankov <Branislav.Rankov@arm.com>
+Cc: Kevin Brodsky <kevin.brodsky@arm.com>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_p
+       kernel_poison_pages(page, 1 << order);
+       /*
++       * With hardware tag-based KASAN, memory tags must be set before the
++       * page becomes unavailable via debug_pagealloc or arch_free_page.
++       */
++      kasan_free_nondeferred_pages(page, order);
++
++      /*
+        * arch_free_page() can make the page's contents inaccessible.  s390
+        * does this.  So nothing which can access the page's contents should
+        * happen after this.
+@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_p
+       debug_pagealloc_unmap_pages(page, 1 << order);
+-      kasan_free_nondeferred_pages(page, order);
+-
+       return true;
+ }
diff --git a/queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch b/queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch
new file mode 100644 (file)
index 0000000..7ae1b1e
--- /dev/null
@@ -0,0 +1,47 @@
+From 31948332d5fa392ad933f4a6a10026850649ed76 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will@kernel.org>
+Date: Fri, 5 Mar 2021 18:52:48 +0000
+Subject: KVM: arm64: Avoid corrupting vCPU context register in guest exit
+
+From: Will Deacon <will@kernel.org>
+
+commit 31948332d5fa392ad933f4a6a10026850649ed76 upstream.
+
+Commit 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest
+context") tracks the currently running vCPU, clearing the pointer to
+NULL on exit from a guest.
+
+Unfortunately, the use of 'set_loaded_vcpu' clobbers x1 to point at the
+kvm_hyp_ctxt instead of the vCPU context, causing the subsequent RAS
+code to go off into the weeds when it saves the DISR assuming that the
+CPU context is embedded in a struct vCPU.
+
+Leave x1 alone and use x3 as a temporary register instead when clearing
+the vCPU on the guest exit path.
+
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Andrew Scull <ascull@google.com>
+Cc: <stable@vger.kernel.org>
+Fixes: 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest context")
+Suggested-by: Quentin Perret <qperret@google.com>
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Link: https://lore.kernel.org/r/20210226181211.14542-1-will@kernel.org
+Message-Id: <20210305185254.3730990-3-maz@kernel.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/entry.S |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/kvm/hyp/entry.S
++++ b/arch/arm64/kvm/hyp/entry.S
+@@ -146,7 +146,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOB
+       // Now restore the hyp regs
+       restore_callee_saved_regs x2
+-      set_loaded_vcpu xzr, x1, x2
++      set_loaded_vcpu xzr, x2, x3
+ alternative_if ARM64_HAS_RAS_EXTN
+       // If we have the RAS extensions we can consume a pending error
diff --git a/queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch b/queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch
new file mode 100644 (file)
index 0000000..154f516
--- /dev/null
@@ -0,0 +1,152 @@
+From 01dc9262ff5797b675c32c0c6bc682777d23de05 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Wed, 3 Mar 2021 16:45:05 +0000
+Subject: KVM: arm64: Ensure I-cache isolation between vcpus of a same VM
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 01dc9262ff5797b675c32c0c6bc682777d23de05 upstream.
+
+It recently became apparent that the ARMv8 architecture has interesting
+rules regarding attributes being used when fetching instructions
+if the MMU is off at Stage-1.
+
+In this situation, the CPU is allowed to fetch from the PoC and
+allocate into the I-cache (unless the memory is mapped with
+the XN attribute at Stage-2).
+
+If we transpose this to vcpus sharing a single physical CPU,
+it is possible for a vcpu running with its MMU off to influence
+another vcpu running with its MMU on, as the latter is expected to
+fetch from the PoU (and self-patching code doesn't flush below that
+level).
+
+In order to solve this, reuse the vcpu-private TLB invalidation
+code to apply the same policy to the I-cache, nuking it every time
+the vcpu runs on a physical CPU that ran another vcpu of the same
+VM in the past.
+
+This involves renaming __kvm_tlb_flush_local_vmid() to
+__kvm_flush_cpu_context(), and inserting a local i-cache invalidation
+there.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Acked-by: Catalin Marinas <catalin.marinas@arm.com>
+Link: https://lore.kernel.org/r/20210303164505.68492-1-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_asm.h   |    4 ++--
+ arch/arm64/kvm/arm.c               |    7 ++++++-
+ arch/arm64/kvm/hyp/nvhe/hyp-main.c |    6 +++---
+ arch/arm64/kvm/hyp/nvhe/tlb.c      |    3 ++-
+ arch/arm64/kvm/hyp/vhe/tlb.c       |    3 ++-
+ 5 files changed, 15 insertions(+), 8 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_asm.h
++++ b/arch/arm64/include/asm/kvm_asm.h
+@@ -47,7 +47,7 @@
+ #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context          2
+ #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa                3
+ #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid            4
+-#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid      5
++#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context         5
+ #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff         6
+ #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs                       7
+ #define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2               8
+@@ -183,10 +183,10 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs
+ #define __bp_harden_hyp_vecs  CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
+ extern void __kvm_flush_vm_context(void);
++extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
+ extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
+                                    int level);
+ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
+-extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu);
+ extern void __kvm_timer_set_cntvoff(u64 cntvoff);
+--- a/arch/arm64/kvm/arm.c
++++ b/arch/arm64/kvm/arm.c
+@@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+       last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
+       /*
++       * We guarantee that both TLBs and I-cache are private to each
++       * vcpu. If detecting that a vcpu from the same VM has
++       * previously run on the same physical CPU, call into the
++       * hypervisor code to nuke the relevant contexts.
++       *
+        * We might get preempted before the vCPU actually runs, but
+        * over-invalidation doesn't affect correctness.
+        */
+       if (*last_ran != vcpu->vcpu_id) {
+-              kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu);
++              kvm_call_hyp(__kvm_flush_cpu_context, mmu);
+               *last_ran = vcpu->vcpu_id;
+       }
+--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+@@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(
+       __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+ }
+-static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt)
++static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
+ {
+       DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+-      __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
++      __kvm_flush_cpu_context(kern_hyp_va(mmu));
+ }
+ static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt)
+@@ -115,7 +115,7 @@ static const hcall_t *host_hcall[] = {
+       HANDLE_FUNC(__kvm_flush_vm_context),
+       HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+       HANDLE_FUNC(__kvm_tlb_flush_vmid),
+-      HANDLE_FUNC(__kvm_tlb_flush_local_vmid),
++      HANDLE_FUNC(__kvm_flush_cpu_context),
+       HANDLE_FUNC(__kvm_timer_set_cntvoff),
+       HANDLE_FUNC(__kvm_enable_ssbs),
+       HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2),
+--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
++++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
+@@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_
+       __tlb_switch_to_host(&cxt);
+ }
+-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
++void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
+ {
+       struct tlb_inv_context cxt;
+@@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct k
+       __tlb_switch_to_guest(mmu, &cxt);
+       __tlbi(vmalle1);
++      asm volatile("ic iallu");
+       dsb(nsh);
+       isb();
+--- a/arch/arm64/kvm/hyp/vhe/tlb.c
++++ b/arch/arm64/kvm/hyp/vhe/tlb.c
+@@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_
+       __tlb_switch_to_host(&cxt);
+ }
+-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
++void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
+ {
+       struct tlb_inv_context cxt;
+@@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct k
+       __tlb_switch_to_guest(mmu, &cxt);
+       __tlbi(vmalle1);
++      asm volatile("ic iallu");
+       dsb(nsh);
+       isb();
diff --git a/queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch b/queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch
new file mode 100644 (file)
index 0000000..e08b71b
--- /dev/null
@@ -0,0 +1,44 @@
+From 262b003d059c6671601a19057e9fe1a5e7f23722 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Thu, 11 Mar 2021 10:00:16 +0000
+Subject: KVM: arm64: Fix exclusive limit for IPA size
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 262b003d059c6671601a19057e9fe1a5e7f23722 upstream.
+
+When registering a memslot, we check the size and location of that
+memslot against the IPA size to ensure that we can provide guest
+access to the whole of the memory.
+
+Unfortunately, this check rejects memslots that end up at the exact
+limit of the addressing capability for a given IPA size. For example,
+it refuses the creation of a 2GB memslot at 0x80000000 with a 32bit
+IPA space.
+
+Fix it by relaxing the check to accept a memslot reaching the
+limit of the IPA space.
+
+Fixes: c3058d5da222 ("arm/arm64: KVM: Ensure memslots are within KVM_PHYS_SIZE")
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Reviewed-by: Andrew Jones <drjones@redhat.com>
+Link: https://lore.kernel.org/r/20210311100016.3830038-3-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/mmu.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/arch/arm64/kvm/mmu.c
++++ b/arch/arm64/kvm/mmu.c
+@@ -1309,8 +1309,7 @@ int kvm_arch_prepare_memory_region(struc
+        * Prevent userspace from creating a memory region outside of the IPA
+        * space addressable by the KVM guest IPA space.
+        */
+-      if (memslot->base_gfn + memslot->npages >=
+-          (kvm_phys_size(kvm) >> PAGE_SHIFT))
++      if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+               return -EFAULT;
+       mmap_read_lock(current->mm);
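
The boundary case can be checked with plain arithmetic; a standalone sketch of the old versus new check (PAGE_SHIFT of 12 is assumed, and the 2GB slot ending exactly at a 32-bit IPA limit mirrors the example in the commit message):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4KiB pages */

int main(void)
{
	uint64_t limit_gfn = (1ULL << 32) >> PAGE_SHIFT;	/* 32-bit IPA */
	uint64_t base_gfn  = 0x80000000ULL >> PAGE_SHIFT;	/* slot at 2GB */
	uint64_t npages    = (2ULL << 30) >> PAGE_SHIFT;	/* 2GB memslot */
	uint64_t end_gfn   = base_gfn + npages;			/* == limit */

	printf("old check (>=) rejects: %d\n", end_gfn >= limit_gfn);
	printf("new check (>)  rejects: %d\n", end_gfn >  limit_gfn);
	return 0;
}
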
diff --git a/queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch b/queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch
new file mode 100644 (file)
index 0000000..4b93f24
--- /dev/null
@@ -0,0 +1,44 @@
+From 357ad203d45c0f9d76a8feadbd5a1c5d460c638b Mon Sep 17 00:00:00 2001
+From: Jia He <justin.he@arm.com>
+Date: Fri, 5 Mar 2021 18:52:54 +0000
+Subject: KVM: arm64: Fix range alignment when walking page tables
+
+From: Jia He <justin.he@arm.com>
+
+commit 357ad203d45c0f9d76a8feadbd5a1c5d460c638b upstream.
+
+When walking the page tables at a given level, if the start address
+for the range isn't aligned for that level, we propagate the
+misalignment on each iteration at that level.
+
+This results in the walker ignoring a number of entries (depending
+on the original misalignment) on each subsequent iteration.
+
+Properly aligning the address before the next iteration addresses
+this issue.
+
+Cc: stable@vger.kernel.org
+Reported-by: Howard Zhang <Howard.Zhang@arm.com>
+Acked-by: Will Deacon <will@kernel.org>
+Signed-off-by: Jia He <justin.he@arm.com>
+Fixes: b1e57de62cfb ("KVM: arm64: Add stand-alone page-table walker infrastructure")
+[maz: rewrite commit message]
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Link: https://lore.kernel.org/r/20210303024225.2591-1-justin.he@arm.com
+Message-Id: <20210305185254.3730990-9-maz@kernel.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/pgtable.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kvm/hyp/pgtable.c
++++ b/arch/arm64/kvm/hyp/pgtable.c
+@@ -225,6 +225,7 @@ static inline int __kvm_pgtable_visit(st
+               goto out;
+       if (!table) {
++              data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
+               data->addr += kvm_granule_size(level);
+               goto out;
+       }
diff --git a/queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch b/queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch
new file mode 100644 (file)
index 0000000..0cb1e15
--- /dev/null
@@ -0,0 +1,120 @@
+From b96b0c5de685df82019e16826a282d53d86d112c Mon Sep 17 00:00:00 2001
+From: Suzuki K Poulose <suzuki.poulose@arm.com>
+Date: Fri, 5 Mar 2021 18:52:47 +0000
+Subject: KVM: arm64: nvhe: Save the SPE context early
+
+From: Suzuki K Poulose <suzuki.poulose@arm.com>
+
+commit b96b0c5de685df82019e16826a282d53d86d112c upstream.
+
+The nVHE KVM hyp drains and disables the SPE buffer, before
+entering the guest, as the EL1&0 translation regime
+is going to be loaded with that of the guest.
+
+But this operation is performed way too late, because:
+  - The owning translation regime of the SPE buffer
+    is transferred to EL2. (MDCR_EL2_E2PB == 0)
+  - The guest Stage1 is loaded.
+
+Thus the flush could use the host EL1 virtual address, but with the
+EL2 translations instead of the host EL1 ones, when writing out any
+cached data.
+
+Fix this by moving the SPE buffer handling early enough.
+The restore path is doing the right thing.
+
+Fixes: 014c4c77aad7 ("KVM: arm64: Improve debug register save/restore flow")
+Cc: stable@vger.kernel.org
+Cc: Christoffer Dall <christoffer.dall@arm.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Alexandru Elisei <alexandru.elisei@arm.com>
+Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
+Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Link: https://lore.kernel.org/r/20210302120345.3102874-1-suzuki.poulose@arm.com
+Message-Id: <20210305185254.3730990-2-maz@kernel.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_hyp.h   |    5 +++++
+ arch/arm64/kvm/hyp/nvhe/debug-sr.c |   12 ++++++++++--
+ arch/arm64/kvm/hyp/nvhe/switch.c   |   11 ++++++++++-
+ 3 files changed, 25 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_hyp.h
++++ b/arch/arm64/include/asm/kvm_hyp.h
+@@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(stru
+ void __debug_switch_to_guest(struct kvm_vcpu *vcpu);
+ void __debug_switch_to_host(struct kvm_vcpu *vcpu);
++#ifdef __KVM_NVHE_HYPERVISOR__
++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu);
++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
++#endif
++
+ void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
+ void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
++++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+@@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmsc
+       write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
+ }
+-void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
+ {
+       /* Disable and flush SPE data generation */
+       __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
++}
++
++void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
++{
+       __debug_switch_to_guest_common(vcpu);
+ }
+-void __debug_switch_to_host(struct kvm_vcpu *vcpu)
++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
+ {
+       __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
++}
++
++void __debug_switch_to_host(struct kvm_vcpu *vcpu)
++{
+       __debug_switch_to_host_common(vcpu);
+ }
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu
+       pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
+       __sysreg_save_state_nvhe(host_ctxt);
++      /*
++       * We must flush and disable the SPE buffer for nVHE, as
++       * the translation regime(EL1&0) is going to be loaded with
++       * that of the guest. And we must do this before we change the
++       * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and
++       * before we load guest Stage1.
++       */
++      __debug_save_host_buffers_nvhe(vcpu);
+       __adjust_pc(vcpu);
+@@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu
+       if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)
+               __fpsimd_save_fpexc32(vcpu);
++      __debug_switch_to_host(vcpu);
+       /*
+        * This must come after restoring the host sysregs, since a non-VHE
+        * system may enable SPE here and make use of the TTBRs.
+        */
+-      __debug_switch_to_host(vcpu);
++      __debug_restore_host_buffers_nvhe(vcpu);
+       if (pmu_switch_needed)
+               __pmu_switch_to_host(host_ctxt);
diff --git a/queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch b/queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch
new file mode 100644 (file)
index 0000000..b1d294f
--- /dev/null
@@ -0,0 +1,88 @@
+From 7d717558dd5ef10d28866750d5c24ff892ea3778 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Thu, 11 Mar 2021 10:00:15 +0000
+Subject: KVM: arm64: Reject VM creation when the default IPA size is unsupported
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 7d717558dd5ef10d28866750d5c24ff892ea3778 upstream.
+
+KVM/arm64 has forever used a 40bit default IPA space, partially
+due to its 32bit heritage (where the only choice is 40bit).
+
+However, there are implementations in the wild that have a *cough*
+much smaller *cough* IPA space, which leads to a misprogramming of
+VTCR_EL2, and a guest that is stuck on its first memory access
+if userspace dares to ask for the default IPA setting (which most
+VMMs do).
+
+Instead, bluntly reject the creation of such a VM, as we can't
+satisfy the requirements from userspace (with a one-off warning).
+Also clarify the boot warning, and document that the VM creation
+will fail when an unsupported IPA size is provided.
+
+Although this is an ABI change, it doesn't really change much
+for userspace:
+
+- the guest couldn't run before this change, but no error was
+  returned. At least userspace knows what is happening.
+
+- a memory slot that was accepted because it did fit the default
+  IPA space now doesn't even get a chance to be registered.
+
+The other thing left to do is to convince userspace to
+actually use the IPA space setting instead of relying on the
+antiquated default.
+
+Fixes: 233a7cb23531 ("kvm: arm64: Allow tuning the physical address size for VM")
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Reviewed-by: Andrew Jones <drjones@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Link: https://lore.kernel.org/r/20210311100016.3830038-2-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/virt/kvm/api.rst |    3 +++
+ arch/arm64/kvm/reset.c         |   12 ++++++++----
+ 2 files changed, 11 insertions(+), 4 deletions(-)
+
+--- a/Documentation/virt/kvm/api.rst
++++ b/Documentation/virt/kvm/api.rst
+@@ -182,6 +182,9 @@ is dependent on the CPU capability and t
+ be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION
+ ioctl() at run-time.
++Creation of the VM will fail if the requested IPA size (whether it is
++implicit or explicit) is unsupported on the host.
++
+ Please note that configuring the IPA size does not affect the capability
+ exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects
+ size of the address translated by the stage2 level (guest physical to
+--- a/arch/arm64/kvm/reset.c
++++ b/arch/arm64/kvm/reset.c
+@@ -324,10 +324,9 @@ int kvm_set_ipa_limit(void)
+       }
+       kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+-      WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+-           "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+-           kvm_ipa_limit);
+-      kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
++      kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit,
++               ((kvm_ipa_limit < KVM_PHYS_SHIFT) ?
++                " (Reduced IPA size, limited VM/VMM compatibility)" : ""));
+       return 0;
+ }
+@@ -356,6 +355,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm
+                       return -EINVAL;
+       } else {
+               phys_shift = KVM_PHYS_SHIFT;
++              if (phys_shift > kvm_ipa_limit) {
++                      pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
++                                   current->comm);
++                      return -EINVAL;
++              }
+       }
+       mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
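
As the message above urges, a VMM should pass an explicit IPA size rather than rely on the 40-bit default. A minimal sketch of how that looks from userspace, assuming an arm64 host and the KVM_CAP_ARM_VM_IPA_SIZE / KVM_VM_TYPE_ARM_IPA_SIZE() definitions from <linux/kvm.h>:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Ask the host how many IPA bits it actually supports. */
	int ipa_bits = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
	if (ipa_bits <= 0) {
		fprintf(stderr, "host does not report a configurable IPA size\n");
		close(kvm);
		return 1;
	}

	/* Create the VM with an explicit IPA size instead of the default. */
	int vm = ioctl(kvm, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(ipa_bits));
	if (vm < 0)
		perror("KVM_CREATE_VM");
	else
		printf("created VM with a %d-bit IPA space\n", ipa_bits);

	if (vm >= 0)
		close(vm);
	close(kvm);
	return 0;
}
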
diff --git a/queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch b/queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch
new file mode 100644 (file)
index 0000000..95fc015
--- /dev/null
@@ -0,0 +1,78 @@
+From d7eb79c6290c7ae4561418544072e0a3266e7384 Mon Sep 17 00:00:00 2001
+From: Wanpeng Li <wanpengli@tencent.com>
+Date: Wed, 24 Feb 2021 09:37:29 +0800
+Subject: KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged
+
+From: Wanpeng Li <wanpengli@tencent.com>
+
+commit d7eb79c6290c7ae4561418544072e0a3266e7384 upstream.
+
+# lscpu
+Architecture:          x86_64
+CPU op-mode(s):        32-bit, 64-bit
+Byte Order:            Little Endian
+CPU(s):                88
+On-line CPU(s) list:   0-63
+Off-line CPU(s) list:  64-87
+
+# cat /proc/cmdline
+BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro
+rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall
+
+# echo 1 > /sys/devices/system/cpu/cpu76/online
+-bash: echo: write error: Cannot allocate memory
+
+The per-cpu vsyscall pvclock data pointer is assigned either an element
+of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated
+memory hvclock_mem (vCPU > 64). The dynamic memory will not be allocated
+if the kvmclock vsyscall is disabled, which can make CPU hotplug fail
+because kvmclock_setup_percpu() returns -ENOMEM. It's broken for
+no-vsyscall, and sometimes you end up with the vsyscall disabled if the
+host does something strange. This patch fixes it by allocating the
+dynamic memory unconditionally, even if the vsyscall is disabled.
+
+Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables")
+Reported-by: Zelin Deng <zelin.deng@linux.alibaba.com>
+Cc: Brijesh Singh <brijesh.singh@amd.com>
+Cc: stable@vger.kernel.org#v4.19-rc5+
+Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
+Message-Id: <1614130683-24137-1-git-send-email-wanpengli@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/kvmclock.c |   19 +++++++++----------
+ 1 file changed, 9 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kernel/kvmclock.c
++++ b/arch/x86/kernel/kvmclock.c
+@@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(voi
+ static int __init kvm_setup_vsyscall_timeinfo(void)
+ {
+-#ifdef CONFIG_X86_64
+-      u8 flags;
++      kvmclock_init_mem();
+-      if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
+-              return 0;
++#ifdef CONFIG_X86_64
++      if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
++              u8 flags;
+-      flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+-      if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+-              return 0;
++              flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
++              if (!(flags & PVCLOCK_TSC_STABLE_BIT))
++                      return 0;
+-      kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
++              kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
++      }
+ #endif
+-      kvmclock_init_mem();
+-
+       return 0;
+ }
+ early_initcall(kvm_setup_vsyscall_timeinfo);
diff --git a/queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch b/queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch
new file mode 100644 (file)
index 0000000..05441e9
--- /dev/null
@@ -0,0 +1,46 @@
+From beda430177f56656e7980dcce93456ffaa35676b Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 4 Mar 2021 18:18:08 -0800
+Subject: KVM: x86: Ensure deadline timer has truly expired before posting its IRQ
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit beda430177f56656e7980dcce93456ffaa35676b upstream.
+
+When posting a deadline timer interrupt, open code the checks guarding
+__kvm_wait_lapic_expire() in order to skip the lapic_timer_int_injected()
+check in kvm_wait_lapic_expire().  The injection check will always fail
+since the interrupt has not yet be injected.  Moving the call after
+injection would also be wrong as that wouldn't actually delay delivery
+of the IRQ if it is indeed sent via posted interrupt.
+
+Fixes: 010fd37fddf6 ("KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210305021808.3769732-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/lapic.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -1641,7 +1641,16 @@ static void apic_timer_expired(struct kv
+       }
+       if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+-              kvm_wait_lapic_expire(vcpu);
++              /*
++               * Ensure the guest's timer has truly expired before posting an
++               * interrupt.  Open code the relevant checks to avoid querying
++               * lapic_timer_int_injected(), which will be false since the
++               * interrupt isn't yet injected.  Waiting until after injecting
++               * is not an option since that won't help a posted interrupt.
++               */
++              if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
++                  vcpu->arch.apic->lapic_timer.timer_advance_ns)
++                      __kvm_wait_lapic_expire(vcpu);
+               kvm_apic_inject_pending_timer_irqs(apic);
+               return;
+       }
diff --git a/queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch b/queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch
new file mode 100644 (file)
index 0000000..4356e58
--- /dev/null
@@ -0,0 +1,80 @@
+From 97e4910232fa1f81e806aa60c25a0450276d99a2 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Fri, 12 Mar 2021 21:07:47 -0800
+Subject: linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP*
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 97e4910232fa1f81e806aa60c25a0450276d99a2 upstream.
+
+Separating compiler-clang.h from compiler-gcc.h inadvertently dropped the
+definitions of the three HAVE_BUILTIN_BSWAP macros, which requires falling
+back to the open-coded version and hoping that the compiler detects it.
+
+Since all versions of clang support the __builtin_bswap interfaces, add
+back the flags and have the headers pick these up automatically.
+
+This results in a 4% improvement of compilation speed for arm defconfig.
+
+Note: it might also be worth revisiting which architectures set
+CONFIG_ARCH_USE_BUILTIN_BSWAP for one compiler or the other, today this is
+set on six architectures (arm32, csky, mips, powerpc, s390, x86), while
+another ten architectures define custom helpers (alpha, arc, ia64, m68k,
+mips, nios2, parisc, sh, sparc, xtensa), and the rest (arm64, h8300,
+hexagon, microblaze, nds32, openrisc, riscv) just get the unoptimized
+version and rely on the compiler to detect it.
+
+A long time ago, the compiler builtins were architecture specific, but
+nowadays, all compilers that are able to build the kernel have correct
+implementations of them, though some may not be as optimized as the inline
+asm versions.
+
+The patch that dropped the optimization landed in v4.19, so as discussed
+it would be fairly safe to backport this revert to the 4.19/5.4/5.10
+stable kernels, but there is a remaining risk of regressions, and it has
+no known side-effects besides compile speed.
+
+Link: https://lkml.kernel.org/r/20210226161151.2629097-1-arnd@kernel.org
+Link: https://lore.kernel.org/lkml/20210225164513.3667778-1-arnd@kernel.org/
+Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive")
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Reviewed-by: Nathan Chancellor <nathan@kernel.org>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Acked-by: Miguel Ojeda <ojeda@kernel.org>
+Acked-by: Nick Desaulniers <ndesaulniers@google.com>
+Acked-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
+Cc: Masahiro Yamada <masahiroy@kernel.org>
+Cc: Nick Hu <nickhu@andestech.com>
+Cc: Greentime Hu <green.hu@gmail.com>
+Cc: Vincent Chen <deanbo422@gmail.com>
+Cc: Paul Walmsley <paul.walmsley@sifive.com>
+Cc: Palmer Dabbelt <palmer@dabbelt.com>
+Cc: Albert Ou <aou@eecs.berkeley.edu>
+Cc: Guo Ren <guoren@kernel.org>
+Cc: Randy Dunlap <rdunlap@infradead.org>
+Cc: Sami Tolvanen <samitolvanen@google.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Arvind Sankar <nivedita@alum.mit.edu>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/compiler-clang.h |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/include/linux/compiler-clang.h
++++ b/include/linux/compiler-clang.h
+@@ -41,6 +41,12 @@
+ #define __no_sanitize_thread
+ #endif
++#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
++#define __HAVE_BUILTIN_BSWAP32__
++#define __HAVE_BUILTIN_BSWAP64__
++#define __HAVE_BUILTIN_BSWAP16__
++#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
++
+ #if __has_feature(undefined_behavior_sanitizer)
+ /* GCC does not have __SANITIZE_UNDEFINED__ */
+ #define __no_sanitize_undefined \
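
For reference, the macros above only tell the byteswap headers that the compiler builtin may be used instead of the open-coded fallback; a small userspace illustration of the two forms, which compiles with both clang and gcc (the value is arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Roughly what the generic swab helper expands to when the
 * __HAVE_BUILTIN_BSWAP32__ macro is not defined.
 */
static uint32_t swab32_open_coded(uint32_t x)
{
	return ((x & 0x000000ffU) << 24) |
	       ((x & 0x0000ff00U) <<  8) |
	       ((x & 0x00ff0000U) >>  8) |
	       ((x & 0xff000000U) >> 24);
}

int main(void)
{
	uint32_t v = 0x12345678U;

	printf("builtin:    %#x\n", __builtin_bswap32(v));	/* 0x78563412 */
	printf("open-coded: %#x\n", swab32_open_coded(v));	/* 0x78563412 */
	return 0;
}
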
diff --git a/queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch b/queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch
new file mode 100644 (file)
index 0000000..8df30ff
--- /dev/null
@@ -0,0 +1,77 @@
+From 184cee516f3e24019a08ac8eb5c7cf04c00933cb Mon Sep 17 00:00:00 2001
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Date: Fri, 12 Mar 2021 21:07:37 -0800
+Subject: mm/highmem.c: fix zero_user_segments() with start > end
+
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+
+commit 184cee516f3e24019a08ac8eb5c7cf04c00933cb upstream.
+
+zero_user_segments() is used from __block_write_begin_int(), for example
+like the following
+
+       zero_user_segments(page, 4096, 1024, 512, 918)
+
+But the new zero_user_segments() implementation for HIGHMEM +
+TRANSPARENT_HUGEPAGE doesn't handle the "start > end" case correctly, and
+hits BUG_ON().  (We could fix __block_write_begin_int() instead, but it
+is old code with multiple users.)
+
+Also, it calls kmap_atomic() unnecessarily when start == end == 0.
+
+Link: https://lkml.kernel.org/r/87v9ab60r4.fsf@mail.parknet.co.jp
+Fixes: 0060ef3b4e6d ("mm: support THPs in zero_user_segments")
+Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/highmem.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/mm/highmem.c
++++ b/mm/highmem.c
+@@ -368,20 +368,24 @@ void zero_user_segments(struct page *pag
+       BUG_ON(end1 > page_size(page) || end2 > page_size(page));
++      if (start1 >= end1)
++              start1 = end1 = 0;
++      if (start2 >= end2)
++              start2 = end2 = 0;
++
+       for (i = 0; i < compound_nr(page); i++) {
+               void *kaddr = NULL;
+-              if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
+-                      kaddr = kmap_atomic(page + i);
+-
+               if (start1 >= PAGE_SIZE) {
+                       start1 -= PAGE_SIZE;
+                       end1 -= PAGE_SIZE;
+               } else {
+                       unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
+-                      if (end1 > start1)
++                      if (end1 > start1) {
++                              kaddr = kmap_atomic(page + i);
+                               memset(kaddr + start1, 0, this_end - start1);
++                      }
+                       end1 -= this_end;
+                       start1 = 0;
+               }
+@@ -392,8 +396,11 @@ void zero_user_segments(struct page *pag
+               } else {
+                       unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
+-                      if (end2 > start2)
++                      if (end2 > start2) {
++                              if (!kaddr)
++                                      kaddr = kmap_atomic(page + i);
+                               memset(kaddr + start2, 0, this_end - start2);
++                      }
+                       end2 -= this_end;
+                       start2 = 0;
+               }
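
The clamping added above can be illustrated outside of highmem with an ordinary buffer; a minimal sketch (the (4096, 1024) pair mirrors the start > end call quoted in the commit message, and such a range must simply be treated as empty):

#include <stdio.h>
#include <string.h>

/* Zero buf[start1..end1) and buf[start2..end2), treating a range whose
 * start is not below its end as empty -- the same clamping the patch
 * applies before walking the sub-pages of a compound page.
 */
static void zero_segments(char *buf, unsigned start1, unsigned end1,
			  unsigned start2, unsigned end2)
{
	if (start1 >= end1)
		start1 = end1 = 0;
	if (start2 >= end2)
		start2 = end2 = 0;

	memset(buf + start1, 0, end1 - start1);
	memset(buf + start2, 0, end2 - start2);
}

int main(void)
{
	char buf[4096];

	memset(buf, 'x', sizeof(buf));
	/* start1 > end1: the first range is empty, only 512..918 is zeroed */
	zero_segments(buf, 4096, 1024, 512, 918);
	printf("buf[511]=%c buf[512]=%d buf[917]=%d buf[918]=%c\n",
	       buf[511], buf[512], buf[917], buf[918]);
	return 0;
}
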
diff --git a/queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch b/queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch
new file mode 100644 (file)
index 0000000..0cb471d
--- /dev/null
@@ -0,0 +1,82 @@
+From 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e Mon Sep 17 00:00:00 2001
+From: Suren Baghdasaryan <surenb@google.com>
+Date: Fri, 12 Mar 2021 21:08:06 -0800
+Subject: mm/madvise: replace ptrace attach requirement for process_madvise
+
+From: Suren Baghdasaryan <surenb@google.com>
+
+commit 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e upstream.
+
+process_madvise currently requires ptrace attach capability.
+PTRACE_MODE_ATTACH gives one process complete control over another
+process.  It effectively removes the security boundary between the two
+processes (in one direction).  Granting ptrace attach capability even to a
+system process is considered dangerous since it creates an attack surface.
+This severely limits the usage of this API.
+
+The operations process_madvise can perform do not affect the correctness
+of the operation of the target process; they only affect where the data is
+physically located (and therefore, how fast it can be accessed).  What we
+want is the ability for one process to influence another process in order
+to optimize performance across the entire system while leaving the
+security boundary intact.
+
+Replace PTRACE_MODE_ATTACH with a combination of PTRACE_MODE_READ and
+CAP_SYS_NICE.  PTRACE_MODE_READ to prevent leaking ASLR metadata and
+CAP_SYS_NICE for influencing process performance.
+
+Link: https://lkml.kernel.org/r/20210303185807.2160264-1-surenb@google.com
+Signed-off-by: Suren Baghdasaryan <surenb@google.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jeff Vander Stoep <jeffv@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Tim Murray <timmurray@google.com>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: James Morris <jmorris@namei.org>
+Cc: <stable@vger.kernel.org>   [5.10+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1197,12 +1197,22 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+               goto release_task;
+       }
+-      mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
++      /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
++      mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+       if (IS_ERR_OR_NULL(mm)) {
+               ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+               goto release_task;
+       }
++      /*
++       * Require CAP_SYS_NICE for influencing process performance. Note that
++       * only non-destructive hints are currently supported.
++       */
++      if (!capable(CAP_SYS_NICE)) {
++              ret = -EPERM;
++              goto release_mm;
++      }
++
+       total_len = iov_iter_count(&iter);
+       while (iov_iter_count(&iter)) {
+@@ -1217,6 +1227,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+       if (ret == 0)
+               ret = total_len - iov_iter_count(&iter);
++release_mm:
+       mmput(mm);
+ release_task:
+       put_task_struct(task);
diff --git a/queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch b/queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch
new file mode 100644 (file)
index 0000000..2d90701
--- /dev/null
@@ -0,0 +1,106 @@
+From be6c8982e4ab9a41907555f601b711a7e2a17d4c Mon Sep 17 00:00:00 2001
+From: Zhou Guanghui <zhouguanghui1@huawei.com>
+Date: Fri, 12 Mar 2021 21:08:30 -0800
+Subject: mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and add nr_pages argument
+
+From: Zhou Guanghui <zhouguanghui1@huawei.com>
+
+commit be6c8982e4ab9a41907555f601b711a7e2a17d4c upstream.
+
+Rename mem_cgroup_split_huge_fixup to split_page_memcg and explicitly pass
+in the number of pages as an argument.
+
+In this way, the interface name is more generic and can be used by
+potential callers.  In addition, the complete info (memcg and flag) of the
+memcg needs to be set on the tail pages.
+
+Link: https://lkml.kernel.org/r/20210304074053.65527-2-zhouguanghui1@huawei.com
+Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Cc: Tianhong Ding <dingtianhong@huawei.com>
+Cc: Weilong Chen <chenweilong@huawei.com>
+Cc: Rui Xiang <rui.xiang@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |    6 ++----
+ mm/huge_memory.c           |    2 +-
+ mm/memcontrol.c            |   15 ++++++---------
+ 3 files changed, 9 insertions(+), 14 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -1072,9 +1072,7 @@ static inline void memcg_memory_event_mm
+       rcu_read_unlock();
+ }
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+-void mem_cgroup_split_huge_fixup(struct page *head);
+-#endif
++void split_page_memcg(struct page *head, unsigned int nr);
+ #else /* CONFIG_MEMCG */
+@@ -1416,7 +1414,7 @@ unsigned long mem_cgroup_soft_limit_recl
+       return 0;
+ }
+-static inline void mem_cgroup_split_huge_fixup(struct page *head)
++static inline void split_page_memcg(struct page *head, unsigned int nr)
+ {
+ }
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2465,7 +2465,7 @@ static void __split_huge_page(struct pag
+       int i;
+       /* complete memcg works before add pages to LRU */
+-      mem_cgroup_split_huge_fixup(head);
++      split_page_memcg(head, nr);
+       if (PageAnon(head) && PageSwapCache(head)) {
+               swp_entry_t entry = { .val = page_private(head) };
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3296,24 +3296,21 @@ void obj_cgroup_uncharge(struct obj_cgro
+ #endif /* CONFIG_MEMCG_KMEM */
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+- * Because page_memcg(head) is not set on compound tails, set it now.
++ * Because page_memcg(head) is not set on tails, set it now.
+  */
+-void mem_cgroup_split_huge_fixup(struct page *head)
++void split_page_memcg(struct page *head, unsigned int nr)
+ {
+       struct mem_cgroup *memcg = page_memcg(head);
+       int i;
+-      if (mem_cgroup_disabled())
++      if (mem_cgroup_disabled() || !memcg)
+               return;
+-      for (i = 1; i < HPAGE_PMD_NR; i++) {
+-              css_get(&memcg->css);
+-              head[i].memcg_data = (unsigned long)memcg;
+-      }
++      for (i = 1; i < nr; i++)
++              head[i].memcg_data = head->memcg_data;
++      css_get_many(&memcg->css, nr - 1);
+ }
+-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ #ifdef CONFIG_MEMCG_SWAP
+ /**
diff --git a/queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch b/queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch
new file mode 100644 (file)
index 0000000..b462df3
--- /dev/null
@@ -0,0 +1,61 @@
+From e1baddf8475b06cc56f4bafecf9a32a124343d9f Mon Sep 17 00:00:00 2001
+From: Zhou Guanghui <zhouguanghui1@huawei.com>
+Date: Fri, 12 Mar 2021 21:08:33 -0800
+Subject: mm/memcg: set memcg when splitting page
+
+From: Zhou Guanghui <zhouguanghui1@huawei.com>
+
+commit e1baddf8475b06cc56f4bafecf9a32a124343d9f upstream.
+
+As described in the split_page() comment, for the non-compound high order
+page, the sub-pages must be freed individually.  If the memcg of the first
+page is valid, the tail pages cannot be uncharged when they are freed.
+
+For example, when alloc_pages_exact is used to allocate 1MB of contiguous
+physical memory, 2MB is charged (kmemcg is enabled and __GFP_ACCOUNT is
+set).  When make_alloc_exact frees the unused 1MB and free_pages_exact frees
+the requested 1MB, only 4KB (one page) is actually uncharged.
+
+Therefore, the memcg of the tail page needs to be set when splitting a
+page.
+
+Michel:
+
+There are at least two explicit users of __GFP_ACCOUNT with
+alloc_exact_pages added recently.  See 7efe8ef274024 ("KVM: arm64:
+Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT") and c419621873713
+("KVM: s390: Add memcg accounting to KVM allocations"), so this is not
+just a theoretical issue.
+
+Link: https://lkml.kernel.org/r/20210304074053.65527-3-zhouguanghui1@huawei.com
+Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Rui Xiang <rui.xiang@huawei.com>
+Cc: Tianhong Ding <dingtianhong@huawei.com>
+Cc: Weilong Chen <chenweilong@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3313,6 +3313,7 @@ void split_page(struct page *page, unsig
+       for (i = 1; i < (1 << order); i++)
+               set_page_refcounted(page + i);
+       split_page_owner(page, 1 << order);
++      split_page_memcg(page, 1 << order);
+ }
+ EXPORT_SYMBOL_GPL(split_page);
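As a rough illustration only (not from the patch), a kernel-module-style sketch of the allocation pattern the changelog describes; the module name and the size are arbitrary. A request that is not a power-of-two number of pages makes alloc_pages_exact() split a charged high-order block via split_page() and free the unused tail page by page; without split_page_memcg() those tail pages carried no memcg and were never uncharged.

    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/module.h>
    #include <linux/sizes.h>

    static void *buf;

    static int __init memcg_split_demo_init(void)
    {
            /* rounds up to an order-9 (2MB) block, keeps 1MB + 4KB and
               frees the remaining sub-pages individually */
            buf = alloc_pages_exact(SZ_1M + SZ_4K,
                                    GFP_KERNEL | __GFP_ACCOUNT);
            return buf ? 0 : -ENOMEM;
    }

    static void __exit memcg_split_demo_exit(void)
    {
            /* with the fix, every freed sub-page has its memcg set and
               is uncharged here as well */
            free_pages_exact(buf, SZ_1M + SZ_4K);
    }

    module_init(memcg_split_demo_init);
    module_exit(memcg_split_demo_exit);
    MODULE_LICENSE("GPL");
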
diff --git a/queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch b/queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch
new file mode 100644 (file)
index 0000000..86e4e86
--- /dev/null
@@ -0,0 +1,130 @@
+From 6ce64428d62026a10cb5d80138ff2f90cc21d367 Mon Sep 17 00:00:00 2001
+From: Nadav Amit <namit@vmware.com>
+Date: Fri, 12 Mar 2021 21:08:17 -0800
+Subject: mm/userfaultfd: fix memory corruption due to writeprotect
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Nadav Amit <namit@vmware.com>
+
+commit 6ce64428d62026a10cb5d80138ff2f90cc21d367 upstream.
+
+Userfaultfd self-test fails occasionally, indicating a memory corruption.
+
+Analyzing this problem indicates that there is a real bug since mmap_lock
+is only taken for read in mwriteprotect_range() and defers flushes, and
+since there is insufficient consideration of concurrent deferred TLB
+flushes in wp_page_copy().  Although the PTE is flushed from the TLBs in
+wp_page_copy(), this flush takes place after the copy has already been
+performed, and therefore changes of the page are possible between the time
+of the copy and the time in which the PTE is flushed.
+
+To make matters worse, memory-unprotection using userfaultfd also poses a
+problem.  Although memory unprotection is logically a promotion of PTE
+permissions, and therefore should not require a TLB flush, the current
+userfaultfd code might actually cause a demotion of the architectural PTE
+permission: when userfaultfd_writeprotect() unprotects memory region, it
+unintentionally *clears* the RW-bit if it was already set.  Note that
+unprotecting a PTE that is not write-protected is a valid use-case: the
+userfaultfd monitor might ask to unprotect a region that holds both
+write-protected and write-unprotected PTEs.
+
+The scenario that happens in selftests/vm/userfaultfd is as follows:
+
+cpu0                           cpu1                    cpu2
+----                           ----                    ----
+                                                       [ Writable PTE
+                                                         cached in TLB ]
+userfaultfd_writeprotect()
+[ write-*unprotect* ]
+mwriteprotect_range()
+mmap_read_lock()
+change_protection()
+
+change_protection_range()
+...
+change_pte_range()
+[ *clear* “write”-bit ]
+[ defer TLB flushes ]
+                               [ page-fault ]
+                               ...
+                               wp_page_copy()
+                                cow_user_page()
+                                 [ copy page ]
+                                                       [ write to old
+                                                         page ]
+                               ...
+                                set_pte_at_notify()
+
+A similar scenario can happen:
+
+cpu0           cpu1            cpu2            cpu3
+----           ----            ----            ----
+                                               [ Writable PTE
+                                                 cached in TLB ]
+userfaultfd_writeprotect()
+[ write-protect ]
+[ deferred TLB flush ]
+               userfaultfd_writeprotect()
+               [ write-unprotect ]
+               [ deferred TLB flush]
+                               [ page-fault ]
+                               wp_page_copy()
+                                cow_user_page()
+                                [ copy page ]
+                                ...            [ write to page ]
+                               set_pte_at_notify()
+
+This race exists since commit 292924b26024 ("userfaultfd: wp: apply
+_PAGE_UFFD_WP bit").  Yet, as Yu Zhao pointed, these races became apparent
+since commit 09854ba94c6a ("mm: do_wp_page() simplification") which made
+wp_page_copy() more likely to take place, specifically if page_count(page)
+> 1.
+
+To resolve the aforementioned races, check whether there are pending
+flushes on uffd-write-protected VMAs, and if there are, perform a flush
+before doing the COW.
+
+Further optimizations will follow to avoid unnecessary PTE write-protection
+and TLB flushes during uffd-write-unprotect.
+
+Link: https://lkml.kernel.org/r/20210304095423.3825684-1-namit@vmware.com
+Fixes: 09854ba94c6a ("mm: do_wp_page() simplification")
+Signed-off-by: Nadav Amit <namit@vmware.com>
+Suggested-by: Yu Zhao <yuzhao@google.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Tested-by: Peter Xu <peterx@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Pavel Emelyanov <xemul@openvz.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Will Deacon <will@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: <stable@vger.kernel.org>   [5.9+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3092,6 +3092,14 @@ static vm_fault_t do_wp_page(struct vm_f
+               return handle_userfault(vmf, VM_UFFD_WP);
+       }
++      /*
++       * Userfaultfd write-protect can defer flushes. Ensure the TLB
++       * is flushed in this case before copying.
++       */
++      if (unlikely(userfaultfd_wp(vmf->vma) &&
++                   mm_tlb_flush_pending(vmf->vma->vm_mm)))
++              flush_tlb_page(vmf->vma, vmf->address);
++
+       vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+       if (!vmf->page) {
+               /*
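For context, a minimal userspace sketch (not part of the patch, error handling omitted) of the uffd-wp operations the changelog refers to; it assumes a kernel with UFFD_FEATURE_PAGEFAULT_FLAG_WP support. The final UFFDIO_WRITEPROTECT with mode 0 is the "write-unprotect" that used to clear the RW bit even on PTEs that were never write-protected.

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            int uffd = (int)syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            struct uffdio_api api = {
                    .api = UFFD_API,
                    .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
            };
            ioctl(uffd, UFFDIO_API, &api);

            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = page },
                    .mode  = UFFDIO_REGISTER_MODE_WP,
            };
            ioctl(uffd, UFFDIO_REGISTER, &reg);

            /* write-protect the range ... */
            struct uffdio_writeprotect wp = {
                    .range = { .start = (unsigned long)area, .len = page },
                    .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
            };
            ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

            /* ... then write-unprotect it: mode 0 clears uffd-wp on the
               whole range, including PTEs that were never write-protected
               (the case the changelog highlights) */
            wp.mode = 0;
            ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

            return 0;
    }
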
diff --git a/queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch b/queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch
new file mode 100644 (file)
index 0000000..e799db9
--- /dev/null
@@ -0,0 +1,36 @@
+From cea15316ceee2d4a51dfdecd79e08a438135416c Mon Sep 17 00:00:00 2001
+From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
+Date: Thu, 4 Mar 2021 07:34:11 +0530
+Subject: powerpc/64s: Fix instruction encoding for lis in ppc_function_entry()
+
+From: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+
+commit cea15316ceee2d4a51dfdecd79e08a438135416c upstream.
+
+'lis r2,N' is 'addis r2,0,N' and the instruction encoding in the macro
+LIS_R2 is incorrect (it currently maps to 'addis r0,r2,N'). Fix the
+same.
+
+Fixes: c71b7eff426f ("powerpc: Add ABIv2 support to ppc_function_entry")
+Cc: stable@vger.kernel.org # v3.16+
+Reported-by: Jiri Olsa <jolsa@redhat.com>
+Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+Acked-by: Segher Boessenkool <segher@kernel.crashing.org>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210304020411.16796-1-naveen.n.rao@linux.vnet.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/include/asm/code-patching.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/include/asm/code-patching.h
++++ b/arch/powerpc/include/asm/code-patching.h
+@@ -73,7 +73,7 @@ void __patch_exception(int exc, unsigned
+ #endif
+ #define OP_RT_RA_MASK 0xffff0000UL
+-#define LIS_R2                0x3c020000UL
++#define LIS_R2                0x3c400000UL
+ #define ADDIS_R2_R12  0x3c4c0000UL
+ #define ADDI_R2_R2    0x38420000UL
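A small standalone C sketch (not part of the patch) of the D-form encoding involved, showing why 0x3c400000 is 'lis r2,0' (addis with RT=2, RA=0) while the old 0x3c020000 decodes as 'addis r0,r2,0':

    #include <stdio.h>

    /* addis is D-form: primary opcode 15 in the top 6 bits, RT in the next
       5 bits, RA in the 5 bits below that, SI in the low 16 bits */
    static unsigned int addis(unsigned int rt, unsigned int ra, unsigned int si)
    {
            return (15u << 26) | ((rt & 0x1f) << 21) | ((ra & 0x1f) << 16) |
                   (si & 0xffff);
    }

    int main(void)
    {
            printf("lis r2,0  (addis r2,r0,0): 0x%08x\n", addis(2, 0, 0));
            printf("old LIS_R2 (addis r0,r2,0): 0x%08x\n", addis(0, 2, 0));
            return 0;
    }
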
diff --git a/queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch b/queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch
new file mode 100644 (file)
index 0000000..b882bc0
--- /dev/null
@@ -0,0 +1,45 @@
+From 73ac79881804eed2e9d76ecdd1018037f8510cb1 Mon Sep 17 00:00:00 2001
+From: Nicholas Piggin <npiggin@gmail.com>
+Date: Mon, 8 Mar 2021 18:55:30 +1000
+Subject: powerpc: Fix inverted SET_FULL_REGS bitop
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+commit 73ac79881804eed2e9d76ecdd1018037f8510cb1 upstream.
+
+This bit operation was inverted and set the low bit rather than
+clearing it, breaking the ability to ptrace non-volatile GPRs after
+exec. Fix.
+
+Only affects 64e and 32-bit.
+
+Fixes: feb9df3462e6 ("powerpc/64s: Always has full regs, so remove remnant checks")
+Cc: stable@vger.kernel.org # v5.8+
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210308085530.3191843-1-npiggin@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/include/asm/ptrace.h |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/include/asm/ptrace.h
++++ b/arch/powerpc/include/asm/ptrace.h
+@@ -195,7 +195,7 @@ static inline void regs_set_return_value
+ #define TRAP_FLAGS_MASK               0x11
+ #define TRAP(regs)            ((regs)->trap & ~TRAP_FLAGS_MASK)
+ #define FULL_REGS(regs)               (((regs)->trap & 1) == 0)
+-#define SET_FULL_REGS(regs)   ((regs)->trap |= 1)
++#define SET_FULL_REGS(regs)   ((regs)->trap &= ~1)
+ #endif
+ #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs))
+ #define NV_REG_POISON         0xdeadbeefdeadbeefUL
+@@ -210,7 +210,7 @@ static inline void regs_set_return_value
+ #define TRAP_FLAGS_MASK               0x1F
+ #define TRAP(regs)            ((regs)->trap & ~TRAP_FLAGS_MASK)
+ #define FULL_REGS(regs)               (((regs)->trap & 1) == 0)
+-#define SET_FULL_REGS(regs)   ((regs)->trap |= 1)
++#define SET_FULL_REGS(regs)   ((regs)->trap &= ~1)
+ #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0)
+ #define IS_MCHECK_EXC(regs)   (((regs)->trap & 4) != 0)
+ #define IS_DEBUG_EXC(regs)    (((regs)->trap & 8) != 0)
diff --git a/queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch b/queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch
new file mode 100644 (file)
index 0000000..609935d
--- /dev/null
@@ -0,0 +1,83 @@
+From bd73758803c2eedc037c2268b65a19542a832594 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Tue, 9 Mar 2021 08:39:39 +0000
+Subject: powerpc: Fix missing declaration of [en/dis]able_kernel_vsx()
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit bd73758803c2eedc037c2268b65a19542a832594 upstream.
+
+Add stub instances of enable_kernel_vsx() and disable_kernel_vsx()
+when CONFIG_VSX is not set, to avoid the following build failure.
+
+  CC [M]  drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o
+  In file included from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services_types.h:29,
+                   from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services.h:37,
+                   from drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:27:
+  drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c: In function 'dcn_bw_apply_registry_override':
+  ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:64:3: error: implicit declaration of function 'enable_kernel_vsx'; did you mean 'enable_kernel_fp'? [-Werror=implicit-function-declaration]
+     64 |   enable_kernel_vsx(); \
+        |   ^~~~~~~~~~~~~~~~~
+  drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:640:2: note: in expansion of macro 'DC_FP_START'
+    640 |  DC_FP_START();
+        |  ^~~~~~~~~~~
+  ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:75:3: error: implicit declaration of function 'disable_kernel_vsx'; did you mean 'disable_kernel_fp'? [-Werror=implicit-function-declaration]
+     75 |   disable_kernel_vsx(); \
+        |   ^~~~~~~~~~~~~~~~~~
+  drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:676:2: note: in expansion of macro 'DC_FP_END'
+    676 |  DC_FP_END();
+        |  ^~~~~~~~~
+  cc1: some warnings being treated as errors
+  make[5]: *** [drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o] Error 1
+
+This works because the caller is checking if VSX is available using
+cpu_has_feature():
+
+  #define DC_FP_START() { \
+       if (cpu_has_feature(CPU_FTR_VSX_COMP)) { \
+               preempt_disable(); \
+               enable_kernel_vsx(); \
+       } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { \
+               preempt_disable(); \
+               enable_kernel_altivec(); \
+       } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { \
+               preempt_disable(); \
+               enable_kernel_fp(); \
+       } \
+
+When CONFIG_VSX is not selected, cpu_has_feature(CPU_FTR_VSX_COMP)
+constant folds to 'false' so the call to enable_kernel_vsx() is
+discarded and the build succeeds.
+
+Fixes: 16a9dea110a6 ("amdgpu: Enable initial DCN support on POWER")
+Cc: stable@vger.kernel.org # v5.6+
+Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+[mpe: Incorporate some discussion comments into the change log]
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/8d7d285a027e9d21f5ff7f850fa71a2655b0c4af.1615279170.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/include/asm/switch_to.h |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/powerpc/include/asm/switch_to.h
++++ b/arch/powerpc/include/asm/switch_to.h
+@@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(vo
+ {
+       msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
+ }
++#else
++static inline void enable_kernel_vsx(void)
++{
++      BUILD_BUG();
++}
++
++static inline void disable_kernel_vsx(void)
++{
++      BUILD_BUG();
++}
+ #endif
+ #ifdef CONFIG_SPE
diff --git a/queue-5.11/sched-collate-affine_move_task-stoppers.patch b/queue-5.11/sched-collate-affine_move_task-stoppers.patch
new file mode 100644 (file)
index 0000000..5a1da4c
--- /dev/null
@@ -0,0 +1,64 @@
+From 58b1a45086b5f80f2b2842aa7ed0da51a64a302b Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 24 Feb 2021 11:15:23 +0100
+Subject: sched: Collate affine_move_task() stoppers
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 58b1a45086b5f80f2b2842aa7ed0da51a64a302b upstream.
+
+The SCA_MIGRATE_ENABLE and task_running() cases are almost identical;
+collapse them to avoid further duplication.
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.500108964@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   23 ++++++++---------------
+ 1 file changed, 8 insertions(+), 15 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -2279,30 +2279,23 @@ static int affine_move_task(struct rq *r
+               return -EINVAL;
+       }
+-      if (flags & SCA_MIGRATE_ENABLE) {
+-
+-              refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+-              p->migration_flags &= ~MDF_PUSH;
+-              task_rq_unlock(rq, p, rf);
+-
+-              stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+-                                  &pending->arg, &pending->stop_work);
+-
+-              return 0;
+-      }
+-
+       if (task_running(rq, p) || p->state == TASK_WAKING) {
+               /*
+-               * Lessen races (and headaches) by delegating
+-               * is_migration_disabled(p) checks to the stopper, which will
+-               * run on the same CPU as said p.
++               * MIGRATE_ENABLE gets here because 'p == current', but for
++               * anything else we cannot do is_migration_disabled(), punt
++               * and have the stopper function handle it all race-free.
+                */
++
+               refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
++              if (flags & SCA_MIGRATE_ENABLE)
++                      p->migration_flags &= ~MDF_PUSH;
+               task_rq_unlock(rq, p, rf);
+               stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+                                   &pending->arg, &pending->stop_work);
++              if (flags & SCA_MIGRATE_ENABLE)
++                      return 0;
+       } else {
+               if (!is_migration_disabled(p)) {
diff --git a/queue-5.11/sched-fix-affine_move_task-self-concurrency.patch b/queue-5.11/sched-fix-affine_move_task-self-concurrency.patch
new file mode 100644 (file)
index 0000000..f507f7c
--- /dev/null
@@ -0,0 +1,91 @@
+From 9e81889c7648d48dd5fe13f41cbc99f3c362484a Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 24 Feb 2021 11:31:09 +0100
+Subject: sched: Fix affine_move_task() self-concurrency
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 9e81889c7648d48dd5fe13f41cbc99f3c362484a upstream.
+
+Consider:
+
+   sched_setaffinity(p, X);            sched_setaffinity(p, Y);
+
+Then the first will install p->migration_pending = &my_pending; and
+issue stop_one_cpu_nowait(pending); and the second one will read
+p->migration_pending and _also_ issue: stop_one_cpu_nowait(pending),
+the _SAME_ @pending.
+
+This causes stopper list corruption.
+
+Add set_affinity_pending::stop_pending, to indicate if a stopper is in
+progress.
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.649146419@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1864,6 +1864,7 @@ struct migration_arg {
+ struct set_affinity_pending {
+       refcount_t              refs;
++      unsigned int            stop_pending;
+       struct completion       done;
+       struct cpu_stop_work    stop_work;
+       struct migration_arg    arg;
+@@ -1982,12 +1983,15 @@ static int migration_cpu_stop(void *data
+                * determine is_migration_disabled() and so have to chase after
+                * it.
+                */
++              WARN_ON_ONCE(!pending->stop_pending);
+               task_rq_unlock(rq, p, &rf);
+               stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+                                   &pending->arg, &pending->stop_work);
+               return 0;
+       }
+ out:
++      if (pending)
++              pending->stop_pending = false;
+       task_rq_unlock(rq, p, &rf);
+       if (complete)
+@@ -2183,7 +2187,7 @@ static int affine_move_task(struct rq *r
+                           int dest_cpu, unsigned int flags)
+ {
+       struct set_affinity_pending my_pending = { }, *pending = NULL;
+-      bool complete = false;
++      bool stop_pending, complete = false;
+       /* Can the task run on the task's current CPU? If so, we're done */
+       if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+@@ -2256,14 +2260,19 @@ static int affine_move_task(struct rq *r
+                * anything else we cannot do is_migration_disabled(), punt
+                * and have the stopper function handle it all race-free.
+                */
++              stop_pending = pending->stop_pending;
++              if (!stop_pending)
++                      pending->stop_pending = true;
+               refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+               if (flags & SCA_MIGRATE_ENABLE)
+                       p->migration_flags &= ~MDF_PUSH;
+               task_rq_unlock(rq, p, rf);
+-              stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+-                                  &pending->arg, &pending->stop_work);
++              if (!stop_pending) {
++                      stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
++                                          &pending->arg, &pending->stop_work);
++              }
+               if (flags & SCA_MIGRATE_ENABLE)
+                       return 0;
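A rough userspace sketch (not part of the patch) of the kind of pattern the changelog describes, with two threads racing sched_setaffinity() against the same target task; the target TID and CPU numbers are placeholders, and the function name is made up for illustration.

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <stdlib.h>

    static pid_t target_tid;        /* TID of some running task, from argv */

    static void *flip_affinity(void *arg)
    {
            int cpu = (int)(long)arg;
            cpu_set_t set;

            for (int i = 0; i < 100000; i++) {
                    CPU_ZERO(&set);
                    CPU_SET(cpu, &set);
                    /* concurrent callers could end up issuing stop work
                       for the same migration_pending before this fix */
                    sched_setaffinity(target_tid, sizeof(set), &set);
            }
            return NULL;
    }

    int main(int argc, char **argv)
    {
            pthread_t a, b;

            target_tid = (pid_t)atoi(argv[1]);
            pthread_create(&a, NULL, flip_affinity, (void *)0L);
            pthread_create(&b, NULL, flip_affinity, (void *)1L);
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            return 0;
    }
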
diff --git a/queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch b/queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch
new file mode 100644 (file)
index 0000000..4ebe5b5
--- /dev/null
@@ -0,0 +1,142 @@
+From 8a6edb5257e2a84720fe78cb179eca58ba76126f Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Sat, 13 Feb 2021 13:10:35 +0100
+Subject: sched: Fix migration_cpu_stop() requeueing
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 8a6edb5257e2a84720fe78cb179eca58ba76126f upstream.
+
+When affine_move_task(p) is called on a running task @p, which is not
+otherwise already changing affinity, we'll first set
+p->migration_pending and then do:
+
+        stop_one_cpu(cpu_of_rq(rq), migration_cpu_stop, &arg);
+
+This then gets us to migration_cpu_stop() running on the CPU that was
+previously running our victim task @p.
+
+If we find that our task is no longer on that runqueue (this can
+happen because of a concurrent migration due to load-balance etc.),
+then we'll end up at the:
+
+       } else if (dest_cpu < 0 || pending) {
+
+branch. Which we'll take because we set pending earlier. Here we first
+check if the task @p has already satisfied the affinity constraints,
+if so we bail early [A]. Otherwise we'll reissue migration_cpu_stop()
+onto the CPU that is now hosting our task @p:
+
+       stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+                           &pending->arg, &pending->stop_work);
+
+Except, we've never initialized pending->arg, which will be all 0s.
+
+This then results in running migration_cpu_stop() on the next CPU with
+arg->p == NULL, which gives the by now obvious result of fireworks.
+
+The cure is to change affine_move_task() to always use pending->arg;
+furthermore, we can use the exact same pattern as the
+SCA_MIGRATE_ENABLE case, since we'll block on the pending->done
+completion anyway, so there is no point in adding yet another completion
+in stop_one_cpu().
+
+This then gives a clear distinction between the two
+migration_cpu_stop() use cases:
+
+  - sched_exec() / migrate_task_to() : arg->pending == NULL
+  - affine_move_task() : arg->pending != NULL;
+
+And we can have it ignore p->migration_pending when !arg->pending. Any
+stop work from sched_exec() / migrate_task_to() is in addition to stop
+works from affine_move_task(), which will be sufficient to issue the
+completion.
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.357743989@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   39 ++++++++++++++++++++++++++++-----------
+ 1 file changed, 28 insertions(+), 11 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1922,6 +1922,24 @@ static int migration_cpu_stop(void *data
+       rq_lock(rq, &rf);
+       pending = p->migration_pending;
++      if (pending && !arg->pending) {
++              /*
++               * This happens from sched_exec() and migrate_task_to(),
++               * neither of them care about pending and just want a task to
++               * maybe move about.
++               *
++               * Even if there is a pending, we can ignore it, since
++               * affine_move_task() will have it's own stop_work's in flight
++               * which will manage the completion.
++               *
++               * Notably, pending doesn't need to match arg->pending. This can
++               * happen when tripple concurrent affine_move_task() first sets
++               * pending, then clears pending and eventually sets another
++               * pending.
++               */
++              pending = NULL;
++      }
++
+       /*
+        * If task_rq(p) != rq, it cannot be migrated here, because we're
+        * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+@@ -2194,10 +2212,6 @@ static int affine_move_task(struct rq *r
+                           int dest_cpu, unsigned int flags)
+ {
+       struct set_affinity_pending my_pending = { }, *pending = NULL;
+-      struct migration_arg arg = {
+-              .task = p,
+-              .dest_cpu = dest_cpu,
+-      };
+       bool complete = false;
+       /* Can the task run on the task's current CPU? If so, we're done */
+@@ -2235,6 +2249,12 @@ static int affine_move_task(struct rq *r
+                       /* Install the request */
+                       refcount_set(&my_pending.refs, 1);
+                       init_completion(&my_pending.done);
++                      my_pending.arg = (struct migration_arg) {
++                              .task = p,
++                              .dest_cpu = -1,         /* any */
++                              .pending = &my_pending,
++                      };
++
+                       p->migration_pending = &my_pending;
+               } else {
+                       pending = p->migration_pending;
+@@ -2265,12 +2285,6 @@ static int affine_move_task(struct rq *r
+               p->migration_flags &= ~MDF_PUSH;
+               task_rq_unlock(rq, p, rf);
+-              pending->arg = (struct migration_arg) {
+-                      .task = p,
+-                      .dest_cpu = -1,
+-                      .pending = pending,
+-              };
+-
+               stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+                                   &pending->arg, &pending->stop_work);
+@@ -2283,8 +2297,11 @@ static int affine_move_task(struct rq *r
+                * is_migration_disabled(p) checks to the stopper, which will
+                * run on the same CPU as said p.
+                */
++              refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+               task_rq_unlock(rq, p, rf);
+-              stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
++
++              stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
++                                  &pending->arg, &pending->stop_work);
+       } else {
diff --git a/queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch b/queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch
new file mode 100644 (file)
index 0000000..e536c20
--- /dev/null
@@ -0,0 +1,41 @@
+From ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 Mon Sep 17 00:00:00 2001
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Date: Wed, 17 Feb 2021 11:56:51 -0500
+Subject: sched/membarrier: fix missing local execution of ipi_sync_rq_state()
+
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+commit ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 upstream.
+
+The function sync_runqueues_membarrier_state() should copy the
+membarrier state from the @mm received as parameter to each runqueue
+currently running tasks using that mm.
+
+However, the use of smp_call_function_many() skips the current runqueue,
+which is unintended. Replace by a call to on_each_cpu_mask().
+
+Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load")
+Reported-by: Nadav Amit <nadav.amit@gmail.com>
+Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: stable@vger.kernel.org # 5.4.x+
+Link: https://lore.kernel.org/r/74F1E842-4A84-47BF-B6C2-5407DFDD4A4A@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/membarrier.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/kernel/sched/membarrier.c
++++ b/kernel/sched/membarrier.c
+@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_sta
+       }
+       rcu_read_unlock();
+-      preempt_disable();
+-      smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+-      preempt_enable();
++      on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
+       free_cpumask_var(tmpmask);
+       cpus_read_unlock();
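A kernel-style sketch (not from the patch; the helper names are made up for illustration) of the semantic difference being fixed: smp_call_function_many() never runs the function on the calling CPU, so the local update was silently skipped, while on_each_cpu_mask() also runs it locally when the current CPU is in the mask.

    #include <linux/cpumask.h>
    #include <linux/preempt.h>
    #include <linux/smp.h>

    /* what the old code would have needed: remote CPUs only, so the local
       CPU must be handled by hand */
    static void sync_mask_old_way(const struct cpumask *mask,
                                  smp_call_func_t func, void *info)
    {
            preempt_disable();
            smp_call_function_many(mask, func, info, 1);
            if (cpumask_test_cpu(smp_processor_id(), mask))
                    func(info);     /* the local step that was missing */
            preempt_enable();
    }

    /* what the fix switches to: runs func on every CPU set in @mask,
       including the local one, and waits for completion */
    static void sync_mask_new_way(const struct cpumask *mask,
                                  smp_call_func_t func, void *info)
    {
            on_each_cpu_mask(mask, func, info, true);
    }
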
diff --git a/queue-5.11/sched-optimize-migration_cpu_stop.patch b/queue-5.11/sched-optimize-migration_cpu_stop.patch
new file mode 100644 (file)
index 0000000..47a6bd5
--- /dev/null
@@ -0,0 +1,53 @@
+From 3f1bc119cd7fc987c8ed25ffb717f99403bb308c Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 24 Feb 2021 11:21:35 +0100
+Subject: sched: Optimize migration_cpu_stop()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 3f1bc119cd7fc987c8ed25ffb717f99403bb308c upstream.
+
+When the purpose of migration_cpu_stop() is to migrate the task to
+'any' valid CPU, don't migrate the task when it's already running on a
+valid CPU.
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.569238629@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1936,14 +1936,25 @@ static int migration_cpu_stop(void *data
+                       complete = true;
+               }
+-              if (dest_cpu < 0)
++              if (dest_cpu < 0) {
++                      if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
++                              goto out;
++
+                       dest_cpu = cpumask_any_distribute(&p->cpus_mask);
++              }
+               if (task_on_rq_queued(p))
+                       rq = __migrate_task(rq, &rf, p, dest_cpu);
+               else
+                       p->wake_cpu = dest_cpu;
++              /*
++               * XXX __migrate_task() can fail, at which point we might end
++               * up running on a dodgy CPU, AFAICT this can only happen
++               * during CPU hotplug, at which point we'll get pushed out
++               * anyway, so it's probably not a big deal.
++               */
++
+       } else if (pending) {
+               /*
+                * This happens when we get migrated between migrate_enable()'s
diff --git a/queue-5.11/sched-simplify-migration_cpu_stop.patch b/queue-5.11/sched-simplify-migration_cpu_stop.patch
new file mode 100644 (file)
index 0000000..764b095
--- /dev/null
@@ -0,0 +1,138 @@
+From c20cf065d4a619d394d23290093b1002e27dff86 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 24 Feb 2021 11:50:39 +0100
+Subject: sched: Simplify migration_cpu_stop()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c20cf065d4a619d394d23290093b1002e27dff86 upstream.
+
+When affine_move_task() issues a migration_cpu_stop(), the purpose of
+that function is to complete that @pending, not any random other
+p->migration_pending that might have gotten installed since.
+
+This realization much simplifies migration_cpu_stop() and allows
+further necessary steps to fix all this as it provides the guarantee
+that @pending's stopper will complete @pending (and not some random
+other @pending).
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.430014682@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   56 +++++++---------------------------------------------
+ 1 file changed, 8 insertions(+), 48 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1898,8 +1898,8 @@ static struct rq *__migrate_task(struct
+  */
+ static int migration_cpu_stop(void *data)
+ {
+-      struct set_affinity_pending *pending;
+       struct migration_arg *arg = data;
++      struct set_affinity_pending *pending = arg->pending;
+       struct task_struct *p = arg->task;
+       int dest_cpu = arg->dest_cpu;
+       struct rq *rq = this_rq();
+@@ -1921,25 +1921,6 @@ static int migration_cpu_stop(void *data
+       raw_spin_lock(&p->pi_lock);
+       rq_lock(rq, &rf);
+-      pending = p->migration_pending;
+-      if (pending && !arg->pending) {
+-              /*
+-               * This happens from sched_exec() and migrate_task_to(),
+-               * neither of them care about pending and just want a task to
+-               * maybe move about.
+-               *
+-               * Even if there is a pending, we can ignore it, since
+-               * affine_move_task() will have it's own stop_work's in flight
+-               * which will manage the completion.
+-               *
+-               * Notably, pending doesn't need to match arg->pending. This can
+-               * happen when tripple concurrent affine_move_task() first sets
+-               * pending, then clears pending and eventually sets another
+-               * pending.
+-               */
+-              pending = NULL;
+-      }
+-
+       /*
+        * If task_rq(p) != rq, it cannot be migrated here, because we're
+        * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+@@ -1950,31 +1931,20 @@ static int migration_cpu_stop(void *data
+                       goto out;
+               if (pending) {
+-                      p->migration_pending = NULL;
++                      if (p->migration_pending == pending)
++                              p->migration_pending = NULL;
+                       complete = true;
+               }
+-              /* migrate_enable() --  we must not race against SCA */
+-              if (dest_cpu < 0) {
+-                      /*
+-                       * When this was migrate_enable() but we no longer
+-                       * have a @pending, a concurrent SCA 'fixed' things
+-                       * and we should be valid again. Nothing to do.
+-                       */
+-                      if (!pending) {
+-                              WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+-                              goto out;
+-                      }
+-
++              if (dest_cpu < 0)
+                       dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+-              }
+               if (task_on_rq_queued(p))
+                       rq = __migrate_task(rq, &rf, p, dest_cpu);
+               else
+                       p->wake_cpu = dest_cpu;
+-      } else if (dest_cpu < 0 || pending) {
++      } else if (pending) {
+               /*
+                * This happens when we get migrated between migrate_enable()'s
+                * preempt_enable() and scheduling the stopper task. At that
+@@ -1989,23 +1959,14 @@ static int migration_cpu_stop(void *data
+                * ->pi_lock, so the allowed mask is stable - if it got
+                * somewhere allowed, we're done.
+                */
+-              if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+-                      p->migration_pending = NULL;
++              if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
++                      if (p->migration_pending == pending)
++                              p->migration_pending = NULL;
+                       complete = true;
+                       goto out;
+               }
+               /*
+-               * When this was migrate_enable() but we no longer have an
+-               * @pending, a concurrent SCA 'fixed' things and we should be
+-               * valid again. Nothing to do.
+-               */
+-              if (!pending) {
+-                      WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+-                      goto out;
+-              }
+-
+-              /*
+                * When migrate_enable() hits a rq mis-match we can't reliably
+                * determine is_migration_disabled() and so have to chase after
+                * it.
+@@ -2022,7 +1983,6 @@ out:
+               complete_all(&pending->done);
+       /* For pending->{arg,stop_work} */
+-      pending = arg->pending;
+       if (pending && refcount_dec_and_test(&pending->refs))
+               wake_up_var(&pending->refs);
diff --git a/queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch b/queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch
new file mode 100644 (file)
index 0000000..fbc787e
--- /dev/null
@@ -0,0 +1,124 @@
+From 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 24 Feb 2021 11:42:08 +0100
+Subject: sched: Simplify set_affinity_pending refcounts
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 upstream.
+
+Now that we have set_affinity_pending::stop_pending to indicate if a
+stopper is in progress, and we have the guarantee that if that stopper
+exists, it will (eventually) complete our @pending, we can simplify the
+refcount scheme by no longer counting the stopper thread.
+
+Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
+Cc: stable@kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Link: https://lkml.kernel.org/r/20210224131355.724130207@infradead.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c |   32 ++++++++++++++++++++------------
+ 1 file changed, 20 insertions(+), 12 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1862,6 +1862,10 @@ struct migration_arg {
+       struct set_affinity_pending     *pending;
+ };
++/*
++ * @refs: number of wait_for_completion()
++ * @stop_pending: is @stop_work in use
++ */
+ struct set_affinity_pending {
+       refcount_t              refs;
+       unsigned int            stop_pending;
+@@ -1997,10 +2001,6 @@ out:
+       if (complete)
+               complete_all(&pending->done);
+-      /* For pending->{arg,stop_work} */
+-      if (pending && refcount_dec_and_test(&pending->refs))
+-              wake_up_var(&pending->refs);
+-
+       return 0;
+ }
+@@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *r
+                       push_task = get_task_struct(p);
+               }
++              /*
++               * If there are pending waiters, but no pending stop_work,
++               * then complete now.
++               */
+               pending = p->migration_pending;
+-              if (pending) {
+-                      refcount_inc(&pending->refs);
++              if (pending && !pending->stop_pending) {
+                       p->migration_pending = NULL;
+                       complete = true;
+               }
++
+               task_rq_unlock(rq, p, rf);
+               if (push_task) {
+@@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *r
+               }
+               if (complete)
+-                      goto do_complete;
++                      complete_all(&pending->done);
+               return 0;
+       }
+@@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *r
+               if (!stop_pending)
+                       pending->stop_pending = true;
+-              refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+               if (flags & SCA_MIGRATE_ENABLE)
+                       p->migration_flags &= ~MDF_PUSH;
++
+               task_rq_unlock(rq, p, rf);
+               if (!stop_pending) {
+@@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *r
+                       if (task_on_rq_queued(p))
+                               rq = move_queued_task(rq, rf, p, dest_cpu);
+-                      p->migration_pending = NULL;
+-                      complete = true;
++                      if (!pending->stop_pending) {
++                              p->migration_pending = NULL;
++                              complete = true;
++                      }
+               }
+               task_rq_unlock(rq, p, rf);
+-do_complete:
+               if (complete)
+                       complete_all(&pending->done);
+       }
+@@ -2295,7 +2300,7 @@ do_complete:
+       wait_for_completion(&pending->done);
+       if (refcount_dec_and_test(&pending->refs))
+-              wake_up_var(&pending->refs);
++              wake_up_var(&pending->refs); /* No UaF, just an address */
+       /*
+        * Block the original owner of &pending until all subsequent callers
+@@ -2303,6 +2308,9 @@ do_complete:
+        */
+       wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
++      /* ARGH */
++      WARN_ON_ONCE(my_pending.stop_pending);
++
+       return 0;
+ }
index 54cb77eace8d26dcc035a6addd3b1a2c0054f0d8..a9396f5d12408b2652a8c50fe8953146fbdea0ef 100644 (file)
@@ -267,3 +267,39 @@ memblock-fix-section-mismatch-warning.patch
 stop_machine-mark-helpers-__always_inline.patch
 include-linux-sched-mm.h-use-rcu_dereference-in-in_v.patch
 prctl-fix-pr_set_mm_auxv-kernel-stack-leak.patch
+zram-fix-return-value-on-writeback_store.patch
+zram-fix-broken-page-writeback.patch
+linux-compiler-clang.h-define-have_builtin_bswap.patch
+sched-fix-migration_cpu_stop-requeueing.patch
+sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch
+sched-collate-affine_move_task-stoppers.patch
+sched-simplify-migration_cpu_stop.patch
+sched-optimize-migration_cpu_stop.patch
+sched-fix-affine_move_task-self-concurrency.patch
+sched-simplify-set_affinity_pending-refcounts.patch
+efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch
+powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch
+powerpc-fix-inverted-set_full_regs-bitop.patch
+powerpc-fix-missing-declaration-of-able_kernel_vsx.patch
+binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
+kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch
+kasan-fix-kasan_stack-dependency-for-hw_tags.patch
+x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch
+x86-sev-es-introduce-ip_within_syscall_gap-helper.patch
+x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch
+x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch
+x86-sev-es-use-__copy_from_user_inatomic.patch
+x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch
+kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch
+kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch
+kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch
+kvm-arm64-fix-range-alignment-when-walking-page-tables.patch
+kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch
+kvm-arm64-nvhe-save-the-spe-context-early.patch
+kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch
+kvm-arm64-fix-exclusive-limit-for-ipa-size.patch
+mm-highmem.c-fix-zero_user_segments-with-start-end.patch
+mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch
+mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch
+mm-memcg-set-memcg-when-splitting-page.patch
+mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch
diff --git a/queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch b/queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch
new file mode 100644 (file)
index 0000000..919b7e7
--- /dev/null
@@ -0,0 +1,44 @@
+From 5d5675df792ff67e74a500c4c94db0f99e6a10ef Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Thu, 4 Mar 2021 11:05:54 -0800
+Subject: x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls
+
+From: Andy Lutomirski <luto@kernel.org>
+
+commit 5d5675df792ff67e74a500c4c94db0f99e6a10ef upstream.
+
+On a 32-bit fast syscall that fails to read its arguments from user
+memory, the kernel currently does syscall exit work but not
+syscall entry work.  This confuses audit and ptrace.  For example:
+
+    $ ./tools/testing/selftests/x86/syscall_arg_fault_32
+    ...
+    strace: pid 264258: entering, ptrace_syscall_info.op == 2
+    ...
+
+This is a minimal fix intended for ease of backporting.  A more
+complete cleanup is coming.
+
+Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry")
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.luto@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/common.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/entry/common.c
++++ b/arch/x86/entry/common.c
+@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32
+               regs->ax = -EFAULT;
+               instrumentation_end();
+-              syscall_exit_to_user_mode(regs);
++              local_irq_disable();
++              irqentry_exit_to_user_mode(regs);
+               return false;
+       }
diff --git a/queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch b/queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch
new file mode 100644 (file)
index 0000000..db5d245
--- /dev/null
@@ -0,0 +1,60 @@
+From 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <jroedel@suse.de>
+Date: Wed, 3 Mar 2021 15:17:13 +0100
+Subject: x86/sev-es: Check regs->sp is trusted before adjusting #VC IST stack
+
+From: Joerg Roedel <jroedel@suse.de>
+
+commit 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 upstream.
+
+The code in the NMI handler to adjust the #VC handler IST stack is
+needed in case an NMI hits when the #VC handler is still using its IST
+stack.
+
+But the check for this condition also needs to verify that the regs->sp
+value is trusted, meaning it was not set by user-space. Extend the check
+to not use regs->sp when the NMI interrupted user-space code or the
+SYSCALL gap.
+
+Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler")
+Reported-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org # 5.10+
+Link: https://lkml.kernel.org/r/20210303141716.29223-3-joro@8bytes.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/sev-es.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/sev-es.c
++++ b/arch/x86/kernel/sev-es.c
+@@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int c
+       cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
+ }
+-static __always_inline bool on_vc_stack(unsigned long sp)
++static __always_inline bool on_vc_stack(struct pt_regs *regs)
+ {
++      unsigned long sp = regs->sp;
++
++      /* User-mode RSP is not trusted */
++      if (user_mode(regs))
++              return false;
++
++      /* SYSCALL gap still has user-mode RSP */
++      if (ip_within_syscall_gap(regs))
++              return false;
++
+       return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+ }
+@@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct p
+       old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+       /* Make room on the IST stack */
+-      if (on_vc_stack(regs->sp))
++      if (on_vc_stack(regs))
+               new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
+       else
+               new_ist = old_ist - sizeof(old_ist);
diff --git a/queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch b/queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch
new file mode 100644 (file)
index 0000000..39a3193
--- /dev/null
@@ -0,0 +1,57 @@
+From 62441a1fb53263bda349b6e5997c3cc5c120d89e Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <jroedel@suse.de>
+Date: Wed, 3 Mar 2021 15:17:15 +0100
+Subject: x86/sev-es: Correctly track IRQ states in runtime #VC handler
+
+From: Joerg Roedel <jroedel@suse.de>
+
+commit 62441a1fb53263bda349b6e5997c3cc5c120d89e upstream.
+
+Call irqentry_nmi_enter()/irqentry_nmi_exit() in the #VC handler to
+correctly track the IRQ state during its execution.
+
+Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler")
+Reported-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org # v5.10+
+Link: https://lkml.kernel.org/r/20210303141716.29223-5-joro@8bytes.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/sev-es.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/sev-es.c
++++ b/arch/x86/kernel/sev-es.c
+@@ -1258,13 +1258,12 @@ static __always_inline bool on_vc_fallba
+ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+ {
+       struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
++      irqentry_state_t irq_state;
+       struct ghcb_state state;
+       struct es_em_ctxt ctxt;
+       enum es_result result;
+       struct ghcb *ghcb;
+-      lockdep_assert_irqs_disabled();
+-
+       /*
+        * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+        */
+@@ -1273,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co
+               return;
+       }
++      irq_state = irqentry_nmi_enter(regs);
++      lockdep_assert_irqs_disabled();
+       instrumentation_begin();
+       /*
+@@ -1335,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co
+ out:
+       instrumentation_end();
++      irqentry_nmi_exit(regs, irq_state);
+       return;
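
A schematic sketch of the enter/exit pairing the patch adds around the handler body; the fake_* helpers below are stand-ins, since the real irqentry_nmi_enter()/irqentry_nmi_exit() only exist in kernel context and take the interrupted pt_regs:

#include <stdio.h>

/* Illustrative stand-in for irqentry_state_t: the handler keeps whatever
 * the entry helper returns and hands it back on the exit path. */
typedef struct { int was_nested; } fake_irq_state_t;

static fake_irq_state_t fake_nmi_enter(void)
{
    fake_irq_state_t s = { .was_nested = 0 };
    printf("enter: record IRQ/lockdep state\n");
    return s;
}

static void fake_nmi_exit(fake_irq_state_t s)
{
    printf("exit: restore recorded state (was_nested=%d)\n", s.was_nested);
}

static void fake_vc_handler(void)
{
    /* Pattern from the patch: enter before any instrumented work,
     * exit on the single common return path. */
    fake_irq_state_t irq_state = fake_nmi_enter();

    printf("handle #VC work here\n");

    fake_nmi_exit(irq_state);
}

int main(void)
{
    fake_vc_handler();
    return 0;
}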
diff --git a/queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch b/queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch
new file mode 100644 (file)
index 0000000..9209662
--- /dev/null
@@ -0,0 +1,90 @@
+From 78a81d88f60ba773cbe890205e1ee67f00502948 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <jroedel@suse.de>
+Date: Wed, 3 Mar 2021 15:17:12 +0100
+Subject: x86/sev-es: Introduce ip_within_syscall_gap() helper
+
+From: Joerg Roedel <jroedel@suse.de>
+
+commit 78a81d88f60ba773cbe890205e1ee67f00502948 upstream.
+
+Introduce a helper to check whether an exception came from the syscall
+gap and use it in the SEV-ES code. Extend the check to also cover the
+compatibility SYSCALL entry path.
+
+Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler")
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org # 5.10+
+Link: https://lkml.kernel.org/r/20210303141716.29223-2-joro@8bytes.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/entry_64_compat.S |    2 ++
+ arch/x86/include/asm/proto.h     |    1 +
+ arch/x86/include/asm/ptrace.h    |   15 +++++++++++++++
+ arch/x86/kernel/traps.c          |    3 +--
+ 4 files changed, 19 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat)
+       /* Switch to the kernel stack */
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
++SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
++
+       /* Construct struct pt_regs on stack */
+       pushq   $__USER32_DS            /* pt_regs->ss */
+       pushq   %r8                     /* pt_regs->sp */
+--- a/arch/x86/include/asm/proto.h
++++ b/arch/x86/include/asm/proto.h
+@@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(vo
+ void entry_SYSENTER_compat(void);
+ void __end_entry_SYSENTER_compat(void);
+ void entry_SYSCALL_compat(void);
++void entry_SYSCALL_compat_safe_stack(void);
+ void entry_INT80_compat(void);
+ #ifdef CONFIG_XEN_PV
+ void xen_entry_INT80_compat(void);
+--- a/arch/x86/include/asm/ptrace.h
++++ b/arch/x86/include/asm/ptrace.h
+@@ -94,6 +94,8 @@ struct pt_regs {
+ #include <asm/paravirt_types.h>
+ #endif
++#include <asm/proto.h>
++
+ struct cpuinfo_x86;
+ struct task_struct;
+@@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct
+ #ifdef CONFIG_X86_64
+ #define current_user_stack_pointer()  current_pt_regs()->sp
+ #define compat_user_stack_pointer()   current_pt_regs()->sp
++
++static inline bool ip_within_syscall_gap(struct pt_regs *regs)
++{
++      bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
++                  regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack);
++
++#ifdef CONFIG_IA32_EMULATION
++      ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
++                    regs->ip <  (unsigned long)entry_SYSCALL_compat_safe_stack);
++#endif
++
++      return ret;
++}
+ #endif
+ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_r
+        * In the SYSCALL entry path the RSP value comes from user-space - don't
+        * trust it and switch to the current kernel stack
+        */
+-      if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
+-          regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack) {
++      if (ip_within_syscall_gap(regs)) {
+               sp = this_cpu_read(cpu_current_top_of_stack);
+               goto sync;
+       }
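
A user-space sketch of the address-range test the helper performs; the two addresses are hypothetical, standing in for the entry_SYSCALL_64 and entry_SYSCALL_64_safe_stack symbols resolved by the linker:

#include <stdbool.h>
#include <stdio.h>

static const unsigned long syscall_entry      = 0xffffffff81a00000UL;
static const unsigned long syscall_safe_stack = 0xffffffff81a00040UL;

/* Same shape as the helper: an IP inside [entry, safe_stack) means the
 * kernel has not yet switched away from the user-controlled RSP. */
static bool ip_within_syscall_gap(unsigned long ip)
{
    return ip >= syscall_entry && ip < syscall_safe_stack;
}

int main(void)
{
    printf("%d\n", ip_within_syscall_gap(syscall_entry + 0x10)); /* 1 */
    printf("%d\n", ip_within_syscall_gap(syscall_safe_stack));   /* 0 */
    return 0;
}

The gap is the handful of instructions between the SYSCALL entry point and the stack switch, so an NMI or #VC landing there still runs with a user-controlled RSP; the compat entry range is checked the same way.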

diff --git a/queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch b/queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch
new file mode 100644 (file)
index 0000000..2cf04e7
--- /dev/null
@@ -0,0 +1,137 @@
+From bffe30dd9f1f3b2608a87ac909a224d6be472485 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <jroedel@suse.de>
+Date: Wed, 3 Mar 2021 15:17:16 +0100
+Subject: x86/sev-es: Use __copy_from_user_inatomic()
+
+From: Joerg Roedel <jroedel@suse.de>
+
+commit bffe30dd9f1f3b2608a87ac909a224d6be472485 upstream.
+
+The #VC handler must run in atomic context and cannot sleep. This is a
+problem when it tries to fetch instruction bytes from user-space via
+copy_from_user().
+
+Introduce an insn_fetch_from_user_inatomic() helper which uses
+__copy_from_user_inatomic() to safely copy the instruction bytes to
+kernel memory in the #VC handler.
+
+Fixes: 5e3427a7bc432 ("x86/sev-es: Handle instruction fetches from user-space")
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org # v5.10+
+Link: https://lkml.kernel.org/r/20210303141716.29223-6-joro@8bytes.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/insn-eval.h |    2 +
+ arch/x86/kernel/sev-es.c         |    2 -
+ arch/x86/lib/insn-eval.c         |   66 ++++++++++++++++++++++++++++++---------
+ 3 files changed, 55 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/include/asm/insn-eval.h
++++ b/arch/x86/include/asm/insn-eval.h
+@@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct p
+ int insn_get_code_seg_params(struct pt_regs *regs);
+ int insn_fetch_from_user(struct pt_regs *regs,
+                        unsigned char buf[MAX_INSN_SIZE]);
++int insn_fetch_from_user_inatomic(struct pt_regs *regs,
++                                unsigned char buf[MAX_INSN_SIZE]);
+ bool insn_decode(struct insn *insn, struct pt_regs *regs,
+                unsigned char buf[MAX_INSN_SIZE], int buf_size);
+--- a/arch/x86/kernel/sev-es.c
++++ b/arch/x86/kernel/sev-es.c
+@@ -258,7 +258,7 @@ static enum es_result vc_decode_insn(str
+       int res;
+       if (user_mode(ctxt->regs)) {
+-              res = insn_fetch_from_user(ctxt->regs, buffer);
++              res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+               if (!res) {
+                       ctxt->fi.vector     = X86_TRAP_PF;
+                       ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+--- a/arch/x86/lib/insn-eval.c
++++ b/arch/x86/lib/insn-eval.c
+@@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct in
+       }
+ }
++static unsigned long insn_get_effective_ip(struct pt_regs *regs)
++{
++      unsigned long seg_base = 0;
++
++      /*
++       * If not in user-space long mode, a custom code segment could be in
++       * use. This is true in protected mode (if the process defined a local
++       * descriptor table), or virtual-8086 mode. In most of the cases
++       * seg_base will be zero as in USER_CS.
++       */
++      if (!user_64bit_mode(regs)) {
++              seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
++              if (seg_base == -1L)
++                      return 0;
++      }
++
++      return seg_base + regs->ip;
++}
++
+ /**
+  * insn_fetch_from_user() - Copy instruction bytes from user-space memory
+  * @regs:     Structure with register values as seen when entering kernel mode
+@@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct in
+  */
+ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
+ {
+-      unsigned long seg_base = 0;
++      unsigned long ip;
+       int not_copied;
+-      /*
+-       * If not in user-space long mode, a custom code segment could be in
+-       * use. This is true in protected mode (if the process defined a local
+-       * descriptor table), or virtual-8086 mode. In most of the cases
+-       * seg_base will be zero as in USER_CS.
+-       */
+-      if (!user_64bit_mode(regs)) {
+-              seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+-              if (seg_base == -1L)
+-                      return 0;
+-      }
++      ip = insn_get_effective_ip(regs);
++      if (!ip)
++              return 0;
++
++      not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
++
++      return MAX_INSN_SIZE - not_copied;
++}
++
++/**
++ * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory
++ *                                   while in atomic code
++ * @regs:     Structure with register values as seen when entering kernel mode
++ * @buf:      Array to store the fetched instruction
++ *
++ * Gets the linear address of the instruction and copies the instruction bytes
++ * to the buf. This function must be used in atomic context.
++ *
++ * Returns:
++ *
++ * Number of instruction bytes copied.
++ *
++ * 0 if nothing was copied.
++ */
++int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
++{
++      unsigned long ip;
++      int not_copied;
++      ip = insn_get_effective_ip(regs);
++      if (!ip)
++              return 0;
+-      not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+-                                  MAX_INSN_SIZE);
++      not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
+       return MAX_INSN_SIZE - not_copied;
+ }
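
A user-space sketch of the structure the patch introduces: a shared effective-IP helper plus a fetch routine with the same "bytes copied, 0 on failure" convention. fake_copy() stands in for the kernel copy primitives, which differ only in whether they may sleep or fault:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define MAX_INSN_SIZE 15

/* Stand-in for copy_from_user()/__copy_from_user_inatomic(); returns the
 * number of bytes NOT copied, like the kernel helpers. */
static size_t fake_copy(unsigned char *dst, const unsigned char *src, size_t n)
{
    memcpy(dst, src, n);
    return 0;
}

/* Mirrors insn_get_effective_ip(): add the code-segment base (zero here)
 * to the instruction pointer, or report failure with NULL/0. */
static const unsigned char *effective_ip(const unsigned char *ip)
{
    return ip;
}

static int fetch_insn(const unsigned char *ip, unsigned char buf[MAX_INSN_SIZE])
{
    const unsigned char *src = effective_ip(ip);
    size_t not_copied;

    if (!src)
        return 0;

    not_copied = fake_copy(buf, src, MAX_INSN_SIZE);
    return MAX_INSN_SIZE - (int)not_copied;
}

int main(void)
{
    unsigned char code[MAX_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; /* vmmcall */
    unsigned char buf[MAX_INSN_SIZE];

    printf("copied %d bytes\n", fetch_insn(code, buf));
    return 0;
}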
diff --git a/queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch b/queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch
new file mode 100644 (file)
index 0000000..b830b89
--- /dev/null
@@ -0,0 +1,87 @@
+From e504e74cc3a2c092b05577ce3e8e013fae7d94e6 Mon Sep 17 00:00:00 2001
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+Date: Fri, 5 Feb 2021 08:24:02 -0600
+Subject: x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
+
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+
+commit e504e74cc3a2c092b05577ce3e8e013fae7d94e6 upstream.
+
+KASAN reserves "redzone" areas between stack frames in order to detect
+stack overruns.  A read or write to such an area triggers a KASAN
+"stack-out-of-bounds" BUG.
+
+Normally, the ORC unwinder stays in-bounds and doesn't access the
+redzone.  But sometimes it can't find ORC metadata for a given
+instruction.  This can happen for code which is missing ORC metadata, or
+for generated code.  In such cases, the unwinder attempts to fall back
+to frame pointers, as a best-effort type thing.
+
+This fallback often works, but when it doesn't, the unwinder can get
+confused and go off into the weeds into the KASAN redzone, triggering
+the aforementioned KASAN BUG.
+
+But in this case, the unwinder's confusion is actually harmless and
+working as designed.  It already has checks in place to prevent
+off-stack accesses, but those checks get short-circuited by the KASAN
+BUG.  And a BUG is a lot more disruptive than a harmless unwinder
+warning.
+
+Disable the KASAN checks by using READ_ONCE_NOCHECK() for all stack
+accesses.  This finishes the job started by commit 881125bfe65b
+("x86/unwind: Disable KASAN checking in the ORC unwinder"), which only
+partially fixed the issue.
+
+Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder")
+Reported-by: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Tested-by: Ivan Babrou <ivan@cloudflare.com>
+Cc: stable@kernel.org
+Link: https://lkml.kernel.org/r/9583327904ebbbeda399eca9c56d6c7085ac20fe.1612534649.git.jpoimboe@redhat.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/unwind_orc.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kernel/unwind_orc.c
++++ b/arch/x86/kernel/unwind_orc.c
+@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwi
+       if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
+               return false;
+-      *ip = regs->ip;
+-      *sp = regs->sp;
++      *ip = READ_ONCE_NOCHECK(regs->ip);
++      *sp = READ_ONCE_NOCHECK(regs->sp);
+       return true;
+ }
+@@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct
+       if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+               return false;
+-      *ip = regs->ip;
+-      *sp = regs->sp;
++      *ip = READ_ONCE_NOCHECK(regs->ip);
++      *sp = READ_ONCE_NOCHECK(regs->sp);
+       return true;
+ }
+@@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state
+               return false;
+       if (state->full_regs) {
+-              *val = ((unsigned long *)state->regs)[reg];
++              *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
+               return true;
+       }
+       if (state->prev_regs) {
+-              *val = ((unsigned long *)state->prev_regs)[reg];
++              *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
+               return true;
+       }
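
A rough model of what the unwinder relies on, assuming the GCC/Clang __typeof__ extension; the macro below only captures the single volatile load, while the real READ_ONCE_NOCHECK() additionally bypasses KASAN instrumentation so a best-effort read of possibly out-of-bounds stack memory warns instead of triggering a KASAN BUG:

#include <stdio.h>

/* Simplified READ_ONCE()-style load: one volatile access, no tearing by
 * the compiler. The NOCHECK variant in the kernel adds KASAN suppression. */
#define READ_ONCE_ISH(x) (*(const volatile __typeof__(x) *)&(x))

struct fake_regs {
    unsigned long ip;
    unsigned long sp;
};

int main(void)
{
    struct fake_regs regs = { .ip = 0x1234, .sp = 0x5678 };

    unsigned long ip = READ_ONCE_ISH(regs.ip);
    unsigned long sp = READ_ONCE_ISH(regs.sp);

    printf("ip=%#lx sp=%#lx\n", ip, sp);
    return 0;
}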
diff --git a/queue-5.11/zram-fix-broken-page-writeback.patch b/queue-5.11/zram-fix-broken-page-writeback.patch
new file mode 100644 (file)
index 0000000..444648f
--- /dev/null
@@ -0,0 +1,57 @@
+From 2766f1821600cc7562bae2128ad0b163f744c5d9 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Fri, 12 Mar 2021 21:08:41 -0800
+Subject: zram: fix broken page writeback
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit 2766f1821600cc7562bae2128ad0b163f744c5d9 upstream.
+
+commit 0d8359620d9b ("zram: support page writeback") introduced two
+problems.  It overwrites writeback_store's return value with kstrtol's
+return value, so the write syscall returns zero to user-space even
+though the data was written successfully.
+
+It also breaks the index handling in the loop: the index is no longer
+incremented, so only the first requested block index can be written
+back.  Users therefore cannot write back all idle pages in the zram
+device and lose the memory-saving opportunity.

+
+This patch fixes those issues.
+
+Link: https://lkml.kernel.org/r/20210312173949.2197662-2-minchan@kernel.org
+Fixes: 0d8359620d9b ("zram: support page writeback")
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Reported-by: Amos Bianchi <amosbianchi@google.com>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: John Dias <joaodias@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/zram/zram_drv.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/block/zram/zram_drv.c
++++ b/drivers/block/zram/zram_drv.c
+@@ -639,8 +639,8 @@ static ssize_t writeback_store(struct de
+               if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
+                       return -EINVAL;
+-              ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
+-              if (ret || index >= nr_pages)
++              if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
++                              index >= nr_pages)
+                       return -EINVAL;
+               nr_pages = 1;
+@@ -664,7 +664,7 @@ static ssize_t writeback_store(struct de
+               goto release_init_lock;
+       }
+-      while (nr_pages--) {
++      for (; nr_pages != 0; index++, nr_pages--) {
+               struct bio_vec bvec;
+               bvec.bv_page = page;
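
A toy model of the fixed loop shape, with the page writeback reduced to a printf; it only illustrates why the index must advance together with the remaining page count:

#include <stdio.h>

/* "Write back" pages [index, index + nr_pages). */
static void writeback_range(unsigned long index, unsigned long nr_pages)
{
    /* Patched shape: index advances with the page count, so every page in
     * the range is visited instead of the first one repeatedly. */
    for (; nr_pages != 0; index++, nr_pages--)
        printf("writing back page %lu\n", index);
}

int main(void)
{
    writeback_range(10, 3);   /* pages 10, 11, 12 */
    return 0;
}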
diff --git a/queue-5.11/zram-fix-return-value-on-writeback_store.patch b/queue-5.11/zram-fix-return-value-on-writeback_store.patch
new file mode 100644 (file)
index 0000000..81c625d
--- /dev/null
@@ -0,0 +1,60 @@
+From 57e0076e6575a7b7cef620a0bd2ee2549ef77818 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Fri, 12 Mar 2021 21:08:38 -0800
+Subject: zram: fix return value on writeback_store
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit 57e0076e6575a7b7cef620a0bd2ee2549ef77818 upstream.
+
+writeback_store's return value is overwritten by submit_bio_wait's return
+value.  Thus, writeback_store returns zero when there was no IO error.
+In the end, the write syscall from user-space sees zero as its return
+value, which can make the process stall, retrying the write until it
+succeeds.
+
+Link: https://lkml.kernel.org/r/20210312173949.2197662-1-minchan@kernel.org
+Fixes: 3b82a051c101 ("drivers/block/zram/zram_drv.c: fix error return codes not being returned in writeback_store")
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Colin Ian King <colin.king@canonical.com>
+Cc: John Dias <joaodias@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/zram/zram_drv.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/drivers/block/zram/zram_drv.c
++++ b/drivers/block/zram/zram_drv.c
+@@ -628,7 +628,7 @@ static ssize_t writeback_store(struct de
+       struct bio_vec bio_vec;
+       struct page *page;
+       ssize_t ret = len;
+-      int mode;
++      int mode, err;
+       unsigned long blk_idx = 0;
+       if (sysfs_streq(buf, "idle"))
+@@ -729,12 +729,17 @@ static ssize_t writeback_store(struct de
+                * XXX: A single page IO would be inefficient for write
+                * but it would be not bad as starter.
+                */
+-              ret = submit_bio_wait(&bio);
+-              if (ret) {
++              err = submit_bio_wait(&bio);
++              if (err) {
+                       zram_slot_lock(zram, index);
+                       zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+                       zram_clear_flag(zram, index, ZRAM_IDLE);
+                       zram_slot_unlock(zram, index);
++                      /*
+                       * Remember the last IO error so it is returned
+                       * even if later IOs succeed.
++                       */
++                      ret = err;
+                       continue;
+               }
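
A toy model of the ret/err split, with submit_bio_wait() replaced by a table of per-page status codes; it shows why the per-IO result must not overwrite the function's return value on success:

#include <stdio.h>

/* ret starts as the full input length and is only replaced when an IO
 * actually fails, matching the patched writeback_store(). */
static long store_pages(long len, const int *io_status, int n)
{
    long ret = len;
    int err;

    for (int i = 0; i < n; i++) {
        err = io_status[i];      /* 0 on success, negative errno on failure */
        if (err) {
            ret = err;           /* remember the last IO error */
            continue;
        }
        /* successful IO: deliberately leave ret untouched */
    }
    return ret;
}

int main(void)
{
    int ok[]   = { 0, 0, 0 };
    int fail[] = { 0, -5, 0 };   /* an -EIO-style failure in the middle */

    printf("all ok:   %ld\n", store_pages(4096, ok, 3));    /* 4096 */
    printf("one fail: %ld\n", store_pages(4096, fail, 3));  /* -5 */
    return 0;
}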