From: Greg Kroah-Hartman
Date: Mon, 15 Mar 2021 09:06:13 +0000 (+0100)
Subject: 5.11-stable patches
X-Git-Tag: v4.4.262~29
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2c7ad951f900f65474cb0b6911cca628957ec583;p=thirdparty%2Fkernel%2Fstable-queue.git

5.11-stable patches

added patches:
    binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
    efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch
    kasan-fix-kasan_stack-dependency-for-hw_tags.patch
    kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch
    kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch
    kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch
    kvm-arm64-fix-exclusive-limit-for-ipa-size.patch
    kvm-arm64-fix-range-alignment-when-walking-page-tables.patch
    kvm-arm64-nvhe-save-the-spe-context-early.patch
    kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch
    kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch
    kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch
    linux-compiler-clang.h-define-have_builtin_bswap.patch
    mm-highmem.c-fix-zero_user_segments-with-start-end.patch
    mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch
    mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch
    mm-memcg-set-memcg-when-splitting-page.patch
    mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch
    powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch
    powerpc-fix-inverted-set_full_regs-bitop.patch
    powerpc-fix-missing-declaration-of-able_kernel_vsx.patch
    sched-collate-affine_move_task-stoppers.patch
    sched-fix-affine_move_task-self-concurrency.patch
    sched-fix-migration_cpu_stop-requeueing.patch
    sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch
    sched-optimize-migration_cpu_stop.patch
    sched-simplify-migration_cpu_stop.patch
    sched-simplify-set_affinity_pending-refcounts.patch
    x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch
    x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch
    x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch
    x86-sev-es-introduce-ip_within_syscall_gap-helper.patch
    x86-sev-es-use-__copy_from_user_inatomic.patch
    x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch
    zram-fix-broken-page-writeback.patch
    zram-fix-return-value-on-writeback_store.patch
---

diff --git a/queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch b/queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
new file mode 100644
index 00000000000..5b3017bbcbb
--- /dev/null
+++ b/queue-5.11/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch
@@ -0,0 +1,118 @@
+From e7850f4d844e0acfac7e570af611d89deade3146 Mon Sep 17 00:00:00 2001
+From: Lior Ribak
+Date: Fri, 12 Mar 2021 21:07:41 -0800
+Subject: binfmt_misc: fix possible deadlock in bm_register_write
+
+From: Lior Ribak
+
+commit e7850f4d844e0acfac7e570af611d89deade3146 upstream.
+
+There is a deadlock in bm_register_write:
+
+First, in the begining of the function, a lock is taken on the binfmt_misc
+root inode with inode_lock(d_inode(root)).
+
+Then, if the user used the MISC_FMT_OPEN_FILE flag, the function will call
+open_exec on the user-provided interpreter.
+ +open_exec will call a path lookup, and if the path lookup process includes +the root of binfmt_misc, it will try to take a shared lock on its inode +again, but it is already locked, and the code will get stuck in a deadlock + +To reproduce the bug: +$ echo ":iiiii:E::ii::/proc/sys/fs/binfmt_misc/bla:F" > /proc/sys/fs/binfmt_misc/register + +backtrace of where the lock occurs (#5): +0 schedule () at ./arch/x86/include/asm/current.h:15 +1 0xffffffff81b51237 in rwsem_down_read_slowpath (sem=0xffff888003b202e0, count=, state=state@entry=2) at kernel/locking/rwsem.c:992 +2 0xffffffff81b5150a in __down_read_common (state=2, sem=) at kernel/locking/rwsem.c:1213 +3 __down_read (sem=) at kernel/locking/rwsem.c:1222 +4 down_read (sem=) at kernel/locking/rwsem.c:1355 +5 0xffffffff811ee22a in inode_lock_shared (inode=) at ./include/linux/fs.h:783 +6 open_last_lookups (op=0xffffc9000022fe34, file=0xffff888004098600, nd=0xffffc9000022fd10) at fs/namei.c:3177 +7 path_openat (nd=nd@entry=0xffffc9000022fd10, op=op@entry=0xffffc9000022fe34, flags=flags@entry=65) at fs/namei.c:3366 +8 0xffffffff811efe1c in do_filp_open (dfd=, pathname=pathname@entry=0xffff8880031b9000, op=op@entry=0xffffc9000022fe34) at fs/namei.c:3396 +9 0xffffffff811e493f in do_open_execat (fd=fd@entry=-100, name=name@entry=0xffff8880031b9000, flags=, flags@entry=0) at fs/exec.c:913 +10 0xffffffff811e4a92 in open_exec (name=) at fs/exec.c:948 +11 0xffffffff8124aa84 in bm_register_write (file=, buffer=, count=19, ppos=) at fs/binfmt_misc.c:682 +12 0xffffffff811decd2 in vfs_write (file=file@entry=0xffff888004098500, buf=buf@entry=0xa758d0 ":iiiii:E::ii::i:CF +", count=count@entry=19, pos=pos@entry=0xffffc9000022ff10) at fs/read_write.c:603 +13 0xffffffff811defda in ksys_write (fd=, buf=0xa758d0 ":iiiii:E::ii::i:CF +", count=19) at fs/read_write.c:658 +14 0xffffffff81b49813 in do_syscall_64 (nr=, regs=0xffffc9000022ff58) at arch/x86/entry/common.c:46 +15 0xffffffff81c0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120 + +To solve the issue, the open_exec call is moved to before the write +lock is taken by bm_register_write + +Link: https://lkml.kernel.org/r/20210228224414.95962-1-liorribak@gmail.com +Fixes: 948b701a607f1 ("binfmt_misc: add persistent opened binary handler for containers") +Signed-off-by: Lior Ribak +Acked-by: Helge Deller +Cc: Al Viro +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/binfmt_misc.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +--- a/fs/binfmt_misc.c ++++ b/fs/binfmt_misc.c +@@ -647,12 +647,24 @@ static ssize_t bm_register_write(struct + struct super_block *sb = file_inode(file)->i_sb; + struct dentry *root = sb->s_root, *dentry; + int err = 0; ++ struct file *f = NULL; + + e = create_entry(buffer, count); + + if (IS_ERR(e)) + return PTR_ERR(e); + ++ if (e->flags & MISC_FMT_OPEN_FILE) { ++ f = open_exec(e->interpreter); ++ if (IS_ERR(f)) { ++ pr_notice("register: failed to install interpreter file %s\n", ++ e->interpreter); ++ kfree(e); ++ return PTR_ERR(f); ++ } ++ e->interp_file = f; ++ } ++ + inode_lock(d_inode(root)); + dentry = lookup_one_len(e->name, root, strlen(e->name)); + err = PTR_ERR(dentry); +@@ -676,21 +688,6 @@ static ssize_t bm_register_write(struct + goto out2; + } + +- if (e->flags & MISC_FMT_OPEN_FILE) { +- struct file *f; +- +- f = open_exec(e->interpreter); +- if (IS_ERR(f)) { +- err = PTR_ERR(f); +- pr_notice("register: failed to install interpreter file %s\n", 
e->interpreter); +- simple_release_fs(&bm_mnt, &entry_count); +- iput(inode); +- inode = NULL; +- goto out2; +- } +- e->interp_file = f; +- } +- + e->dentry = dget(dentry); + inode->i_private = e; + inode->i_fop = &bm_entry_operations; +@@ -707,6 +704,8 @@ out: + inode_unlock(d_inode(root)); + + if (err) { ++ if (f) ++ filp_close(f, NULL); + kfree(e); + return err; + } diff --git a/queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch b/queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch new file mode 100644 index 00000000000..a99106f1984 --- /dev/null +++ b/queue-5.11/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch @@ -0,0 +1,59 @@ +From 9e9888a0fe97b9501a40f717225d2bef7100a2c1 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Fri, 5 Mar 2021 10:21:05 +0100 +Subject: efi: stub: omit SetVirtualAddressMap() if marked unsupported in RT_PROP table + +From: Ard Biesheuvel + +commit 9e9888a0fe97b9501a40f717225d2bef7100a2c1 upstream. + +The EFI_RT_PROPERTIES_TABLE contains a mask of runtime services that are +available after ExitBootServices(). This mostly does not concern the EFI +stub at all, given that it runs before that. However, there is one call +that is made at runtime, which is the call to SetVirtualAddressMap() +(which is not even callable at boot time to begin with) + +So add the missing handling of the RT_PROP table to ensure that we only +call SetVirtualAddressMap() if it is not being advertised as unsupported +by the firmware. + +Cc: # v5.10+ +Tested-by: Shawn Guo +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/efi-stub.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/firmware/efi/libstub/efi-stub.c ++++ b/drivers/firmware/efi/libstub/efi-stub.c +@@ -96,6 +96,18 @@ static void install_memreserve_table(voi + efi_err("Failed to install memreserve config table!\n"); + } + ++static u32 get_supported_rt_services(void) ++{ ++ const efi_rt_properties_table_t *rt_prop_table; ++ u32 supported = EFI_RT_SUPPORTED_ALL; ++ ++ rt_prop_table = get_efi_config_table(EFI_RT_PROPERTIES_TABLE_GUID); ++ if (rt_prop_table) ++ supported &= rt_prop_table->runtime_services_supported; ++ ++ return supported; ++} ++ + /* + * EFI entry point for the arm/arm64 EFI stubs. This is the entrypoint + * that is described in the PE/COFF header. Most of the code is the same +@@ -250,6 +262,10 @@ efi_status_t __efiapi efi_pe_entry(efi_h + (prop_tbl->memory_protection_attribute & + EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA); + ++ /* force efi_novamap if SetVirtualAddressMap() is unsupported */ ++ efi_novamap |= !(get_supported_rt_services() & ++ EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP); ++ + /* hibernation expects the runtime regions to stay in the same place */ + if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) { + /* diff --git a/queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch b/queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch new file mode 100644 index 00000000000..f9f8b7da544 --- /dev/null +++ b/queue-5.11/kasan-fix-kasan_stack-dependency-for-hw_tags.patch @@ -0,0 +1,52 @@ +From d9b571c885a8974fbb7d4ee639dbc643fd000f9e Mon Sep 17 00:00:00 2001 +From: Andrey Konovalov +Date: Fri, 12 Mar 2021 21:08:13 -0800 +Subject: kasan: fix KASAN_STACK dependency for HW_TAGS + +From: Andrey Konovalov + +commit d9b571c885a8974fbb7d4ee639dbc643fd000f9e upstream. 
+ +There's a runtime failure when running HW_TAGS-enabled kernel built with +GCC on hardware that doesn't support MTE. GCC-built kernels always have +CONFIG_KASAN_STACK enabled, even though stack instrumentation isn't +supported by HW_TAGS. Having that config enabled causes KASAN to issue +MTE-only instructions to unpoison kernel stacks, which causes the failure. + +Fix the issue by disallowing CONFIG_KASAN_STACK when HW_TAGS is used. + +(The commit that introduced CONFIG_KASAN_HW_TAGS specified proper + dependency for CONFIG_KASAN_STACK_ENABLE but not for CONFIG_KASAN_STACK.) + +Link: https://lkml.kernel.org/r/59e75426241dbb5611277758c8d4d6f5f9298dac.1615215441.git.andreyknvl@google.com +Fixes: 6a63a63ff1ac ("kasan: introduce CONFIG_KASAN_HW_TAGS") +Signed-off-by: Andrey Konovalov +Reported-by: Catalin Marinas +Cc: +Cc: Will Deacon +Cc: Vincenzo Frascino +Cc: Dmitry Vyukov +Cc: Andrey Ryabinin +Cc: Alexander Potapenko +Cc: Marco Elver +Cc: Peter Collingbourne +Cc: Evgenii Stepanov +Cc: Branislav Rankov +Cc: Kevin Brodsky +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + lib/Kconfig.kasan | 1 + + 1 file changed, 1 insertion(+) + +--- a/lib/Kconfig.kasan ++++ b/lib/Kconfig.kasan +@@ -156,6 +156,7 @@ config KASAN_STACK_ENABLE + + config KASAN_STACK + int ++ depends on KASAN_GENERIC || KASAN_SW_TAGS + default 1 if KASAN_STACK_ENABLE || CC_IS_GCC + default 0 + diff --git a/queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch b/queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch new file mode 100644 index 00000000000..857a828a080 --- /dev/null +++ b/queue-5.11/kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch @@ -0,0 +1,65 @@ +From f9d79e8dce4077d3c6ab739c808169dfa99af9ef Mon Sep 17 00:00:00 2001 +From: Andrey Konovalov +Date: Fri, 12 Mar 2021 21:08:10 -0800 +Subject: kasan, mm: fix crash with HW_TAGS and DEBUG_PAGEALLOC + +From: Andrey Konovalov + +commit f9d79e8dce4077d3c6ab739c808169dfa99af9ef upstream. + +Currently, kasan_free_nondeferred_pages()->kasan_free_pages() is called +after debug_pagealloc_unmap_pages(). This causes a crash when +debug_pagealloc is enabled, as HW_TAGS KASAN can't set tags on an +unmapped page. + +This patch puts kasan_free_nondeferred_pages() before +debug_pagealloc_unmap_pages() and arch_free_page(), which can also make +the page unavailable. + +Link: https://lkml.kernel.org/r/24cd7db274090f0e5bc3adcdc7399243668e3171.1614987311.git.andreyknvl@google.com +Fixes: 94ab5b61ee16 ("kasan, arm64: enable CONFIG_KASAN_HW_TAGS") +Signed-off-by: Andrey Konovalov +Cc: Catalin Marinas +Cc: Will Deacon +Cc: Vincenzo Frascino +Cc: Dmitry Vyukov +Cc: Andrey Ryabinin +Cc: Alexander Potapenko +Cc: Marco Elver +Cc: Peter Collingbourne +Cc: Evgenii Stepanov +Cc: Branislav Rankov +Cc: Kevin Brodsky +Cc: Christoph Hellwig +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_p + kernel_poison_pages(page, 1 << order); + + /* ++ * With hardware tag-based KASAN, memory tags must be set before the ++ * page becomes unavailable via debug_pagealloc or arch_free_page. ++ */ ++ kasan_free_nondeferred_pages(page, order); ++ ++ /* + * arch_free_page() can make the page's contents inaccessible. s390 + * does this. 
So nothing which can access the page's contents should + * happen after this. +@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_p + + debug_pagealloc_unmap_pages(page, 1 << order); + +- kasan_free_nondeferred_pages(page, order); +- + return true; + } + diff --git a/queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch b/queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch new file mode 100644 index 00000000000..7ae1b1e66a5 --- /dev/null +++ b/queue-5.11/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch @@ -0,0 +1,47 @@ +From 31948332d5fa392ad933f4a6a10026850649ed76 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Fri, 5 Mar 2021 18:52:48 +0000 +Subject: KVM: arm64: Avoid corrupting vCPU context register in guest exit + +From: Will Deacon + +commit 31948332d5fa392ad933f4a6a10026850649ed76 upstream. + +Commit 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest +context") tracks the currently running vCPU, clearing the pointer to +NULL on exit from a guest. + +Unfortunately, the use of 'set_loaded_vcpu' clobbers x1 to point at the +kvm_hyp_ctxt instead of the vCPU context, causing the subsequent RAS +code to go off into the weeds when it saves the DISR assuming that the +CPU context is embedded in a struct vCPU. + +Leave x1 alone and use x3 as a temporary register instead when clearing +the vCPU on the guest exit path. + +Cc: Marc Zyngier +Cc: Andrew Scull +Cc: +Fixes: 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest context") +Suggested-by: Quentin Perret +Signed-off-by: Will Deacon +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210226181211.14542-1-will@kernel.org +Message-Id: <20210305185254.3730990-3-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/entry.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/kvm/hyp/entry.S ++++ b/arch/arm64/kvm/hyp/entry.S +@@ -146,7 +146,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOB + // Now restore the hyp regs + restore_callee_saved_regs x2 + +- set_loaded_vcpu xzr, x1, x2 ++ set_loaded_vcpu xzr, x2, x3 + + alternative_if ARM64_HAS_RAS_EXTN + // If we have the RAS extensions we can consume a pending error diff --git a/queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch b/queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch new file mode 100644 index 00000000000..154f516c5e2 --- /dev/null +++ b/queue-5.11/kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch @@ -0,0 +1,152 @@ +From 01dc9262ff5797b675c32c0c6bc682777d23de05 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Wed, 3 Mar 2021 16:45:05 +0000 +Subject: KVM: arm64: Ensure I-cache isolation between vcpus of a same VM + +From: Marc Zyngier + +commit 01dc9262ff5797b675c32c0c6bc682777d23de05 upstream. + +It recently became apparent that the ARMv8 architecture has interesting +rules regarding attributes being used when fetching instructions +if the MMU is off at Stage-1. + +In this situation, the CPU is allowed to fetch from the PoC and +allocate into the I-cache (unless the memory is mapped with +the XN attribute at Stage-2). + +If we transpose this to vcpus sharing a single physical CPU, +it is possible for a vcpu running with its MMU off to influence +another vcpu running with its MMU on, as the latter is expected to +fetch from the PoU (and self-patching code doesn't flush below that +level). 
+ +In order to solve this, reuse the vcpu-private TLB invalidation +code to apply the same policy to the I-cache, nuking it every time +the vcpu runs on a physical CPU that ran another vcpu of the same +VM in the past. + +This involve renaming __kvm_tlb_flush_local_vmid() to +__kvm_flush_cpu_context(), and inserting a local i-cache invalidation +there. + +Cc: stable@vger.kernel.org +Signed-off-by: Marc Zyngier +Acked-by: Will Deacon +Acked-by: Catalin Marinas +Link: https://lore.kernel.org/r/20210303164505.68492-1-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_asm.h | 4 ++-- + arch/arm64/kvm/arm.c | 7 ++++++- + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +++--- + arch/arm64/kvm/hyp/nvhe/tlb.c | 3 ++- + arch/arm64/kvm/hyp/vhe/tlb.c | 3 ++- + 5 files changed, 15 insertions(+), 8 deletions(-) + +--- a/arch/arm64/include/asm/kvm_asm.h ++++ b/arch/arm64/include/asm/kvm_asm.h +@@ -47,7 +47,7 @@ + #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 + #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 + #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 +-#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 ++#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 + #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 + #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 + #define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +@@ -183,10 +183,10 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs + #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) + + extern void __kvm_flush_vm_context(void); ++extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); + extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, + int level); + extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); +-extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); + + extern void __kvm_timer_set_cntvoff(u64 cntvoff); + +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu + last_ran = this_cpu_ptr(mmu->last_vcpu_ran); + + /* ++ * We guarantee that both TLBs and I-cache are private to each ++ * vcpu. If detecting that a vcpu from the same VM has ++ * previously run on the same physical CPU, call into the ++ * hypervisor code to nuke the relevant contexts. ++ * + * We might get preempted before the vCPU actually runs, but + * over-invalidation doesn't affect correctness. 
+ */ + if (*last_ran != vcpu->vcpu_id) { +- kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); ++ kvm_call_hyp(__kvm_flush_cpu_context, mmu); + *last_ran = vcpu->vcpu_id; + } + +--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c ++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c +@@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid( + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); + } + +-static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) ++static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) + { + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + +- __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); ++ __kvm_flush_cpu_context(kern_hyp_va(mmu)); + } + + static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) +@@ -115,7 +115,7 @@ static const hcall_t *host_hcall[] = { + HANDLE_FUNC(__kvm_flush_vm_context), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid), +- HANDLE_FUNC(__kvm_tlb_flush_local_vmid), ++ HANDLE_FUNC(__kvm_flush_cpu_context), + HANDLE_FUNC(__kvm_timer_set_cntvoff), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), +--- a/arch/arm64/kvm/hyp/nvhe/tlb.c ++++ b/arch/arm64/kvm/hyp/nvhe/tlb.c +@@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_ + __tlb_switch_to_host(&cxt); + } + +-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) ++void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) + { + struct tlb_inv_context cxt; + +@@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct k + __tlb_switch_to_guest(mmu, &cxt); + + __tlbi(vmalle1); ++ asm volatile("ic iallu"); + dsb(nsh); + isb(); + +--- a/arch/arm64/kvm/hyp/vhe/tlb.c ++++ b/arch/arm64/kvm/hyp/vhe/tlb.c +@@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_ + __tlb_switch_to_host(&cxt); + } + +-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) ++void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) + { + struct tlb_inv_context cxt; + +@@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct k + __tlb_switch_to_guest(mmu, &cxt); + + __tlbi(vmalle1); ++ asm volatile("ic iallu"); + dsb(nsh); + isb(); + diff --git a/queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch b/queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch new file mode 100644 index 00000000000..e08b71bc576 --- /dev/null +++ b/queue-5.11/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch @@ -0,0 +1,44 @@ +From 262b003d059c6671601a19057e9fe1a5e7f23722 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 11 Mar 2021 10:00:16 +0000 +Subject: KVM: arm64: Fix exclusive limit for IPA size + +From: Marc Zyngier + +commit 262b003d059c6671601a19057e9fe1a5e7f23722 upstream. + +When registering a memslot, we check the size and location of that +memslot against the IPA size to ensure that we can provide guest +access to the whole of the memory. + +Unfortunately, this check rejects memslot that end-up at the exact +limit of the addressing capability for a given IPA size. For example, +it refuses the creation of a 2GB memslot at 0x8000000 with a 32bit +IPA space. + +Fix it by relaxing the check to accept a memslot reaching the +limit of the IPA space. 
+ +Fixes: c3058d5da222 ("arm/arm64: KVM: Ensure memslots are within KVM_PHYS_SIZE") +Reviewed-by: Eric Auger +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Reviewed-by: Andrew Jones +Link: https://lore.kernel.org/r/20210311100016.3830038-3-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/mmu.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1309,8 +1309,7 @@ int kvm_arch_prepare_memory_region(struc + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. + */ +- if (memslot->base_gfn + memslot->npages >= +- (kvm_phys_size(kvm) >> PAGE_SHIFT)) ++ if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) + return -EFAULT; + + mmap_read_lock(current->mm); diff --git a/queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch b/queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch new file mode 100644 index 00000000000..4b93f2496f6 --- /dev/null +++ b/queue-5.11/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch @@ -0,0 +1,44 @@ +From 357ad203d45c0f9d76a8feadbd5a1c5d460c638b Mon Sep 17 00:00:00 2001 +From: Jia He +Date: Fri, 5 Mar 2021 18:52:54 +0000 +Subject: KVM: arm64: Fix range alignment when walking page tables + +From: Jia He + +commit 357ad203d45c0f9d76a8feadbd5a1c5d460c638b upstream. + +When walking the page tables at a given level, and if the start +address for the range isn't aligned for that level, we propagate +the misalignment on each iteration at that level. + +This results in the walker ignoring a number of entries (depending +on the original misalignment) on each subsequent iteration. + +Properly aligning the address before the next iteration addresses +this issue. + +Cc: stable@vger.kernel.org +Reported-by: Howard Zhang +Acked-by: Will Deacon +Signed-off-by: Jia He +Fixes: b1e57de62cfb ("KVM: arm64: Add stand-alone page-table walker infrastructure") +[maz: rewrite commit message] +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210303024225.2591-1-justin.he@arm.com +Message-Id: <20210305185254.3730990-9-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/pgtable.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm64/kvm/hyp/pgtable.c ++++ b/arch/arm64/kvm/hyp/pgtable.c +@@ -225,6 +225,7 @@ static inline int __kvm_pgtable_visit(st + goto out; + + if (!table) { ++ data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); + data->addr += kvm_granule_size(level); + goto out; + } diff --git a/queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch b/queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch new file mode 100644 index 00000000000..0cb1e15e1db --- /dev/null +++ b/queue-5.11/kvm-arm64-nvhe-save-the-spe-context-early.patch @@ -0,0 +1,120 @@ +From b96b0c5de685df82019e16826a282d53d86d112c Mon Sep 17 00:00:00 2001 +From: Suzuki K Poulose +Date: Fri, 5 Mar 2021 18:52:47 +0000 +Subject: KVM: arm64: nvhe: Save the SPE context early + +From: Suzuki K Poulose + +commit b96b0c5de685df82019e16826a282d53d86d112c upstream. + +The nVHE KVM hyp drains and disables the SPE buffer, before +entering the guest, as the EL1&0 translation regime +is going to be loaded with that of the guest. + +But this operation is performed way too late, because : + - The owning translation regime of the SPE buffer + is transferred to EL2. (MDCR_EL2_E2PB == 0) + - The guest Stage1 is loaded. 
+ +Thus the flush could use the host EL1 virtual address, +but use the EL2 translations instead of host EL1, for writing +out any cached data. + +Fix this by moving the SPE buffer handling early enough. +The restore path is doing the right thing. + +Fixes: 014c4c77aad7 ("KVM: arm64: Improve debug register save/restore flow") +Cc: stable@vger.kernel.org +Cc: Christoffer Dall +Cc: Marc Zyngier +Cc: Will Deacon +Cc: Catalin Marinas +Cc: Mark Rutland +Cc: Alexandru Elisei +Reviewed-by: Alexandru Elisei +Signed-off-by: Suzuki K Poulose +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210302120345.3102874-1-suzuki.poulose@arm.com +Message-Id: <20210305185254.3730990-2-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_hyp.h | 5 +++++ + arch/arm64/kvm/hyp/nvhe/debug-sr.c | 12 ++++++++++-- + arch/arm64/kvm/hyp/nvhe/switch.c | 11 ++++++++++- + 3 files changed, 25 insertions(+), 3 deletions(-) + +--- a/arch/arm64/include/asm/kvm_hyp.h ++++ b/arch/arm64/include/asm/kvm_hyp.h +@@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(stru + void __debug_switch_to_guest(struct kvm_vcpu *vcpu); + void __debug_switch_to_host(struct kvm_vcpu *vcpu); + ++#ifdef __KVM_NVHE_HYPERVISOR__ ++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); ++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); ++#endif ++ + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); + void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); + +--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c ++++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c +@@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmsc + write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); + } + +-void __debug_switch_to_guest(struct kvm_vcpu *vcpu) ++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) + { + /* Disable and flush SPE data generation */ + __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); ++} ++ ++void __debug_switch_to_guest(struct kvm_vcpu *vcpu) ++{ + __debug_switch_to_guest_common(vcpu); + } + +-void __debug_switch_to_host(struct kvm_vcpu *vcpu) ++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) + { + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); ++} ++ ++void __debug_switch_to_host(struct kvm_vcpu *vcpu) ++{ + __debug_switch_to_host_common(vcpu); + } + +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu + pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); + + __sysreg_save_state_nvhe(host_ctxt); ++ /* ++ * We must flush and disable the SPE buffer for nVHE, as ++ * the translation regime(EL1&0) is going to be loaded with ++ * that of the guest. And we must do this before we change the ++ * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and ++ * before we load guest Stage1. ++ */ ++ __debug_save_host_buffers_nvhe(vcpu); + + __adjust_pc(vcpu); + +@@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu + if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + __fpsimd_save_fpexc32(vcpu); + ++ __debug_switch_to_host(vcpu); + /* + * This must come after restoring the host sysregs, since a non-VHE + * system may enable SPE here and make use of the TTBRs. 
+ */ +- __debug_switch_to_host(vcpu); ++ __debug_restore_host_buffers_nvhe(vcpu); + + if (pmu_switch_needed) + __pmu_switch_to_host(host_ctxt); diff --git a/queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch b/queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch new file mode 100644 index 00000000000..b1d294f8e21 --- /dev/null +++ b/queue-5.11/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch @@ -0,0 +1,88 @@ +From 7d717558dd5ef10d28866750d5c24ff892ea3778 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 11 Mar 2021 10:00:15 +0000 +Subject: KVM: arm64: Reject VM creation when the default IPA size is unsupported + +From: Marc Zyngier + +commit 7d717558dd5ef10d28866750d5c24ff892ea3778 upstream. + +KVM/arm64 has forever used a 40bit default IPA space, partially +due to its 32bit heritage (where the only choice is 40bit). + +However, there are implementations in the wild that have a *cough* +much smaller *cough* IPA space, which leads to a misprogramming of +VTCR_EL2, and a guest that is stuck on its first memory access +if userspace dares to ask for the default IPA setting (which most +VMMs do). + +Instead, blundly reject the creation of such VM, as we can't +satisfy the requirements from userspace (with a one-off warning). +Also clarify the boot warning, and document that the VM creation +will fail when an unsupported IPA size is provided. + +Although this is an ABI change, it doesn't really change much +for userspace: + +- the guest couldn't run before this change, but no error was + returned. At least userspace knows what is happening. + +- a memory slot that was accepted because it did fit the default + IPA space now doesn't even get a chance to be registered. + +The other thing that is left doing is to convince userspace to +actually use the IPA space setting instead of relying on the +antiquated default. + +Fixes: 233a7cb23531 ("kvm: arm64: Allow tuning the physical address size for VM") +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Reviewed-by: Andrew Jones +Reviewed-by: Eric Auger +Link: https://lore.kernel.org/r/20210311100016.3830038-2-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/virt/kvm/api.rst | 3 +++ + arch/arm64/kvm/reset.c | 12 ++++++++---- + 2 files changed, 11 insertions(+), 4 deletions(-) + +--- a/Documentation/virt/kvm/api.rst ++++ b/Documentation/virt/kvm/api.rst +@@ -182,6 +182,9 @@ is dependent on the CPU capability and t + be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION + ioctl() at run-time. + ++Creation of the VM will fail if the requested IPA size (whether it is ++implicit or explicit) is unsupported on the host. ++ + Please note that configuring the IPA size does not affect the capability + exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects + size of the address translated by the stage2 level (guest physical to +--- a/arch/arm64/kvm/reset.c ++++ b/arch/arm64/kvm/reset.c +@@ -324,10 +324,9 @@ int kvm_set_ipa_limit(void) + } + + kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); +- WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, +- "KVM IPA Size Limit (%d bits) is smaller than default size\n", +- kvm_ipa_limit); +- kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); ++ kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, ++ ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? 
++ " (Reduced IPA size, limited VM/VMM compatibility)" : "")); + + return 0; + } +@@ -356,6 +355,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; ++ if (phys_shift > kvm_ipa_limit) { ++ pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", ++ current->comm); ++ return -EINVAL; ++ } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch b/queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch new file mode 100644 index 00000000000..95fc0158fc5 --- /dev/null +++ b/queue-5.11/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch @@ -0,0 +1,78 @@ +From d7eb79c6290c7ae4561418544072e0a3266e7384 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Wed, 24 Feb 2021 09:37:29 +0800 +Subject: KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged + +From: Wanpeng Li + +commit d7eb79c6290c7ae4561418544072e0a3266e7384 upstream. + +# lscpu +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Byte Order: Little Endian +CPU(s): 88 +On-line CPU(s) list: 0-63 +Off-line CPU(s) list: 64-87 + +# cat /proc/cmdline +BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro +rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall + +# echo 1 > /sys/devices/system/cpu/cpu76/online +-bash: echo: write error: Cannot allocate memory + +The per-cpu vsyscall pvclock data pointer assigns either an element of the +static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory +hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if +kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in +kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall +and sometimes you end up with vsyscall disabled if the host does something +strange. This patch fixes it by allocating this dynamically memory +unconditionally even if vsyscall is disabled. 
+ +Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") +Reported-by: Zelin Deng +Cc: Brijesh Singh +Cc: stable@vger.kernel.org#v4.19-rc5+ +Signed-off-by: Wanpeng Li +Message-Id: <1614130683-24137-1-git-send-email-wanpengli@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/kvmclock.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/arch/x86/kernel/kvmclock.c ++++ b/arch/x86/kernel/kvmclock.c +@@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(voi + + static int __init kvm_setup_vsyscall_timeinfo(void) + { +-#ifdef CONFIG_X86_64 +- u8 flags; ++ kvmclock_init_mem(); + +- if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) +- return 0; ++#ifdef CONFIG_X86_64 ++ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { ++ u8 flags; + +- flags = pvclock_read_flags(&hv_clock_boot[0].pvti); +- if (!(flags & PVCLOCK_TSC_STABLE_BIT)) +- return 0; ++ flags = pvclock_read_flags(&hv_clock_boot[0].pvti); ++ if (!(flags & PVCLOCK_TSC_STABLE_BIT)) ++ return 0; + +- kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; ++ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; ++ } + #endif + +- kvmclock_init_mem(); +- + return 0; + } + early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch b/queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch new file mode 100644 index 00000000000..05441e9c867 --- /dev/null +++ b/queue-5.11/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch @@ -0,0 +1,46 @@ +From beda430177f56656e7980dcce93456ffaa35676b Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 18:18:08 -0800 +Subject: KVM: x86: Ensure deadline timer has truly expired before posting its IRQ + +From: Sean Christopherson + +commit beda430177f56656e7980dcce93456ffaa35676b upstream. + +When posting a deadline timer interrupt, open code the checks guarding +__kvm_wait_lapic_expire() in order to skip the lapic_timer_int_injected() +check in kvm_wait_lapic_expire(). The injection check will always fail +since the interrupt has not yet be injected. Moving the call after +injection would also be wrong as that wouldn't actually delay delivery +of the IRQ if it is indeed sent via posted interrupt. + +Fixes: 010fd37fddf6 ("KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210305021808.3769732-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/lapic.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -1641,7 +1641,16 @@ static void apic_timer_expired(struct kv + } + + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { +- kvm_wait_lapic_expire(vcpu); ++ /* ++ * Ensure the guest's timer has truly expired before posting an ++ * interrupt. Open code the relevant checks to avoid querying ++ * lapic_timer_int_injected(), which will be false since the ++ * interrupt isn't yet injected. Waiting until after injecting ++ * is not an option since that won't help a posted interrupt. 
++ */ ++ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && ++ vcpu->arch.apic->lapic_timer.timer_advance_ns) ++ __kvm_wait_lapic_expire(vcpu); + kvm_apic_inject_pending_timer_irqs(apic); + return; + } diff --git a/queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch b/queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch new file mode 100644 index 00000000000..4356e585c48 --- /dev/null +++ b/queue-5.11/linux-compiler-clang.h-define-have_builtin_bswap.patch @@ -0,0 +1,80 @@ +From 97e4910232fa1f81e806aa60c25a0450276d99a2 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Fri, 12 Mar 2021 21:07:47 -0800 +Subject: linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP* + +From: Arnd Bergmann + +commit 97e4910232fa1f81e806aa60c25a0450276d99a2 upstream. + +Separating compiler-clang.h from compiler-gcc.h inadventently dropped the +definitions of the three HAVE_BUILTIN_BSWAP macros, which requires falling +back to the open-coded version and hoping that the compiler detects it. + +Since all versions of clang support the __builtin_bswap interfaces, add +back the flags and have the headers pick these up automatically. + +This results in a 4% improvement of compilation speed for arm defconfig. + +Note: it might also be worth revisiting which architectures set +CONFIG_ARCH_USE_BUILTIN_BSWAP for one compiler or the other, today this is +set on six architectures (arm32, csky, mips, powerpc, s390, x86), while +another ten architectures define custom helpers (alpha, arc, ia64, m68k, +mips, nios2, parisc, sh, sparc, xtensa), and the rest (arm64, h8300, +hexagon, microblaze, nds32, openrisc, riscv) just get the unoptimized +version and rely on the compiler to detect it. + +A long time ago, the compiler builtins were architecture specific, but +nowadays, all compilers that are able to build the kernel have correct +implementations of them, though some may not be as optimized as the inline +asm versions. + +The patch that dropped the optimization landed in v4.19, so as discussed +it would be fairly safe to backport this revert to stable kernels to the +4.19/5.4/5.10 stable kernels, but there is a remaining risk for +regressions, and it has no known side-effects besides compile speed. 
+ +Link: https://lkml.kernel.org/r/20210226161151.2629097-1-arnd@kernel.org +Link: https://lore.kernel.org/lkml/20210225164513.3667778-1-arnd@kernel.org/ +Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") +Signed-off-by: Arnd Bergmann +Reviewed-by: Nathan Chancellor +Reviewed-by: Kees Cook +Acked-by: Miguel Ojeda +Acked-by: Nick Desaulniers +Acked-by: Luc Van Oostenryck +Cc: Masahiro Yamada +Cc: Nick Hu +Cc: Greentime Hu +Cc: Vincent Chen +Cc: Paul Walmsley +Cc: Palmer Dabbelt +Cc: Albert Ou +Cc: Guo Ren +Cc: Randy Dunlap +Cc: Sami Tolvanen +Cc: Marco Elver +Cc: Arvind Sankar +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/compiler-clang.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/include/linux/compiler-clang.h ++++ b/include/linux/compiler-clang.h +@@ -41,6 +41,12 @@ + #define __no_sanitize_thread + #endif + ++#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ ++ + #if __has_feature(undefined_behavior_sanitizer) + /* GCC does not have __SANITIZE_UNDEFINED__ */ + #define __no_sanitize_undefined \ diff --git a/queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch b/queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch new file mode 100644 index 00000000000..8df30ff278d --- /dev/null +++ b/queue-5.11/mm-highmem.c-fix-zero_user_segments-with-start-end.patch @@ -0,0 +1,77 @@ +From 184cee516f3e24019a08ac8eb5c7cf04c00933cb Mon Sep 17 00:00:00 2001 +From: OGAWA Hirofumi +Date: Fri, 12 Mar 2021 21:07:37 -0800 +Subject: mm/highmem.c: fix zero_user_segments() with start > end + +From: OGAWA Hirofumi + +commit 184cee516f3e24019a08ac8eb5c7cf04c00933cb upstream. + +zero_user_segments() is used from __block_write_begin_int(), for example +like the following + + zero_user_segments(page, 4096, 1024, 512, 918) + +But new the zero_user_segments() implementation for for HIGHMEM + +TRANSPARENT_HUGEPAGE doesn't handle "start > end" case correctly, and hits +BUG_ON(). (we can fix __block_write_begin_int() instead though, it is the +old and multiple usage) + +Also it calls kmap_atomic() unnecessarily while start == end == 0. 
+ +Link: https://lkml.kernel.org/r/87v9ab60r4.fsf@mail.parknet.co.jp +Fixes: 0060ef3b4e6d ("mm: support THPs in zero_user_segments") +Signed-off-by: OGAWA Hirofumi +Cc: Matthew Wilcox +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/highmem.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -368,20 +368,24 @@ void zero_user_segments(struct page *pag + + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + ++ if (start1 >= end1) ++ start1 = end1 = 0; ++ if (start2 >= end2) ++ start2 = end2 = 0; ++ + for (i = 0; i < compound_nr(page); i++) { + void *kaddr = NULL; + +- if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) +- kaddr = kmap_atomic(page + i); +- + if (start1 >= PAGE_SIZE) { + start1 -= PAGE_SIZE; + end1 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); + +- if (end1 > start1) ++ if (end1 > start1) { ++ kaddr = kmap_atomic(page + i); + memset(kaddr + start1, 0, this_end - start1); ++ } + end1 -= this_end; + start1 = 0; + } +@@ -392,8 +396,11 @@ void zero_user_segments(struct page *pag + } else { + unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); + +- if (end2 > start2) ++ if (end2 > start2) { ++ if (!kaddr) ++ kaddr = kmap_atomic(page + i); + memset(kaddr + start2, 0, this_end - start2); ++ } + end2 -= this_end; + start2 = 0; + } diff --git a/queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch b/queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch new file mode 100644 index 00000000000..0cb471d1741 --- /dev/null +++ b/queue-5.11/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch @@ -0,0 +1,82 @@ +From 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Fri, 12 Mar 2021 21:08:06 -0800 +Subject: mm/madvise: replace ptrace attach requirement for process_madvise + +From: Suren Baghdasaryan + +commit 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e upstream. + +process_madvise currently requires ptrace attach capability. +PTRACE_MODE_ATTACH gives one process complete control over another +process. It effectively removes the security boundary between the two +processes (in one direction). Granting ptrace attach capability even to a +system process is considered dangerous since it creates an attack surface. +This severely limits the usage of this API. + +The operations process_madvise can perform do not affect the correctness +of the operation of the target process; they only affect where the data is +physically located (and therefore, how fast it can be accessed). What we +want is the ability for one process to influence another process in order +to optimize performance across the entire system while leaving the +security boundary intact. + +Replace PTRACE_MODE_ATTACH with a combination of PTRACE_MODE_READ and +CAP_SYS_NICE. PTRACE_MODE_READ to prevent leaking ASLR metadata and +CAP_SYS_NICE for influencing process performance. 
+ +Link: https://lkml.kernel.org/r/20210303185807.2160264-1-surenb@google.com +Signed-off-by: Suren Baghdasaryan +Reviewed-by: Kees Cook +Acked-by: Minchan Kim +Acked-by: David Rientjes +Cc: Jann Horn +Cc: Jeff Vander Stoep +Cc: Michal Hocko +Cc: Shakeel Butt +Cc: Tim Murray +Cc: Florian Weimer +Cc: Oleg Nesterov +Cc: James Morris +Cc: [5.10+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/madvise.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1197,12 +1197,22 @@ SYSCALL_DEFINE5(process_madvise, int, pi + goto release_task; + } + +- mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + ++ /* ++ * Require CAP_SYS_NICE for influencing process performance. Note that ++ * only non-destructive hints are currently supported. ++ */ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { +@@ -1217,6 +1227,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + ++release_mm: + mmput(mm); + release_task: + put_task_struct(task); diff --git a/queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch b/queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch new file mode 100644 index 00000000000..2d907016d12 --- /dev/null +++ b/queue-5.11/mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch @@ -0,0 +1,106 @@ +From be6c8982e4ab9a41907555f601b711a7e2a17d4c Mon Sep 17 00:00:00 2001 +From: Zhou Guanghui +Date: Fri, 12 Mar 2021 21:08:30 -0800 +Subject: mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and add nr_pages argument + +From: Zhou Guanghui + +commit be6c8982e4ab9a41907555f601b711a7e2a17d4c upstream. + +Rename mem_cgroup_split_huge_fixup to split_page_memcg and explicitly pass +in page number argument. + +In this way, the interface name is more common and can be used by +potential users. In addition, the complete info(memcg and flag) of the +memcg needs to be set to the tail pages. + +Link: https://lkml.kernel.org/r/20210304074053.65527-2-zhouguanghui1@huawei.com +Signed-off-by: Zhou Guanghui +Acked-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Shakeel Butt +Acked-by: Michal Hocko +Cc: Hugh Dickins +Cc: "Kirill A. 
Shutemov" +Cc: Nicholas Piggin +Cc: Kefeng Wang +Cc: Hanjun Guo +Cc: Tianhong Ding +Cc: Weilong Chen +Cc: Rui Xiang +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/memcontrol.h | 6 ++---- + mm/huge_memory.c | 2 +- + mm/memcontrol.c | 15 ++++++--------- + 3 files changed, 9 insertions(+), 14 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1072,9 +1072,7 @@ static inline void memcg_memory_event_mm + rcu_read_unlock(); + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE +-void mem_cgroup_split_huge_fixup(struct page *head); +-#endif ++void split_page_memcg(struct page *head, unsigned int nr); + + #else /* CONFIG_MEMCG */ + +@@ -1416,7 +1414,7 @@ unsigned long mem_cgroup_soft_limit_recl + return 0; + } + +-static inline void mem_cgroup_split_huge_fixup(struct page *head) ++static inline void split_page_memcg(struct page *head, unsigned int nr) + { + } + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2465,7 +2465,7 @@ static void __split_huge_page(struct pag + int i; + + /* complete memcg works before add pages to LRU */ +- mem_cgroup_split_huge_fixup(head); ++ split_page_memcg(head, nr); + + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -3296,24 +3296,21 @@ void obj_cgroup_uncharge(struct obj_cgro + + #endif /* CONFIG_MEMCG_KMEM */ + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* +- * Because page_memcg(head) is not set on compound tails, set it now. ++ * Because page_memcg(head) is not set on tails, set it now. + */ +-void mem_cgroup_split_huge_fixup(struct page *head) ++void split_page_memcg(struct page *head, unsigned int nr) + { + struct mem_cgroup *memcg = page_memcg(head); + int i; + +- if (mem_cgroup_disabled()) ++ if (mem_cgroup_disabled() || !memcg) + return; + +- for (i = 1; i < HPAGE_PMD_NR; i++) { +- css_get(&memcg->css); +- head[i].memcg_data = (unsigned long)memcg; +- } ++ for (i = 1; i < nr; i++) ++ head[i].memcg_data = head->memcg_data; ++ css_get_many(&memcg->css, nr - 1); + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + #ifdef CONFIG_MEMCG_SWAP + /** diff --git a/queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch b/queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch new file mode 100644 index 00000000000..b462df33c59 --- /dev/null +++ b/queue-5.11/mm-memcg-set-memcg-when-splitting-page.patch @@ -0,0 +1,61 @@ +From e1baddf8475b06cc56f4bafecf9a32a124343d9f Mon Sep 17 00:00:00 2001 +From: Zhou Guanghui +Date: Fri, 12 Mar 2021 21:08:33 -0800 +Subject: mm/memcg: set memcg when splitting page + +From: Zhou Guanghui + +commit e1baddf8475b06cc56f4bafecf9a32a124343d9f upstream. + +As described in the split_page() comment, for the non-compound high order +page, the sub-pages must be freed individually. If the memcg of the first +page is valid, the tail pages cannot be uncharged when be freed. + +For example, when alloc_pages_exact is used to allocate 1MB continuous +physical memory, 2MB is charged(kmemcg is enabled and __GFP_ACCOUNT is +set). When make_alloc_exact free the unused 1MB and free_pages_exact free +the applied 1MB, actually, only 4KB(one page) is uncharged. + +Therefore, the memcg of the tail page needs to be set when splitting a +page. + +Michel: + +There are at least two explicit users of __GFP_ACCOUNT with +alloc_exact_pages added recently. 
See 7efe8ef274024 ("KVM: arm64: +Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT") and c419621873713 +("KVM: s390: Add memcg accounting to KVM allocations"), so this is not +just a theoretical issue. + +Link: https://lkml.kernel.org/r/20210304074053.65527-3-zhouguanghui1@huawei.com +Signed-off-by: Zhou Guanghui +Acked-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Shakeel Butt +Acked-by: Michal Hocko +Cc: Hanjun Guo +Cc: Hugh Dickins +Cc: Kefeng Wang +Cc: "Kirill A. Shutemov" +Cc: Nicholas Piggin +Cc: Rui Xiang +Cc: Tianhong Ding +Cc: Weilong Chen +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3313,6 +3313,7 @@ void split_page(struct page *page, unsig + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); + split_page_owner(page, 1 << order); ++ split_page_memcg(page, 1 << order); + } + EXPORT_SYMBOL_GPL(split_page); + diff --git a/queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch b/queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch new file mode 100644 index 00000000000..86e4e863ab9 --- /dev/null +++ b/queue-5.11/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch @@ -0,0 +1,130 @@ +From 6ce64428d62026a10cb5d80138ff2f90cc21d367 Mon Sep 17 00:00:00 2001 +From: Nadav Amit +Date: Fri, 12 Mar 2021 21:08:17 -0800 +Subject: mm/userfaultfd: fix memory corruption due to writeprotect +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Nadav Amit + +commit 6ce64428d62026a10cb5d80138ff2f90cc21d367 upstream. + +Userfaultfd self-test fails occasionally, indicating a memory corruption. + +Analyzing this problem indicates that there is a real bug since mmap_lock +is only taken for read in mwriteprotect_range() and defers flushes, and +since there is insufficient consideration of concurrent deferred TLB +flushes in wp_page_copy(). Although the PTE is flushed from the TLBs in +wp_page_copy(), this flush takes place after the copy has already been +performed, and therefore changes of the page are possible between the time +of the copy and the time in which the PTE is flushed. + +To make matters worse, memory-unprotection using userfaultfd also poses a +problem. Although memory unprotection is logically a promotion of PTE +permissions, and therefore should not require a TLB flush, the current +userrfaultfd code might actually cause a demotion of the architectural PTE +permission: when userfaultfd_writeprotect() unprotects memory region, it +unintentionally *clears* the RW-bit if it was already set. Note that this +unprotecting a PTE that is not write-protected is a valid use-case: the +userfaultfd monitor might ask to unprotect a region that holds both +write-protected and write-unprotected PTEs. + +The scenario that happens in selftests/vm/userfaultfd is as follows: + +cpu0 cpu1 cpu2 +---- ---- ---- + [ Writable PTE + cached in TLB ] +userfaultfd_writeprotect() +[ write-*unprotect* ] +mwriteprotect_range() +mmap_read_lock() +change_protection() + +change_protection_range() +... +change_pte_range() +[ *clear* “write”-bit ] +[ defer TLB flushes ] + [ page-fault ] + ... + wp_page_copy() + cow_user_page() + [ copy page ] + [ write to old + page ] + ... 
+ set_pte_at_notify() + +A similar scenario can happen: + +cpu0 cpu1 cpu2 cpu3 +---- ---- ---- ---- + [ Writable PTE + cached in TLB ] +userfaultfd_writeprotect() +[ write-protect ] +[ deferred TLB flush ] + userfaultfd_writeprotect() + [ write-unprotect ] + [ deferred TLB flush] + [ page-fault ] + wp_page_copy() + cow_user_page() + [ copy page ] + ... [ write to page ] + set_pte_at_notify() + +This race exists since commit 292924b26024 ("userfaultfd: wp: apply +_PAGE_UFFD_WP bit"). Yet, as Yu Zhao pointed, these races became apparent +since commit 09854ba94c6a ("mm: do_wp_page() simplification") which made +wp_page_copy() more likely to take place, specifically if page_count(page) +> 1. + +To resolve the aforementioned races, check whether there are pending +flushes on uffd-write-protected VMAs, and if there are, perform a flush +before doing the COW. + +Further optimizations will follow to avoid during uffd-write-unprotect +unnecassary PTE write-protection and TLB flushes. + +Link: https://lkml.kernel.org/r/20210304095423.3825684-1-namit@vmware.com +Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") +Signed-off-by: Nadav Amit +Suggested-by: Yu Zhao +Reviewed-by: Peter Xu +Tested-by: Peter Xu +Cc: Andrea Arcangeli +Cc: Andy Lutomirski +Cc: Pavel Emelyanov +Cc: Mike Kravetz +Cc: Mike Rapoport +Cc: Minchan Kim +Cc: Will Deacon +Cc: Peter Zijlstra +Cc: [5.9+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3092,6 +3092,14 @@ static vm_fault_t do_wp_page(struct vm_f + return handle_userfault(vmf, VM_UFFD_WP); + } + ++ /* ++ * Userfaultfd write-protect can defer flushes. Ensure the TLB ++ * is flushed in this case before copying. ++ */ ++ if (unlikely(userfaultfd_wp(vmf->vma) && ++ mm_tlb_flush_pending(vmf->vma->vm_mm))) ++ flush_tlb_page(vmf->vma, vmf->address); ++ + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { + /* diff --git a/queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch b/queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch new file mode 100644 index 00000000000..e799db977a6 --- /dev/null +++ b/queue-5.11/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch @@ -0,0 +1,36 @@ +From cea15316ceee2d4a51dfdecd79e08a438135416c Mon Sep 17 00:00:00 2001 +From: "Naveen N. Rao" +Date: Thu, 4 Mar 2021 07:34:11 +0530 +Subject: powerpc/64s: Fix instruction encoding for lis in ppc_function_entry() + +From: Naveen N. Rao + +commit cea15316ceee2d4a51dfdecd79e08a438135416c upstream. + +'lis r2,N' is 'addis r2,0,N' and the instruction encoding in the macro +LIS_R2 is incorrect (it currently maps to 'addis r0,r2,N'). Fix the +same. + +Fixes: c71b7eff426f ("powerpc: Add ABIv2 support to ppc_function_entry") +Cc: stable@vger.kernel.org # v3.16+ +Reported-by: Jiri Olsa +Signed-off-by: Naveen N. 
Rao +Acked-by: Segher Boessenkool +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210304020411.16796-1-naveen.n.rao@linux.vnet.ibm.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/code-patching.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/powerpc/include/asm/code-patching.h ++++ b/arch/powerpc/include/asm/code-patching.h +@@ -73,7 +73,7 @@ void __patch_exception(int exc, unsigned + #endif + + #define OP_RT_RA_MASK 0xffff0000UL +-#define LIS_R2 0x3c020000UL ++#define LIS_R2 0x3c400000UL + #define ADDIS_R2_R12 0x3c4c0000UL + #define ADDI_R2_R2 0x38420000UL + diff --git a/queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch b/queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch new file mode 100644 index 00000000000..b882bc0c6ca --- /dev/null +++ b/queue-5.11/powerpc-fix-inverted-set_full_regs-bitop.patch @@ -0,0 +1,45 @@ +From 73ac79881804eed2e9d76ecdd1018037f8510cb1 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Mon, 8 Mar 2021 18:55:30 +1000 +Subject: powerpc: Fix inverted SET_FULL_REGS bitop + +From: Nicholas Piggin + +commit 73ac79881804eed2e9d76ecdd1018037f8510cb1 upstream. + +This bit operation was inverted and set the low bit rather than +cleared it, breaking the ability to ptrace non-volatile GPRs after +exec. Fix. + +Only affects 64e and 32-bit. + +Fixes: feb9df3462e6 ("powerpc/64s: Always has full regs, so remove remnant checks") +Cc: stable@vger.kernel.org # v5.8+ +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210308085530.3191843-1-npiggin@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/ptrace.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/include/asm/ptrace.h ++++ b/arch/powerpc/include/asm/ptrace.h +@@ -195,7 +195,7 @@ static inline void regs_set_return_value + #define TRAP_FLAGS_MASK 0x11 + #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) + #define FULL_REGS(regs) (((regs)->trap & 1) == 0) +-#define SET_FULL_REGS(regs) ((regs)->trap |= 1) ++#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) + #endif + #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) + #define NV_REG_POISON 0xdeadbeefdeadbeefUL +@@ -210,7 +210,7 @@ static inline void regs_set_return_value + #define TRAP_FLAGS_MASK 0x1F + #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) + #define FULL_REGS(regs) (((regs)->trap & 1) == 0) +-#define SET_FULL_REGS(regs) ((regs)->trap |= 1) ++#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) + #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) + #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) + #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) diff --git a/queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch b/queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch new file mode 100644 index 00000000000..609935d750d --- /dev/null +++ b/queue-5.11/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch @@ -0,0 +1,83 @@ +From bd73758803c2eedc037c2268b65a19542a832594 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Tue, 9 Mar 2021 08:39:39 +0000 +Subject: powerpc: Fix missing declaration of [en/dis]able_kernel_vsx() + +From: Christophe Leroy + +commit bd73758803c2eedc037c2268b65a19542a832594 upstream. + +Add stub instances of enable_kernel_vsx() and disable_kernel_vsx() +when CONFIG_VSX is not set, to avoid following build failure. 
+ + CC [M] drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o + In file included from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services_types.h:29, + from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services.h:37, + from drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:27: + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c: In function 'dcn_bw_apply_registry_override': + ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:64:3: error: implicit declaration of function 'enable_kernel_vsx'; did you mean 'enable_kernel_fp'? [-Werror=implicit-function-declaration] + 64 | enable_kernel_vsx(); \ + | ^~~~~~~~~~~~~~~~~ + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:640:2: note: in expansion of macro 'DC_FP_START' + 640 | DC_FP_START(); + | ^~~~~~~~~~~ + ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:75:3: error: implicit declaration of function 'disable_kernel_vsx'; did you mean 'disable_kernel_fp'? [-Werror=implicit-function-declaration] + 75 | disable_kernel_vsx(); \ + | ^~~~~~~~~~~~~~~~~~ + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:676:2: note: in expansion of macro 'DC_FP_END' + 676 | DC_FP_END(); + | ^~~~~~~~~ + cc1: some warnings being treated as errors + make[5]: *** [drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o] Error 1 + +This works because the caller is checking if VSX is available using +cpu_has_feature(): + + #define DC_FP_START() { \ + if (cpu_has_feature(CPU_FTR_VSX_COMP)) { \ + preempt_disable(); \ + enable_kernel_vsx(); \ + } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { \ + preempt_disable(); \ + enable_kernel_fp(); \ + } \ + +When CONFIG_VSX is not selected, cpu_has_feature(CPU_FTR_VSX_COMP) +constant folds to 'false' so the call to enable_kernel_vsx() is +discarded and the build succeeds. + +Fixes: 16a9dea110a6 ("amdgpu: Enable initial DCN support on POWER") +Cc: stable@vger.kernel.org # v5.6+ +Reported-by: Geert Uytterhoeven +Reported-by: kernel test robot +Signed-off-by: Christophe Leroy +[mpe: Incorporate some discussion comments into the change log] +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/8d7d285a027e9d21f5ff7f850fa71a2655b0c4af.1615279170.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/switch_to.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/arch/powerpc/include/asm/switch_to.h ++++ b/arch/powerpc/include/asm/switch_to.h +@@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(vo + { + msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); + } ++#else ++static inline void enable_kernel_vsx(void) ++{ ++ BUILD_BUG(); ++} ++ ++static inline void disable_kernel_vsx(void) ++{ ++ BUILD_BUG(); ++} + #endif + + #ifdef CONFIG_SPE diff --git a/queue-5.11/sched-collate-affine_move_task-stoppers.patch b/queue-5.11/sched-collate-affine_move_task-stoppers.patch new file mode 100644 index 00000000000..5a1da4c2a58 --- /dev/null +++ b/queue-5.11/sched-collate-affine_move_task-stoppers.patch @@ -0,0 +1,64 @@ +From 58b1a45086b5f80f2b2842aa7ed0da51a64a302b Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 24 Feb 2021 11:15:23 +0100 +Subject: sched: Collate affine_move_task() stoppers + +From: Peter Zijlstra + +commit 58b1a45086b5f80f2b2842aa7ed0da51a64a302b upstream. + +The SCA_MIGRATE_ENABLE and task_running() cases are almost identical, +collapse them to avoid further duplication. 
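For illustration only (this sketch is not part of the patch and every name in it is invented), the shape of the change is the usual one of hoisting the shared work out of two near-duplicate branches and keeping the differences as small flag-guarded statements:

    /* Toy model of collapsing two nearly identical branches: the common
     * call is emitted once, and the two statements that differed are kept
     * behind the flag that used to select the branch. Invented names. */
    #include <stdbool.h>
    #include <stdio.h>

    static void queue_stopper(int cpu)
    {
            printf("stopper queued on CPU %d\n", cpu);
    }

    static int handle_running_task(int cpu, bool migrate_enable)
    {
            if (migrate_enable)
                    printf("clear push flag\n");    /* only the ENABLE case did this */

            queue_stopper(cpu);                     /* common to both branches */

            if (migrate_enable)
                    return 0;                       /* the ENABLE case returned early */
            return 1;                               /* the other case fell through */
    }

    int main(void)
    {
            handle_running_task(3, true);
            handle_running_task(5, false);
            return 0;
    }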
+ +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.500108964@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 23 ++++++++--------------- + 1 file changed, 8 insertions(+), 15 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2279,30 +2279,23 @@ static int affine_move_task(struct rq *r + return -EINVAL; + } + +- if (flags & SCA_MIGRATE_ENABLE) { +- +- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ +- p->migration_flags &= ~MDF_PUSH; +- task_rq_unlock(rq, p, rf); +- +- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, +- &pending->arg, &pending->stop_work); +- +- return 0; +- } +- + if (task_running(rq, p) || p->state == TASK_WAKING) { + /* +- * Lessen races (and headaches) by delegating +- * is_migration_disabled(p) checks to the stopper, which will +- * run on the same CPU as said p. ++ * MIGRATE_ENABLE gets here because 'p == current', but for ++ * anything else we cannot do is_migration_disabled(), punt ++ * and have the stopper function handle it all race-free. + */ ++ + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ ++ if (flags & SCA_MIGRATE_ENABLE) ++ p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + ++ if (flags & SCA_MIGRATE_ENABLE) ++ return 0; + } else { + + if (!is_migration_disabled(p)) { diff --git a/queue-5.11/sched-fix-affine_move_task-self-concurrency.patch b/queue-5.11/sched-fix-affine_move_task-self-concurrency.patch new file mode 100644 index 00000000000..f507f7cd360 --- /dev/null +++ b/queue-5.11/sched-fix-affine_move_task-self-concurrency.patch @@ -0,0 +1,91 @@ +From 9e81889c7648d48dd5fe13f41cbc99f3c362484a Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 24 Feb 2021 11:31:09 +0100 +Subject: sched: Fix affine_move_task() self-concurrency + +From: Peter Zijlstra + +commit 9e81889c7648d48dd5fe13f41cbc99f3c362484a upstream. + +Consider: + + sched_setaffinity(p, X); sched_setaffinity(p, Y); + +Then the first will install p->migration_pending = &my_pending; and +issue stop_one_cpu_nowait(pending); and the second one will read +p->migration_pending and _also_ issue: stop_one_cpu_nowait(pending), +the _SAME_ @pending. + +This causes stopper list corruption. + +Add set_affinity_pending::stop_pending, to indicate if a stopper is in +progress. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.649146419@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1864,6 +1864,7 @@ struct migration_arg { + + struct set_affinity_pending { + refcount_t refs; ++ unsigned int stop_pending; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; +@@ -1982,12 +1983,15 @@ static int migration_cpu_stop(void *data + * determine is_migration_disabled() and so have to chase after + * it. 
+ */ ++ WARN_ON_ONCE(!pending->stop_pending); + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; + } + out: ++ if (pending) ++ pending->stop_pending = false; + task_rq_unlock(rq, p, &rf); + + if (complete) +@@ -2183,7 +2187,7 @@ static int affine_move_task(struct rq *r + int dest_cpu, unsigned int flags) + { + struct set_affinity_pending my_pending = { }, *pending = NULL; +- bool complete = false; ++ bool stop_pending, complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { +@@ -2256,14 +2260,19 @@ static int affine_move_task(struct rq *r + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. + */ ++ stop_pending = pending->stop_pending; ++ if (!stop_pending) ++ pending->stop_pending = true; + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + +- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, +- &pending->arg, &pending->stop_work); ++ if (!stop_pending) { ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; diff --git a/queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch b/queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch new file mode 100644 index 00000000000..4ebe5b58d40 --- /dev/null +++ b/queue-5.11/sched-fix-migration_cpu_stop-requeueing.patch @@ -0,0 +1,142 @@ +From 8a6edb5257e2a84720fe78cb179eca58ba76126f Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sat, 13 Feb 2021 13:10:35 +0100 +Subject: sched: Fix migration_cpu_stop() requeueing + +From: Peter Zijlstra + +commit 8a6edb5257e2a84720fe78cb179eca58ba76126f upstream. + +When affine_move_task(p) is called on a running task @p, which is not +otherwise already changing affinity, we'll first set +p->migration_pending and then do: + + stop_one_cpu(cpu_of_rq(rq), migration_cpu_stop, &arg); + +This then gets us to migration_cpu_stop() running on the CPU that was +previously running our victim task @p. + +If we find that our task is no longer on that runqueue (this can +happen because of a concurrent migration due to load-balance etc.), +then we'll end up at the: + + } else if (dest_cpu < 1 || pending) { + +branch. Which we'll take because we set pending earlier. Here we first +check if the task @p has already satisfied the affinity constraints, +if so we bail early [A]. Otherwise we'll reissue migration_cpu_stop() +onto the CPU that is now hosting our task @p: + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +Except, we've never initialized pending->arg, which will be all 0s. + +This then results in running migration_cpu_stop() on the next CPU with +arg->p == NULL, which gives the by now obvious result of fireworks. + +The cure is to change affine_move_task() to always use pending->arg, +furthermore we can use the exact same pattern as the +SCA_MIGRATE_ENABLE case, since we'll block on the pending->done +completion anyway, no point in adding yet another completion in +stop_one_cpu(). 
+ +This then gives a clear distinction between the two +migration_cpu_stop() use cases: + + - sched_exec() / migrate_task_to() : arg->pending == NULL + - affine_move_task() : arg->pending != NULL; + +And we can have it ignore p->migration_pending when !arg->pending. Any +stop work from sched_exec() / migrate_task_to() is in addition to stop +works from affine_move_task(), which will be sufficient to issue the +completion. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.357743989@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 39 ++++++++++++++++++++++++++++----------- + 1 file changed, 28 insertions(+), 11 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1922,6 +1922,24 @@ static int migration_cpu_stop(void *data + rq_lock(rq, &rf); + + pending = p->migration_pending; ++ if (pending && !arg->pending) { ++ /* ++ * This happens from sched_exec() and migrate_task_to(), ++ * neither of them care about pending and just want a task to ++ * maybe move about. ++ * ++ * Even if there is a pending, we can ignore it, since ++ * affine_move_task() will have it's own stop_work's in flight ++ * which will manage the completion. ++ * ++ * Notably, pending doesn't need to match arg->pending. This can ++ * happen when tripple concurrent affine_move_task() first sets ++ * pending, then clears pending and eventually sets another ++ * pending. ++ */ ++ pending = NULL; ++ } ++ + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because +@@ -2194,10 +2212,6 @@ static int affine_move_task(struct rq *r + int dest_cpu, unsigned int flags) + { + struct set_affinity_pending my_pending = { }, *pending = NULL; +- struct migration_arg arg = { +- .task = p, +- .dest_cpu = dest_cpu, +- }; + bool complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ +@@ -2235,6 +2249,12 @@ static int affine_move_task(struct rq *r + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); ++ my_pending.arg = (struct migration_arg) { ++ .task = p, ++ .dest_cpu = -1, /* any */ ++ .pending = &my_pending, ++ }; ++ + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; +@@ -2265,12 +2285,6 @@ static int affine_move_task(struct rq *r + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + +- pending->arg = (struct migration_arg) { +- .task = p, +- .dest_cpu = -1, +- .pending = pending, +- }; +- + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +@@ -2283,8 +2297,11 @@ static int affine_move_task(struct rq *r + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. 
+ */ ++ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + task_rq_unlock(rq, p, rf); +- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); + + } else { + diff --git a/queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch b/queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch new file mode 100644 index 00000000000..e536c20a64a --- /dev/null +++ b/queue-5.11/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch @@ -0,0 +1,41 @@ +From ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 Mon Sep 17 00:00:00 2001 +From: Mathieu Desnoyers +Date: Wed, 17 Feb 2021 11:56:51 -0500 +Subject: sched/membarrier: fix missing local execution of ipi_sync_rq_state() + +From: Mathieu Desnoyers + +commit ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 upstream. + +The function sync_runqueues_membarrier_state() should copy the +membarrier state from the @mm received as parameter to each runqueue +currently running tasks using that mm. + +However, the use of smp_call_function_many() skips the current runqueue, +which is unintended. Replace by a call to on_each_cpu_mask(). + +Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") +Reported-by: Nadav Amit +Signed-off-by: Mathieu Desnoyers +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Cc: stable@vger.kernel.org # 5.4.x+ +Link: https://lore.kernel.org/r/74F1E842-4A84-47BF-B6C2-5407DFDD4A4A@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/membarrier.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/kernel/sched/membarrier.c ++++ b/kernel/sched/membarrier.c +@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_sta + } + rcu_read_unlock(); + +- preempt_disable(); +- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); +- preempt_enable(); ++ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); diff --git a/queue-5.11/sched-optimize-migration_cpu_stop.patch b/queue-5.11/sched-optimize-migration_cpu_stop.patch new file mode 100644 index 00000000000..47a6bd519d2 --- /dev/null +++ b/queue-5.11/sched-optimize-migration_cpu_stop.patch @@ -0,0 +1,53 @@ +From 3f1bc119cd7fc987c8ed25ffb717f99403bb308c Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 24 Feb 2021 11:21:35 +0100 +Subject: sched: Optimize migration_cpu_stop() + +From: Peter Zijlstra + +commit 3f1bc119cd7fc987c8ed25ffb717f99403bb308c upstream. + +When the purpose of migration_cpu_stop() is to migrate the task to +'any' valid CPU, don't migrate the task when it's already running on a +valid CPU. 
+ +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.569238629@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1936,14 +1936,25 @@ static int migration_cpu_stop(void *data + complete = true; + } + +- if (dest_cpu < 0) ++ if (dest_cpu < 0) { ++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) ++ goto out; ++ + dest_cpu = cpumask_any_distribute(&p->cpus_mask); ++ } + + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, dest_cpu); + else + p->wake_cpu = dest_cpu; + ++ /* ++ * XXX __migrate_task() can fail, at which point we might end ++ * up running on a dodgy CPU, AFAICT this can only happen ++ * during CPU hotplug, at which point we'll get pushed out ++ * anyway, so it's probably not a big deal. ++ */ ++ + } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s diff --git a/queue-5.11/sched-simplify-migration_cpu_stop.patch b/queue-5.11/sched-simplify-migration_cpu_stop.patch new file mode 100644 index 00000000000..764b09559e9 --- /dev/null +++ b/queue-5.11/sched-simplify-migration_cpu_stop.patch @@ -0,0 +1,138 @@ +From c20cf065d4a619d394d23290093b1002e27dff86 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 24 Feb 2021 11:50:39 +0100 +Subject: sched: Simplify migration_cpu_stop() + +From: Peter Zijlstra + +commit c20cf065d4a619d394d23290093b1002e27dff86 upstream. + +When affine_move_task() issues a migration_cpu_stop(), the purpose of +that function is to complete that @pending, not any random other +p->migration_pending that might have gotten installed since. + +This realization much simplifies migration_cpu_stop() and allows +further necessary steps to fix all this as it provides the guarantee +that @pending's stopper will complete @pending (and not some random +other @pending). + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.430014682@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 56 +++++++--------------------------------------------- + 1 file changed, 8 insertions(+), 48 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1898,8 +1898,8 @@ static struct rq *__migrate_task(struct + */ + static int migration_cpu_stop(void *data) + { +- struct set_affinity_pending *pending; + struct migration_arg *arg = data; ++ struct set_affinity_pending *pending = arg->pending; + struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; + struct rq *rq = this_rq(); +@@ -1921,25 +1921,6 @@ static int migration_cpu_stop(void *data + raw_spin_lock(&p->pi_lock); + rq_lock(rq, &rf); + +- pending = p->migration_pending; +- if (pending && !arg->pending) { +- /* +- * This happens from sched_exec() and migrate_task_to(), +- * neither of them care about pending and just want a task to +- * maybe move about. +- * +- * Even if there is a pending, we can ignore it, since +- * affine_move_task() will have it's own stop_work's in flight +- * which will manage the completion. +- * +- * Notably, pending doesn't need to match arg->pending. 
This can +- * happen when tripple concurrent affine_move_task() first sets +- * pending, then clears pending and eventually sets another +- * pending. +- */ +- pending = NULL; +- } +- + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because +@@ -1950,31 +1931,20 @@ static int migration_cpu_stop(void *data + goto out; + + if (pending) { +- p->migration_pending = NULL; ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; + complete = true; + } + +- /* migrate_enable() -- we must not race against SCA */ +- if (dest_cpu < 0) { +- /* +- * When this was migrate_enable() but we no longer +- * have a @pending, a concurrent SCA 'fixed' things +- * and we should be valid again. Nothing to do. +- */ +- if (!pending) { +- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); +- goto out; +- } +- ++ if (dest_cpu < 0) + dest_cpu = cpumask_any_distribute(&p->cpus_mask); +- } + + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, dest_cpu); + else + p->wake_cpu = dest_cpu; + +- } else if (dest_cpu < 0 || pending) { ++ } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that +@@ -1989,23 +1959,14 @@ static int migration_cpu_stop(void *data + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ +- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { +- p->migration_pending = NULL; ++ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; + complete = true; + goto out; + } + + /* +- * When this was migrate_enable() but we no longer have an +- * @pending, a concurrent SCA 'fixed' things and we should be +- * valid again. Nothing to do. +- */ +- if (!pending) { +- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); +- goto out; +- } +- +- /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. +@@ -2022,7 +1983,6 @@ out: + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ +- pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + diff --git a/queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch b/queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch new file mode 100644 index 00000000000..fbc787ee1d3 --- /dev/null +++ b/queue-5.11/sched-simplify-set_affinity_pending-refcounts.patch @@ -0,0 +1,124 @@ +From 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 24 Feb 2021 11:42:08 +0100 +Subject: sched: Simplify set_affinity_pending refcounts + +From: Peter Zijlstra + +commit 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 upstream. + +Now that we have set_affinity_pending::stop_pending to indicate if a +stopper is in progress, and we have the guarantee that if that stopper +exists, it will (eventually) complete our @pending we can simplify the +refcount scheme by no longer counting the stopper thread. 
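A rough user-space sketch of the resulting scheme (C11 atomics stand in for refcount_t and all names are invented): only callers that will sleep in wait_for_completion() hold a reference, while the stopper works on the object without one, because the pending completion is what keeps it alive until the last waiter is done.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy stand-in for set_affinity_pending: refs now counts waiters only. */
    struct pending {
            atomic_uint refs;       /* wait_for_completion() callers */
            int stop_pending;       /* is the single stop_work in flight? */
    };

    static void pending_get(struct pending *p)
    {
            atomic_fetch_add(&p->refs, 1);
    }

    /* Returns the number of waiters still outstanding; 0 wakes the owner. */
    static unsigned int pending_put(struct pending *p)
    {
            return atomic_fetch_sub(&p->refs, 1) - 1;
    }

    int main(void)
    {
            struct pending my_pending = { .stop_pending = 1 };

            atomic_init(&my_pending.refs, 1);       /* the issuing task itself */
            pending_get(&my_pending);               /* one concurrent waiter */

            printf("waiters left: %u\n", pending_put(&my_pending)); /* 1 */
            printf("waiters left: %u\n", pending_put(&my_pending)); /* 0 */
            return 0;
    }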
+ +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Reviewed-by: Valentin Schneider +Link: https://lkml.kernel.org/r/20210224131355.724130207@infradead.org +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/core.c | 32 ++++++++++++++++++++------------ + 1 file changed, 20 insertions(+), 12 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1862,6 +1862,10 @@ struct migration_arg { + struct set_affinity_pending *pending; + }; + ++/* ++ * @refs: number of wait_for_completion() ++ * @stop_pending: is @stop_work in use ++ */ + struct set_affinity_pending { + refcount_t refs; + unsigned int stop_pending; +@@ -1997,10 +2001,6 @@ out: + if (complete) + complete_all(&pending->done); + +- /* For pending->{arg,stop_work} */ +- if (pending && refcount_dec_and_test(&pending->refs)) +- wake_up_var(&pending->refs); +- + return 0; + } + +@@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *r + push_task = get_task_struct(p); + } + ++ /* ++ * If there are pending waiters, but no pending stop_work, ++ * then complete now. ++ */ + pending = p->migration_pending; +- if (pending) { +- refcount_inc(&pending->refs); ++ if (pending && !pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } ++ + task_rq_unlock(rq, p, rf); + + if (push_task) { +@@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *r + } + + if (complete) +- goto do_complete; ++ complete_all(&pending->done); + + return 0; + } +@@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *r + if (!stop_pending) + pending->stop_pending = true; + +- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; ++ + task_rq_unlock(rq, p, rf); + + if (!stop_pending) { +@@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *r + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + +- p->migration_pending = NULL; +- complete = true; ++ if (!pending->stop_pending) { ++ p->migration_pending = NULL; ++ complete = true; ++ } + } + task_rq_unlock(rq, p, rf); + +-do_complete: + if (complete) + complete_all(&pending->done); + } +@@ -2295,7 +2300,7 @@ do_complete: + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) +- wake_up_var(&pending->refs); ++ wake_up_var(&pending->refs); /* No UaF, just an address */ + + /* + * Block the original owner of &pending until all subsequent callers +@@ -2303,6 +2308,9 @@ do_complete: + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + ++ /* ARGH */ ++ WARN_ON_ONCE(my_pending.stop_pending); ++ + return 0; + } + diff --git a/queue-5.11/series b/queue-5.11/series index 54cb77eace8..a9396f5d124 100644 --- a/queue-5.11/series +++ b/queue-5.11/series @@ -267,3 +267,39 @@ memblock-fix-section-mismatch-warning.patch stop_machine-mark-helpers-__always_inline.patch include-linux-sched-mm.h-use-rcu_dereference-in-in_v.patch prctl-fix-pr_set_mm_auxv-kernel-stack-leak.patch +zram-fix-return-value-on-writeback_store.patch +zram-fix-broken-page-writeback.patch +linux-compiler-clang.h-define-have_builtin_bswap.patch +sched-fix-migration_cpu_stop-requeueing.patch +sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch +sched-collate-affine_move_task-stoppers.patch +sched-simplify-migration_cpu_stop.patch +sched-optimize-migration_cpu_stop.patch 
+sched-fix-affine_move_task-self-concurrency.patch +sched-simplify-set_affinity_pending-refcounts.patch +efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch +powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch +powerpc-fix-inverted-set_full_regs-bitop.patch +powerpc-fix-missing-declaration-of-able_kernel_vsx.patch +binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch +kasan-mm-fix-crash-with-hw_tags-and-debug_pagealloc.patch +kasan-fix-kasan_stack-dependency-for-hw_tags.patch +x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch +x86-sev-es-introduce-ip_within_syscall_gap-helper.patch +x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch +x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch +x86-sev-es-use-__copy_from_user_inatomic.patch +x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch +kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch +kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch +kvm-arm64-ensure-i-cache-isolation-between-vcpus-of-a-same-vm.patch +kvm-arm64-fix-range-alignment-when-walking-page-tables.patch +kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch +kvm-arm64-nvhe-save-the-spe-context-early.patch +kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch +kvm-arm64-fix-exclusive-limit-for-ipa-size.patch +mm-highmem.c-fix-zero_user_segments-with-start-end.patch +mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch +mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch +mm-memcg-set-memcg-when-splitting-page.patch +mm-memcg-rename-mem_cgroup_split_huge_fixup-to-split_page_memcg-and-add-nr_pages-argument.patch diff --git a/queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch b/queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch new file mode 100644 index 00000000000..919b7e7c1e8 --- /dev/null +++ b/queue-5.11/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch @@ -0,0 +1,44 @@ +From 5d5675df792ff67e74a500c4c94db0f99e6a10ef Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 4 Mar 2021 11:05:54 -0800 +Subject: x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls + +From: Andy Lutomirski + +commit 5d5675df792ff67e74a500c4c94db0f99e6a10ef upstream. + +On a 32-bit fast syscall that fails to read its arguments from user +memory, the kernel currently does syscall exit work but not +syscall entry work. This confuses audit and ptrace. For example: + + $ ./tools/testing/selftests/x86/syscall_arg_fault_32 + ... + strace: pid 264258: entering, ptrace_syscall_info.op == 2 + ... + +This is a minimal fix intended for ease of backporting. A more +complete cleanup is coming. 
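The class of bug can be shown with a trivial stand-alone C model (purely illustrative, nothing here is kernel code): entry and exit work must stay paired, and running exit work for entry work that never happened leaves the tracking state unbalanced, which is essentially what audit and ptrace observe.

    #include <stdio.h>

    static int entry_depth;                 /* models audit/ptrace bookkeeping */

    static void entry_work(void) { entry_depth++; }
    static void exit_work(void)  { entry_depth--; }

    /* A "fast syscall" whose argument fetch fails before entry_work() ran. */
    static void failed_fast_syscall(int buggy)
    {
            if (buggy)
                    exit_work();            /* exit work with no matching entry */
            /* fixed variant: unwind only what was actually done, i.e. nothing */
    }

    static void normal_syscall(void)
    {
            entry_work();
            /* ... do the syscall ... */
            exit_work();
    }

    int main(void)
    {
            normal_syscall();
            failed_fast_syscall(0);
            printf("fixed:  depth=%d\n", entry_depth);      /* 0, balanced  */
            failed_fast_syscall(1);
            printf("broken: depth=%d\n", entry_depth);      /* -1, mismatch */
            return 0;
    }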
+ +Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry") +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.luto@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/entry/common.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32 + regs->ax = -EFAULT; + + instrumentation_end(); +- syscall_exit_to_user_mode(regs); ++ local_irq_disable(); ++ irqentry_exit_to_user_mode(regs); + return false; + } + diff --git a/queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch b/queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch new file mode 100644 index 00000000000..db5d245d150 --- /dev/null +++ b/queue-5.11/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch @@ -0,0 +1,60 @@ +From 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:13 +0100 +Subject: x86/sev-es: Check regs->sp is trusted before adjusting #VC IST stack + +From: Joerg Roedel + +commit 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 upstream. + +The code in the NMI handler to adjust the #VC handler IST stack is +needed in case an NMI hits when the #VC handler is still using its IST +stack. + +But the check for this condition also needs to look if the regs->sp +value is trusted, meaning it was not set by user-space. Extend the check +to not use regs->sp when the NMI interrupted user-space code or the +SYSCALL gap. + +Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") +Reported-by: Andy Lutomirski +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # 5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-3-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/sev-es.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int c + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + } + +-static __always_inline bool on_vc_stack(unsigned long sp) ++static __always_inline bool on_vc_stack(struct pt_regs *regs) + { ++ unsigned long sp = regs->sp; ++ ++ /* User-mode RSP is not trusted */ ++ if (user_mode(regs)) ++ return false; ++ ++ /* SYSCALL gap still has user-mode RSP */ ++ if (ip_within_syscall_gap(regs)) ++ return false; ++ + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); + } + +@@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct p + old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* Make room on the IST stack */ +- if (on_vc_stack(regs->sp)) ++ if (on_vc_stack(regs)) + new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); + else + new_ist = old_ist - sizeof(old_ist); diff --git a/queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch b/queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch new file mode 100644 index 00000000000..39a31936ef9 --- /dev/null +++ b/queue-5.11/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch @@ -0,0 +1,57 @@ +From 62441a1fb53263bda349b6e5997c3cc5c120d89e Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 
2021 15:17:15 +0100 +Subject: x86/sev-es: Correctly track IRQ states in runtime #VC handler + +From: Joerg Roedel + +commit 62441a1fb53263bda349b6e5997c3cc5c120d89e upstream. + +Call irqentry_nmi_enter()/irqentry_nmi_exit() in the #VC handler to +correctly track the IRQ state during its execution. + +Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler") +Reported-by: Andy Lutomirski +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # v5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-5-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/sev-es.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -1258,13 +1258,12 @@ static __always_inline bool on_vc_fallba + DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) + { + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); ++ irqentry_state_t irq_state; + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + +- lockdep_assert_irqs_disabled(); +- + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ +@@ -1273,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co + return; + } + ++ irq_state = irqentry_nmi_enter(regs); ++ lockdep_assert_irqs_disabled(); + instrumentation_begin(); + + /* +@@ -1335,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co + + out: + instrumentation_end(); ++ irqentry_nmi_exit(regs, irq_state); + + return; + diff --git a/queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch b/queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch new file mode 100644 index 00000000000..9209662cafd --- /dev/null +++ b/queue-5.11/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch @@ -0,0 +1,90 @@ +From 78a81d88f60ba773cbe890205e1ee67f00502948 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:12 +0100 +Subject: x86/sev-es: Introduce ip_within_syscall_gap() helper + +From: Joerg Roedel + +commit 78a81d88f60ba773cbe890205e1ee67f00502948 upstream. + +Introduce a helper to check whether an exception came from the syscall +gap and use it in the SEV-ES code. Extend the check to also cover the +compatibility SYSCALL entry path. 
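Stripped of the kernel specifics, the helper is a membership test against two half-open address ranges; a self-contained user-space model (plain integers instead of the entry-code linker symbols, invented names) looks like this:

    #include <stdbool.h>
    #include <stdio.h>

    /* Each "gap" is [entry, safe_stack): RSP is still user-controlled there. */
    struct gap { unsigned long start, end; };

    static bool ip_in_gap(unsigned long ip, const struct gap *g)
    {
            return ip >= g->start && ip < g->end;   /* half-open range */
    }

    static bool ip_within_syscall_gap_model(unsigned long ip)
    {
            const struct gap gap_native = { 0x1000, 0x1040 };  /* 64-bit entry */
            const struct gap gap_compat = { 0x2000, 0x2030 };  /* compat entry */

            return ip_in_gap(ip, &gap_native) || ip_in_gap(ip, &gap_compat);
    }

    int main(void)
    {
            printf("%d\n", ip_within_syscall_gap_model(0x1020));  /* 1 */
            printf("%d\n", ip_within_syscall_gap_model(0x2030));  /* 0: end excluded */
            return 0;
    }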
+ +Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # 5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-2-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/entry/entry_64_compat.S | 2 ++ + arch/x86/include/asm/proto.h | 1 + + arch/x86/include/asm/ptrace.h | 15 +++++++++++++++ + arch/x86/kernel/traps.c | 3 +-- + 4 files changed, 19 insertions(+), 2 deletions(-) + +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) + /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ++SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ++ + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ +--- a/arch/x86/include/asm/proto.h ++++ b/arch/x86/include/asm/proto.h +@@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(vo + void entry_SYSENTER_compat(void); + void __end_entry_SYSENTER_compat(void); + void entry_SYSCALL_compat(void); ++void entry_SYSCALL_compat_safe_stack(void); + void entry_INT80_compat(void); + #ifdef CONFIG_XEN_PV + void xen_entry_INT80_compat(void); +--- a/arch/x86/include/asm/ptrace.h ++++ b/arch/x86/include/asm/ptrace.h +@@ -94,6 +94,8 @@ struct pt_regs { + #include + #endif + ++#include ++ + struct cpuinfo_x86; + struct task_struct; + +@@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct + #ifdef CONFIG_X86_64 + #define current_user_stack_pointer() current_pt_regs()->sp + #define compat_user_stack_pointer() current_pt_regs()->sp ++ ++static inline bool ip_within_syscall_gap(struct pt_regs *regs) ++{ ++ bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && ++ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); ++ ++#ifdef CONFIG_IA32_EMULATION ++ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && ++ regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); ++#endif ++ ++ return ret; ++} + #endif + + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_r + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ +- if (regs->ip >= (unsigned long)entry_SYSCALL_64 && +- regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { ++ if (ip_within_syscall_gap(regs)) { + sp = this_cpu_read(cpu_current_top_of_stack); + goto sync; + } diff --git a/queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch b/queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch new file mode 100644 index 00000000000..2cf04e752ab --- /dev/null +++ b/queue-5.11/x86-sev-es-use-__copy_from_user_inatomic.patch @@ -0,0 +1,137 @@ +From bffe30dd9f1f3b2608a87ac909a224d6be472485 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:16 +0100 +Subject: x86/sev-es: Use __copy_from_user_inatomic() + +From: Joerg Roedel + +commit bffe30dd9f1f3b2608a87ac909a224d6be472485 upstream. + +The #VC handler must run in atomic context and cannot sleep. This is a +problem when it tries to fetch instruction bytes from user-space via +copy_from_user(). + +Introduce a insn_fetch_from_user_inatomic() helper which uses +__copy_from_user_inatomic() to safely copy the instruction bytes to +kernel memory in the #VC handler. 
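The shape of the resulting code can be sketched in user-space C (memcpy stands in for both copy primitives and every name is invented; this models only the factoring, not the atomicity rules): the effective-IP lookup is shared, and the two fetch variants differ only in which copy routine they call.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    #define MAX_INSN 15

    /* Both stand-ins mimic the kernel convention of returning the number of
     * bytes that were NOT copied (0 on full success). */
    static size_t copy_sleeping(void *dst, const void *src, size_t n)
    { memcpy(dst, src, n); return 0; }
    static size_t copy_atomic(void *dst, const void *src, size_t n)
    { memcpy(dst, src, n); return 0; }

    /* Shared helper: resolve the effective IP once; NULL means failure. */
    static const unsigned char *effective_ip(const unsigned char *seg_base,
                                             size_t ip)
    {
            return seg_base ? seg_base + ip : NULL;
    }

    /* Returns how many instruction bytes were fetched, 0 on failure. */
    static int fetch_insn(unsigned char *buf, const unsigned char *seg_base,
                          size_t ip,
                          size_t (*copy)(void *, const void *, size_t))
    {
            const unsigned char *src = effective_ip(seg_base, ip);

            if (!src)
                    return 0;
            return (int)(MAX_INSN - copy(buf, src, MAX_INSN));
    }

    int main(void)
    {
            unsigned char code[32] = { 0x90 }, buf[MAX_INSN];

            printf("%d\n", fetch_insn(buf, code, 0, copy_sleeping)); /* 15 */
            printf("%d\n", fetch_insn(buf, code, 4, copy_atomic));   /* 15 */
            return 0;
    }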
+ +Fixes: 5e3427a7bc432 ("x86/sev-es: Handle instruction fetches from user-space") +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # v5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-6-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/insn-eval.h | 2 + + arch/x86/kernel/sev-es.c | 2 - + arch/x86/lib/insn-eval.c | 66 ++++++++++++++++++++++++++++++--------- + 3 files changed, 55 insertions(+), 15 deletions(-) + +--- a/arch/x86/include/asm/insn-eval.h ++++ b/arch/x86/include/asm/insn-eval.h +@@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct p + int insn_get_code_seg_params(struct pt_regs *regs); + int insn_fetch_from_user(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); ++int insn_fetch_from_user_inatomic(struct pt_regs *regs, ++ unsigned char buf[MAX_INSN_SIZE]); + bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size); + +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -258,7 +258,7 @@ static enum es_result vc_decode_insn(str + int res; + + if (user_mode(ctxt->regs)) { +- res = insn_fetch_from_user(ctxt->regs, buffer); ++ res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; +--- a/arch/x86/lib/insn-eval.c ++++ b/arch/x86/lib/insn-eval.c +@@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct in + } + } + ++static unsigned long insn_get_effective_ip(struct pt_regs *regs) ++{ ++ unsigned long seg_base = 0; ++ ++ /* ++ * If not in user-space long mode, a custom code segment could be in ++ * use. This is true in protected mode (if the process defined a local ++ * descriptor table), or virtual-8086 mode. In most of the cases ++ * seg_base will be zero as in USER_CS. ++ */ ++ if (!user_64bit_mode(regs)) { ++ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); ++ if (seg_base == -1L) ++ return 0; ++ } ++ ++ return seg_base + regs->ip; ++} ++ + /** + * insn_fetch_from_user() - Copy instruction bytes from user-space memory + * @regs: Structure with register values as seen when entering kernel mode +@@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct in + */ + int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) + { +- unsigned long seg_base = 0; ++ unsigned long ip; + int not_copied; + +- /* +- * If not in user-space long mode, a custom code segment could be in +- * use. This is true in protected mode (if the process defined a local +- * descriptor table), or virtual-8086 mode. In most of the cases +- * seg_base will be zero as in USER_CS. +- */ +- if (!user_64bit_mode(regs)) { +- seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); +- if (seg_base == -1L) +- return 0; +- } ++ ip = insn_get_effective_ip(regs); ++ if (!ip) ++ return 0; ++ ++ not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); ++ ++ return MAX_INSN_SIZE - not_copied; ++} ++ ++/** ++ * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory ++ * while in atomic code ++ * @regs: Structure with register values as seen when entering kernel mode ++ * @buf: Array to store the fetched instruction ++ * ++ * Gets the linear address of the instruction and copies the instruction bytes ++ * to the buf. This function must be used in atomic context. ++ * ++ * Returns: ++ * ++ * Number of instruction bytes copied. ++ * ++ * 0 if nothing was copied. 
++ */ ++int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) ++{ ++ unsigned long ip; ++ int not_copied; + ++ ip = insn_get_effective_ip(regs); ++ if (!ip) ++ return 0; + +- not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), +- MAX_INSN_SIZE); ++ not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); + + return MAX_INSN_SIZE - not_copied; + } diff --git a/queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch b/queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch new file mode 100644 index 00000000000..b830b893914 --- /dev/null +++ b/queue-5.11/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch @@ -0,0 +1,87 @@ +From e504e74cc3a2c092b05577ce3e8e013fae7d94e6 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 5 Feb 2021 08:24:02 -0600 +Subject: x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2 + +From: Josh Poimboeuf + +commit e504e74cc3a2c092b05577ce3e8e013fae7d94e6 upstream. + +KASAN reserves "redzone" areas between stack frames in order to detect +stack overruns. A read or write to such an area triggers a KASAN +"stack-out-of-bounds" BUG. + +Normally, the ORC unwinder stays in-bounds and doesn't access the +redzone. But sometimes it can't find ORC metadata for a given +instruction. This can happen for code which is missing ORC metadata, or +for generated code. In such cases, the unwinder attempts to fall back +to frame pointers, as a best-effort type thing. + +This fallback often works, but when it doesn't, the unwinder can get +confused and go off into the weeds into the KASAN redzone, triggering +the aforementioned KASAN BUG. + +But in this case, the unwinder's confusion is actually harmless and +working as designed. It already has checks in place to prevent +off-stack accesses, but those checks get short-circuited by the KASAN +BUG. And a BUG is a lot more disruptive than a harmless unwinder +warning. + +Disable the KASAN checks by using READ_ONCE_NOCHECK() for all stack +accesses. This finishes the job started by commit 881125bfe65b +("x86/unwind: Disable KASAN checking in the ORC unwinder"), which only +partially fixed the issue. 
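The redzone behaviour itself is easy to reproduce in user space with ASAN, KASAN's user-space counterpart (this reproducer is not taken from the report above; it only shows the class of access being flagged): reading one element past a stack array lands in the poisoned gap between stack objects and the sanitizer aborts with a stack-buffer-overflow / stack-out-of-bounds style report.

    /* Build with: gcc -fsanitize=address -g redzone.c  (example file name) */
    #include <stdio.h>

    int main(void)
    {
            volatile unsigned long frame[4] = { 1, 2, 3, 4 };
            volatile unsigned long *p = frame;

            /* One element past the end: this read hits the redzone that the
             * sanitizer places around stack objects and is reported even
             * though, as with the unwinder fallback, it may be harmless. */
            printf("%lu\n", p[4]);
            return 0;
    }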
+ +Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") +Reported-by: Ivan Babrou +Signed-off-by: Josh Poimboeuf +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Borislav Petkov +Reviewed-by: Steven Rostedt (VMware) +Tested-by: Ivan Babrou +Cc: stable@kernel.org +Link: https://lkml.kernel.org/r/9583327904ebbbeda399eca9c56d6c7085ac20fe.1612534649.git.jpoimboe@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/unwind_orc.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwi + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) + return false; + +- *ip = regs->ip; +- *sp = regs->sp; ++ *ip = READ_ONCE_NOCHECK(regs->ip); ++ *sp = READ_ONCE_NOCHECK(regs->sp); + return true; + } + +@@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + +- *ip = regs->ip; +- *sp = regs->sp; ++ *ip = READ_ONCE_NOCHECK(regs->ip); ++ *sp = READ_ONCE_NOCHECK(regs->sp); + return true; + } + +@@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state + return false; + + if (state->full_regs) { +- *val = ((unsigned long *)state->regs)[reg]; ++ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); + return true; + } + + if (state->prev_regs) { +- *val = ((unsigned long *)state->prev_regs)[reg]; ++ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); + return true; + } + diff --git a/queue-5.11/zram-fix-broken-page-writeback.patch b/queue-5.11/zram-fix-broken-page-writeback.patch new file mode 100644 index 00000000000..444648fb8e6 --- /dev/null +++ b/queue-5.11/zram-fix-broken-page-writeback.patch @@ -0,0 +1,57 @@ +From 2766f1821600cc7562bae2128ad0b163f744c5d9 Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Fri, 12 Mar 2021 21:08:41 -0800 +Subject: zram: fix broken page writeback + +From: Minchan Kim + +commit 2766f1821600cc7562bae2128ad0b163f744c5d9 upstream. + +commit 0d8359620d9b ("zram: support page writeback") introduced two +problems. It overwrites writeback_store's return value as kstrtol's +return value, which makes return value zero so user could see zero as +return value of write syscall even though it wrote data successfully. + +It also breaks index value in the loop in that it doesn't increase the +index any longer. It means it can write only first starting block index +so user couldn't write all idle pages in the zram so lose memory saving +chance. + +This patch fixes those issues. 
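The loop problem is easy to see in isolation; a stand-alone user-space model (writeback reduced to a printf, names invented) shows why the old form only ever touches the starting block while the fixed form advances the index together with the remaining count:

    #include <stdio.h>

    static void writeback_block(long index)
    {
            printf("writeback block %ld\n", index);
    }

    int main(void)
    {
            long index = 7, nr_pages = 3;

            /* broken shape: the count drops but the index never advances */
            for (long i = index, n = nr_pages; n--; )
                    writeback_block(i);             /* 7, 7, 7 */

            /* fixed shape: advance the index with the remaining page count */
            for (long i = index, n = nr_pages; n != 0; i++, n--)
                    writeback_block(i);             /* 7, 8, 9 */

            return 0;
    }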
+ +Link: https://lkml.kernel.org/r/20210312173949.2197662-2-minchan@kernel.org +Fixes: 0d8359620d9b("zram: support page writeback") +Signed-off-by: Minchan Kim +Reported-by: Amos Bianchi +Cc: Sergey Senozhatsky +Cc: John Dias +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/zram/zram_drv.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -639,8 +639,8 @@ static ssize_t writeback_store(struct de + if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) + return -EINVAL; + +- ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index); +- if (ret || index >= nr_pages) ++ if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || ++ index >= nr_pages) + return -EINVAL; + + nr_pages = 1; +@@ -664,7 +664,7 @@ static ssize_t writeback_store(struct de + goto release_init_lock; + } + +- while (nr_pages--) { ++ for (; nr_pages != 0; index++, nr_pages--) { + struct bio_vec bvec; + + bvec.bv_page = page; diff --git a/queue-5.11/zram-fix-return-value-on-writeback_store.patch b/queue-5.11/zram-fix-return-value-on-writeback_store.patch new file mode 100644 index 00000000000..81c625d88d7 --- /dev/null +++ b/queue-5.11/zram-fix-return-value-on-writeback_store.patch @@ -0,0 +1,60 @@ +From 57e0076e6575a7b7cef620a0bd2ee2549ef77818 Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Fri, 12 Mar 2021 21:08:38 -0800 +Subject: zram: fix return value on writeback_store + +From: Minchan Kim + +commit 57e0076e6575a7b7cef620a0bd2ee2549ef77818 upstream. + +writeback_store's return value is overwritten by submit_bio_wait's return +value. Thus, writeback_store will return zero since there was no IO +error. In the end, write syscall from userspace will see the zero as +return value, which could make the process stall to keep trying the write +until it will succeed. + +Link: https://lkml.kernel.org/r/20210312173949.2197662-1-minchan@kernel.org +Fixes: 3b82a051c101("drivers/block/zram/zram_drv.c: fix error return codes not being returned in writeback_store") +Signed-off-by: Minchan Kim +Cc: Sergey Senozhatsky +Cc: Colin Ian King +Cc: John Dias +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/zram/zram_drv.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -628,7 +628,7 @@ static ssize_t writeback_store(struct de + struct bio_vec bio_vec; + struct page *page; + ssize_t ret = len; +- int mode; ++ int mode, err; + unsigned long blk_idx = 0; + + if (sysfs_streq(buf, "idle")) +@@ -729,12 +729,17 @@ static ssize_t writeback_store(struct de + * XXX: A single page IO would be inefficient for write + * but it would be not bad as starter. + */ +- ret = submit_bio_wait(&bio); +- if (ret) { ++ err = submit_bio_wait(&bio); ++ if (err) { + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_slot_unlock(zram, index); ++ /* ++ * Return last IO error unless every IO were ++ * not suceeded. ++ */ ++ ret = err; + continue; + } +