From: Greg Kroah-Hartman Date: Mon, 15 Mar 2021 09:05:51 +0000 (+0100) Subject: 5.10-stable patches X-Git-Tag: v4.4.262~30 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3053e6707c4c8e20b7d6230360d410623666c57d;p=thirdparty%2Fkernel%2Fstable-queue.git 5.10-stable patches added patches: binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch kvm-arm64-fix-exclusive-limit-for-ipa-size.patch kvm-arm64-fix-range-alignment-when-walking-page-tables.patch kvm-arm64-nvhe-save-the-spe-context-early.patch kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch linux-compiler-clang.h-define-have_builtin_bswap.patch mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch mm-memcg-set-memcg-when-splitting-page.patch mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch powerpc-fix-inverted-set_full_regs-bitop.patch powerpc-fix-missing-declaration-of-able_kernel_vsx.patch sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch x86-sev-es-introduce-ip_within_syscall_gap-helper.patch x86-sev-es-use-__copy_from_user_inatomic.patch x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch zram-fix-return-value-on-writeback_store.patch --- diff --git a/queue-5.10/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch b/queue-5.10/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch new file mode 100644 index 00000000000..5b3017bbcbb --- /dev/null +++ b/queue-5.10/binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch @@ -0,0 +1,118 @@ +From e7850f4d844e0acfac7e570af611d89deade3146 Mon Sep 17 00:00:00 2001 +From: Lior Ribak +Date: Fri, 12 Mar 2021 21:07:41 -0800 +Subject: binfmt_misc: fix possible deadlock in bm_register_write + +From: Lior Ribak + +commit e7850f4d844e0acfac7e570af611d89deade3146 upstream. + +There is a deadlock in bm_register_write: + +First, in the begining of the function, a lock is taken on the binfmt_misc +root inode with inode_lock(d_inode(root)). + +Then, if the user used the MISC_FMT_OPEN_FILE flag, the function will call +open_exec on the user-provided interpreter. 
+ +open_exec will call a path lookup, and if the path lookup process includes +the root of binfmt_misc, it will try to take a shared lock on its inode +again, but it is already locked, and the code will get stuck in a deadlock + +To reproduce the bug: +$ echo ":iiiii:E::ii::/proc/sys/fs/binfmt_misc/bla:F" > /proc/sys/fs/binfmt_misc/register + +backtrace of where the lock occurs (#5): +0 schedule () at ./arch/x86/include/asm/current.h:15 +1 0xffffffff81b51237 in rwsem_down_read_slowpath (sem=0xffff888003b202e0, count=, state=state@entry=2) at kernel/locking/rwsem.c:992 +2 0xffffffff81b5150a in __down_read_common (state=2, sem=) at kernel/locking/rwsem.c:1213 +3 __down_read (sem=) at kernel/locking/rwsem.c:1222 +4 down_read (sem=) at kernel/locking/rwsem.c:1355 +5 0xffffffff811ee22a in inode_lock_shared (inode=) at ./include/linux/fs.h:783 +6 open_last_lookups (op=0xffffc9000022fe34, file=0xffff888004098600, nd=0xffffc9000022fd10) at fs/namei.c:3177 +7 path_openat (nd=nd@entry=0xffffc9000022fd10, op=op@entry=0xffffc9000022fe34, flags=flags@entry=65) at fs/namei.c:3366 +8 0xffffffff811efe1c in do_filp_open (dfd=, pathname=pathname@entry=0xffff8880031b9000, op=op@entry=0xffffc9000022fe34) at fs/namei.c:3396 +9 0xffffffff811e493f in do_open_execat (fd=fd@entry=-100, name=name@entry=0xffff8880031b9000, flags=, flags@entry=0) at fs/exec.c:913 +10 0xffffffff811e4a92 in open_exec (name=) at fs/exec.c:948 +11 0xffffffff8124aa84 in bm_register_write (file=, buffer=, count=19, ppos=) at fs/binfmt_misc.c:682 +12 0xffffffff811decd2 in vfs_write (file=file@entry=0xffff888004098500, buf=buf@entry=0xa758d0 ":iiiii:E::ii::i:CF +", count=count@entry=19, pos=pos@entry=0xffffc9000022ff10) at fs/read_write.c:603 +13 0xffffffff811defda in ksys_write (fd=, buf=0xa758d0 ":iiiii:E::ii::i:CF +", count=19) at fs/read_write.c:658 +14 0xffffffff81b49813 in do_syscall_64 (nr=, regs=0xffffc9000022ff58) at arch/x86/entry/common.c:46 +15 0xffffffff81c0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120 + +To solve the issue, the open_exec call is moved to before the write +lock is taken by bm_register_write + +Link: https://lkml.kernel.org/r/20210228224414.95962-1-liorribak@gmail.com +Fixes: 948b701a607f1 ("binfmt_misc: add persistent opened binary handler for containers") +Signed-off-by: Lior Ribak +Acked-by: Helge Deller +Cc: Al Viro +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/binfmt_misc.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +--- a/fs/binfmt_misc.c ++++ b/fs/binfmt_misc.c +@@ -647,12 +647,24 @@ static ssize_t bm_register_write(struct + struct super_block *sb = file_inode(file)->i_sb; + struct dentry *root = sb->s_root, *dentry; + int err = 0; ++ struct file *f = NULL; + + e = create_entry(buffer, count); + + if (IS_ERR(e)) + return PTR_ERR(e); + ++ if (e->flags & MISC_FMT_OPEN_FILE) { ++ f = open_exec(e->interpreter); ++ if (IS_ERR(f)) { ++ pr_notice("register: failed to install interpreter file %s\n", ++ e->interpreter); ++ kfree(e); ++ return PTR_ERR(f); ++ } ++ e->interp_file = f; ++ } ++ + inode_lock(d_inode(root)); + dentry = lookup_one_len(e->name, root, strlen(e->name)); + err = PTR_ERR(dentry); +@@ -676,21 +688,6 @@ static ssize_t bm_register_write(struct + goto out2; + } + +- if (e->flags & MISC_FMT_OPEN_FILE) { +- struct file *f; +- +- f = open_exec(e->interpreter); +- if (IS_ERR(f)) { +- err = PTR_ERR(f); +- pr_notice("register: failed to install interpreter file %s\n", 
e->interpreter); +- simple_release_fs(&bm_mnt, &entry_count); +- iput(inode); +- inode = NULL; +- goto out2; +- } +- e->interp_file = f; +- } +- + e->dentry = dget(dentry); + inode->i_private = e; + inode->i_fop = &bm_entry_operations; +@@ -707,6 +704,8 @@ out: + inode_unlock(d_inode(root)); + + if (err) { ++ if (f) ++ filp_close(f, NULL); + kfree(e); + return err; + } diff --git a/queue-5.10/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch b/queue-5.10/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch new file mode 100644 index 00000000000..a99106f1984 --- /dev/null +++ b/queue-5.10/efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch @@ -0,0 +1,59 @@ +From 9e9888a0fe97b9501a40f717225d2bef7100a2c1 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Fri, 5 Mar 2021 10:21:05 +0100 +Subject: efi: stub: omit SetVirtualAddressMap() if marked unsupported in RT_PROP table + +From: Ard Biesheuvel + +commit 9e9888a0fe97b9501a40f717225d2bef7100a2c1 upstream. + +The EFI_RT_PROPERTIES_TABLE contains a mask of runtime services that are +available after ExitBootServices(). This mostly does not concern the EFI +stub at all, given that it runs before that. However, there is one call +that is made at runtime, which is the call to SetVirtualAddressMap() +(which is not even callable at boot time to begin with) + +So add the missing handling of the RT_PROP table to ensure that we only +call SetVirtualAddressMap() if it is not being advertised as unsupported +by the firmware. + +Cc: # v5.10+ +Tested-by: Shawn Guo +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/efi-stub.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/firmware/efi/libstub/efi-stub.c ++++ b/drivers/firmware/efi/libstub/efi-stub.c +@@ -96,6 +96,18 @@ static void install_memreserve_table(voi + efi_err("Failed to install memreserve config table!\n"); + } + ++static u32 get_supported_rt_services(void) ++{ ++ const efi_rt_properties_table_t *rt_prop_table; ++ u32 supported = EFI_RT_SUPPORTED_ALL; ++ ++ rt_prop_table = get_efi_config_table(EFI_RT_PROPERTIES_TABLE_GUID); ++ if (rt_prop_table) ++ supported &= rt_prop_table->runtime_services_supported; ++ ++ return supported; ++} ++ + /* + * EFI entry point for the arm/arm64 EFI stubs. This is the entrypoint + * that is described in the PE/COFF header. 
Most of the code is the same +@@ -250,6 +262,10 @@ efi_status_t __efiapi efi_pe_entry(efi_h + (prop_tbl->memory_protection_attribute & + EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA); + ++ /* force efi_novamap if SetVirtualAddressMap() is unsupported */ ++ efi_novamap |= !(get_supported_rt_services() & ++ EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP); ++ + /* hibernation expects the runtime regions to stay in the same place */ + if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) { + /* diff --git a/queue-5.10/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch b/queue-5.10/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch new file mode 100644 index 00000000000..7ae1b1e66a5 --- /dev/null +++ b/queue-5.10/kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch @@ -0,0 +1,47 @@ +From 31948332d5fa392ad933f4a6a10026850649ed76 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Fri, 5 Mar 2021 18:52:48 +0000 +Subject: KVM: arm64: Avoid corrupting vCPU context register in guest exit + +From: Will Deacon + +commit 31948332d5fa392ad933f4a6a10026850649ed76 upstream. + +Commit 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest +context") tracks the currently running vCPU, clearing the pointer to +NULL on exit from a guest. + +Unfortunately, the use of 'set_loaded_vcpu' clobbers x1 to point at the +kvm_hyp_ctxt instead of the vCPU context, causing the subsequent RAS +code to go off into the weeds when it saves the DISR assuming that the +CPU context is embedded in a struct vCPU. + +Leave x1 alone and use x3 as a temporary register instead when clearing +the vCPU on the guest exit path. + +Cc: Marc Zyngier +Cc: Andrew Scull +Cc: +Fixes: 7db21530479f ("KVM: arm64: Restore hyp when panicking in guest context") +Suggested-by: Quentin Perret +Signed-off-by: Will Deacon +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210226181211.14542-1-will@kernel.org +Message-Id: <20210305185254.3730990-3-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/entry.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/kvm/hyp/entry.S ++++ b/arch/arm64/kvm/hyp/entry.S +@@ -146,7 +146,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOB + // Now restore the hyp regs + restore_callee_saved_regs x2 + +- set_loaded_vcpu xzr, x1, x2 ++ set_loaded_vcpu xzr, x2, x3 + + alternative_if ARM64_HAS_RAS_EXTN + // If we have the RAS extensions we can consume a pending error diff --git a/queue-5.10/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch b/queue-5.10/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch new file mode 100644 index 00000000000..e08b71bc576 --- /dev/null +++ b/queue-5.10/kvm-arm64-fix-exclusive-limit-for-ipa-size.patch @@ -0,0 +1,44 @@ +From 262b003d059c6671601a19057e9fe1a5e7f23722 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 11 Mar 2021 10:00:16 +0000 +Subject: KVM: arm64: Fix exclusive limit for IPA size + +From: Marc Zyngier + +commit 262b003d059c6671601a19057e9fe1a5e7f23722 upstream. + +When registering a memslot, we check the size and location of that +memslot against the IPA size to ensure that we can provide guest +access to the whole of the memory. + +Unfortunately, this check rejects memslot that end-up at the exact +limit of the addressing capability for a given IPA size. For example, +it refuses the creation of a 2GB memslot at 0x8000000 with a 32bit +IPA space. 
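The off-by-one nature of the old check is easier to see with concrete numbers. A minimal worked example follows (assuming 4 KiB pages and a slot base of 0x80000000 so that a 2 GiB slot ends exactly at the 4 GiB boundary of a 32-bit IPA space; the figures are illustrative only, not taken from the patch):

    /* Worked example: 32-bit IPA space, PAGE_SHIFT == 12 (4 KiB pages). */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long ipa_size  = 1ULL << 32;            /* kvm_phys_size()    */
        unsigned long long limit_gfn = ipa_size >> 12;        /* 0x100000 pages     */
        unsigned long long base_gfn  = 0x80000000ULL >> 12;   /* slot base at 2 GiB */
        unsigned long long npages    = (2ULL << 30) >> 12;    /* 2 GiB slot         */

        /* The old ">=" check rejected this slot even though it fits exactly: */
        printf("old (>=) rejects: %d\n", base_gfn + npages >= limit_gfn);  /* 1 */
        /* The relaxed ">" check from this patch accepts it: */
        printf("new (>)  rejects: %d\n", base_gfn + npages >  limit_gfn);  /* 0 */
        return 0;
    }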
+ +Fix it by relaxing the check to accept a memslot reaching the +limit of the IPA space. + +Fixes: c3058d5da222 ("arm/arm64: KVM: Ensure memslots are within KVM_PHYS_SIZE") +Reviewed-by: Eric Auger +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Reviewed-by: Andrew Jones +Link: https://lore.kernel.org/r/20210311100016.3830038-3-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/mmu.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1309,8 +1309,7 @@ int kvm_arch_prepare_memory_region(struc + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. + */ +- if (memslot->base_gfn + memslot->npages >= +- (kvm_phys_size(kvm) >> PAGE_SHIFT)) ++ if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) + return -EFAULT; + + mmap_read_lock(current->mm); diff --git a/queue-5.10/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch b/queue-5.10/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch new file mode 100644 index 00000000000..4b93f2496f6 --- /dev/null +++ b/queue-5.10/kvm-arm64-fix-range-alignment-when-walking-page-tables.patch @@ -0,0 +1,44 @@ +From 357ad203d45c0f9d76a8feadbd5a1c5d460c638b Mon Sep 17 00:00:00 2001 +From: Jia He +Date: Fri, 5 Mar 2021 18:52:54 +0000 +Subject: KVM: arm64: Fix range alignment when walking page tables + +From: Jia He + +commit 357ad203d45c0f9d76a8feadbd5a1c5d460c638b upstream. + +When walking the page tables at a given level, and if the start +address for the range isn't aligned for that level, we propagate +the misalignment on each iteration at that level. + +This results in the walker ignoring a number of entries (depending +on the original misalignment) on each subsequent iteration. + +Properly aligning the address before the next iteration addresses +this issue. + +Cc: stable@vger.kernel.org +Reported-by: Howard Zhang +Acked-by: Will Deacon +Signed-off-by: Jia He +Fixes: b1e57de62cfb ("KVM: arm64: Add stand-alone page-table walker infrastructure") +[maz: rewrite commit message] +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210303024225.2591-1-justin.he@arm.com +Message-Id: <20210305185254.3730990-9-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/pgtable.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm64/kvm/hyp/pgtable.c ++++ b/arch/arm64/kvm/hyp/pgtable.c +@@ -225,6 +225,7 @@ static inline int __kvm_pgtable_visit(st + goto out; + + if (!table) { ++ data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); + data->addr += kvm_granule_size(level); + goto out; + } diff --git a/queue-5.10/kvm-arm64-nvhe-save-the-spe-context-early.patch b/queue-5.10/kvm-arm64-nvhe-save-the-spe-context-early.patch new file mode 100644 index 00000000000..fdcbf32d135 --- /dev/null +++ b/queue-5.10/kvm-arm64-nvhe-save-the-spe-context-early.patch @@ -0,0 +1,120 @@ +From b96b0c5de685df82019e16826a282d53d86d112c Mon Sep 17 00:00:00 2001 +From: Suzuki K Poulose +Date: Fri, 5 Mar 2021 18:52:47 +0000 +Subject: KVM: arm64: nvhe: Save the SPE context early + +From: Suzuki K Poulose + +commit b96b0c5de685df82019e16826a282d53d86d112c upstream. + +The nVHE KVM hyp drains and disables the SPE buffer, before +entering the guest, as the EL1&0 translation regime +is going to be loaded with that of the guest. 
+ +But this operation is performed way too late, because : + - The owning translation regime of the SPE buffer + is transferred to EL2. (MDCR_EL2_E2PB == 0) + - The guest Stage1 is loaded. + +Thus the flush could use the host EL1 virtual address, +but use the EL2 translations instead of host EL1, for writing +out any cached data. + +Fix this by moving the SPE buffer handling early enough. +The restore path is doing the right thing. + +Fixes: 014c4c77aad7 ("KVM: arm64: Improve debug register save/restore flow") +Cc: stable@vger.kernel.org +Cc: Christoffer Dall +Cc: Marc Zyngier +Cc: Will Deacon +Cc: Catalin Marinas +Cc: Mark Rutland +Cc: Alexandru Elisei +Reviewed-by: Alexandru Elisei +Signed-off-by: Suzuki K Poulose +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210302120345.3102874-1-suzuki.poulose@arm.com +Message-Id: <20210305185254.3730990-2-maz@kernel.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_hyp.h | 5 +++++ + arch/arm64/kvm/hyp/nvhe/debug-sr.c | 12 ++++++++++-- + arch/arm64/kvm/hyp/nvhe/switch.c | 11 ++++++++++- + 3 files changed, 25 insertions(+), 3 deletions(-) + +--- a/arch/arm64/include/asm/kvm_hyp.h ++++ b/arch/arm64/include/asm/kvm_hyp.h +@@ -82,6 +82,11 @@ void sysreg_restore_guest_state_vhe(stru + void __debug_switch_to_guest(struct kvm_vcpu *vcpu); + void __debug_switch_to_host(struct kvm_vcpu *vcpu); + ++#ifdef __KVM_NVHE_HYPERVISOR__ ++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); ++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); ++#endif ++ + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); + void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); + +--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c ++++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c +@@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmsc + write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); + } + +-void __debug_switch_to_guest(struct kvm_vcpu *vcpu) ++void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) + { + /* Disable and flush SPE data generation */ + __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); ++} ++ ++void __debug_switch_to_guest(struct kvm_vcpu *vcpu) ++{ + __debug_switch_to_guest_common(vcpu); + } + +-void __debug_switch_to_host(struct kvm_vcpu *vcpu) ++void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) + { + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); ++} ++ ++void __debug_switch_to_host(struct kvm_vcpu *vcpu) ++{ + __debug_switch_to_host_common(vcpu); + } + +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -188,6 +188,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu + pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); + + __sysreg_save_state_nvhe(host_ctxt); ++ /* ++ * We must flush and disable the SPE buffer for nVHE, as ++ * the translation regime(EL1&0) is going to be loaded with ++ * that of the guest. And we must do this before we change the ++ * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and ++ * before we load guest Stage1. ++ */ ++ __debug_save_host_buffers_nvhe(vcpu); + + /* + * We must restore the 32-bit state before the sysregs, thanks +@@ -228,11 +236,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu + if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + __fpsimd_save_fpexc32(vcpu); + ++ __debug_switch_to_host(vcpu); + /* + * This must come after restoring the host sysregs, since a non-VHE + * system may enable SPE here and make use of the TTBRs. 
+ */ +- __debug_switch_to_host(vcpu); ++ __debug_restore_host_buffers_nvhe(vcpu); + + if (pmu_switch_needed) + __pmu_switch_to_host(host_ctxt); diff --git a/queue-5.10/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch b/queue-5.10/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch new file mode 100644 index 00000000000..821d59d514c --- /dev/null +++ b/queue-5.10/kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch @@ -0,0 +1,88 @@ +From 7d717558dd5ef10d28866750d5c24ff892ea3778 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Thu, 11 Mar 2021 10:00:15 +0000 +Subject: KVM: arm64: Reject VM creation when the default IPA size is unsupported + +From: Marc Zyngier + +commit 7d717558dd5ef10d28866750d5c24ff892ea3778 upstream. + +KVM/arm64 has forever used a 40bit default IPA space, partially +due to its 32bit heritage (where the only choice is 40bit). + +However, there are implementations in the wild that have a *cough* +much smaller *cough* IPA space, which leads to a misprogramming of +VTCR_EL2, and a guest that is stuck on its first memory access +if userspace dares to ask for the default IPA setting (which most +VMMs do). + +Instead, blundly reject the creation of such VM, as we can't +satisfy the requirements from userspace (with a one-off warning). +Also clarify the boot warning, and document that the VM creation +will fail when an unsupported IPA size is provided. + +Although this is an ABI change, it doesn't really change much +for userspace: + +- the guest couldn't run before this change, but no error was + returned. At least userspace knows what is happening. + +- a memory slot that was accepted because it did fit the default + IPA space now doesn't even get a chance to be registered. + +The other thing that is left doing is to convince userspace to +actually use the IPA space setting instead of relying on the +antiquated default. + +Fixes: 233a7cb23531 ("kvm: arm64: Allow tuning the physical address size for VM") +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Reviewed-by: Andrew Jones +Reviewed-by: Eric Auger +Link: https://lore.kernel.org/r/20210311100016.3830038-2-maz@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/virt/kvm/api.rst | 3 +++ + arch/arm64/kvm/reset.c | 12 ++++++++---- + 2 files changed, 11 insertions(+), 4 deletions(-) + +--- a/Documentation/virt/kvm/api.rst ++++ b/Documentation/virt/kvm/api.rst +@@ -182,6 +182,9 @@ is dependent on the CPU capability and t + be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION + ioctl() at run-time. + ++Creation of the VM will fail if the requested IPA size (whether it is ++implicit or explicit) is unsupported on the host. ++ + Please note that configuring the IPA size does not affect the capability + exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects + size of the address translated by the stage2 level (guest physical to +--- a/arch/arm64/kvm/reset.c ++++ b/arch/arm64/kvm/reset.c +@@ -373,10 +373,9 @@ int kvm_set_ipa_limit(void) + } + + kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); +- WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, +- "KVM IPA Size Limit (%d bits) is smaller than default size\n", +- kvm_ipa_limit); +- kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); ++ kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, ++ ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? 
++ " (Reduced IPA size, limited VM/VMM compatibility)" : "")); + + return 0; + } +@@ -405,6 +404,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; ++ if (phys_shift > kvm_ipa_limit) { ++ pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", ++ current->comm); ++ return -EINVAL; ++ } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/queue-5.10/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch b/queue-5.10/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch new file mode 100644 index 00000000000..6471d48af16 --- /dev/null +++ b/queue-5.10/kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch @@ -0,0 +1,78 @@ +From d7eb79c6290c7ae4561418544072e0a3266e7384 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Wed, 24 Feb 2021 09:37:29 +0800 +Subject: KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged + +From: Wanpeng Li + +commit d7eb79c6290c7ae4561418544072e0a3266e7384 upstream. + +# lscpu +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Byte Order: Little Endian +CPU(s): 88 +On-line CPU(s) list: 0-63 +Off-line CPU(s) list: 64-87 + +# cat /proc/cmdline +BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro +rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall + +# echo 1 > /sys/devices/system/cpu/cpu76/online +-bash: echo: write error: Cannot allocate memory + +The per-cpu vsyscall pvclock data pointer assigns either an element of the +static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory +hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if +kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in +kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall +and sometimes you end up with vsyscall disabled if the host does something +strange. This patch fixes it by allocating this dynamically memory +unconditionally even if vsyscall is disabled. 
+ +Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") +Reported-by: Zelin Deng +Cc: Brijesh Singh +Cc: stable@vger.kernel.org#v4.19-rc5+ +Signed-off-by: Wanpeng Li +Message-Id: <1614130683-24137-1-git-send-email-wanpengli@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/kvmclock.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/arch/x86/kernel/kvmclock.c ++++ b/arch/x86/kernel/kvmclock.c +@@ -269,21 +269,20 @@ static void __init kvmclock_init_mem(voi + + static int __init kvm_setup_vsyscall_timeinfo(void) + { +-#ifdef CONFIG_X86_64 +- u8 flags; ++ kvmclock_init_mem(); + +- if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) +- return 0; ++#ifdef CONFIG_X86_64 ++ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { ++ u8 flags; + +- flags = pvclock_read_flags(&hv_clock_boot[0].pvti); +- if (!(flags & PVCLOCK_TSC_STABLE_BIT)) +- return 0; ++ flags = pvclock_read_flags(&hv_clock_boot[0].pvti); ++ if (!(flags & PVCLOCK_TSC_STABLE_BIT)) ++ return 0; + +- kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; ++ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; ++ } + #endif + +- kvmclock_init_mem(); +- + return 0; + } + early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/queue-5.10/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch b/queue-5.10/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch new file mode 100644 index 00000000000..05441e9c867 --- /dev/null +++ b/queue-5.10/kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch @@ -0,0 +1,46 @@ +From beda430177f56656e7980dcce93456ffaa35676b Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 18:18:08 -0800 +Subject: KVM: x86: Ensure deadline timer has truly expired before posting its IRQ + +From: Sean Christopherson + +commit beda430177f56656e7980dcce93456ffaa35676b upstream. + +When posting a deadline timer interrupt, open code the checks guarding +__kvm_wait_lapic_expire() in order to skip the lapic_timer_int_injected() +check in kvm_wait_lapic_expire(). The injection check will always fail +since the interrupt has not yet be injected. Moving the call after +injection would also be wrong as that wouldn't actually delay delivery +of the IRQ if it is indeed sent via posted interrupt. + +Fixes: 010fd37fddf6 ("KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210305021808.3769732-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/lapic.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -1641,7 +1641,16 @@ static void apic_timer_expired(struct kv + } + + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { +- kvm_wait_lapic_expire(vcpu); ++ /* ++ * Ensure the guest's timer has truly expired before posting an ++ * interrupt. Open code the relevant checks to avoid querying ++ * lapic_timer_int_injected(), which will be false since the ++ * interrupt isn't yet injected. Waiting until after injecting ++ * is not an option since that won't help a posted interrupt. 
++ */ ++ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && ++ vcpu->arch.apic->lapic_timer.timer_advance_ns) ++ __kvm_wait_lapic_expire(vcpu); + kvm_apic_inject_pending_timer_irqs(apic); + return; + } diff --git a/queue-5.10/linux-compiler-clang.h-define-have_builtin_bswap.patch b/queue-5.10/linux-compiler-clang.h-define-have_builtin_bswap.patch new file mode 100644 index 00000000000..4356e585c48 --- /dev/null +++ b/queue-5.10/linux-compiler-clang.h-define-have_builtin_bswap.patch @@ -0,0 +1,80 @@ +From 97e4910232fa1f81e806aa60c25a0450276d99a2 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Fri, 12 Mar 2021 21:07:47 -0800 +Subject: linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP* + +From: Arnd Bergmann + +commit 97e4910232fa1f81e806aa60c25a0450276d99a2 upstream. + +Separating compiler-clang.h from compiler-gcc.h inadventently dropped the +definitions of the three HAVE_BUILTIN_BSWAP macros, which requires falling +back to the open-coded version and hoping that the compiler detects it. + +Since all versions of clang support the __builtin_bswap interfaces, add +back the flags and have the headers pick these up automatically. + +This results in a 4% improvement of compilation speed for arm defconfig. + +Note: it might also be worth revisiting which architectures set +CONFIG_ARCH_USE_BUILTIN_BSWAP for one compiler or the other, today this is +set on six architectures (arm32, csky, mips, powerpc, s390, x86), while +another ten architectures define custom helpers (alpha, arc, ia64, m68k, +mips, nios2, parisc, sh, sparc, xtensa), and the rest (arm64, h8300, +hexagon, microblaze, nds32, openrisc, riscv) just get the unoptimized +version and rely on the compiler to detect it. + +A long time ago, the compiler builtins were architecture specific, but +nowadays, all compilers that are able to build the kernel have correct +implementations of them, though some may not be as optimized as the inline +asm versions. + +The patch that dropped the optimization landed in v4.19, so as discussed +it would be fairly safe to backport this revert to stable kernels to the +4.19/5.4/5.10 stable kernels, but there is a remaining risk for +regressions, and it has no known side-effects besides compile speed. 
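What the restored macros buy is that the byteorder headers can call the compiler builtin directly instead of hoping the optimizer recognizes the open-coded shift-and-mask pattern. A small stand-alone sketch of the two forms (plain userspace C, not kernel code):

    #include <stdint.h>

    /* Open-coded byte swap: the compiler has to pattern-match this back
     * into a single bswap/rev instruction. */
    static inline uint32_t swab32_open_coded(uint32_t x)
    {
        return ((x & 0x000000ffu) << 24) |
               ((x & 0x0000ff00u) <<  8) |
               ((x & 0x00ff0000u) >>  8) |
               ((x & 0xff000000u) >> 24);
    }

    /* With __HAVE_BUILTIN_BSWAP32__ defined, byteorder code can use the
     * builtin and skip the pattern matching entirely. */
    static inline uint32_t swab32_builtin(uint32_t x)
    {
        return __builtin_bswap32(x);
    }

    int main(void)
    {
        /* Both forms must agree: 0x11223344 -> 0x44332211. */
        return swab32_open_coded(0x11223344u) == swab32_builtin(0x11223344u) ? 0 : 1;
    }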
+ +Link: https://lkml.kernel.org/r/20210226161151.2629097-1-arnd@kernel.org +Link: https://lore.kernel.org/lkml/20210225164513.3667778-1-arnd@kernel.org/ +Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") +Signed-off-by: Arnd Bergmann +Reviewed-by: Nathan Chancellor +Reviewed-by: Kees Cook +Acked-by: Miguel Ojeda +Acked-by: Nick Desaulniers +Acked-by: Luc Van Oostenryck +Cc: Masahiro Yamada +Cc: Nick Hu +Cc: Greentime Hu +Cc: Vincent Chen +Cc: Paul Walmsley +Cc: Palmer Dabbelt +Cc: Albert Ou +Cc: Guo Ren +Cc: Randy Dunlap +Cc: Sami Tolvanen +Cc: Marco Elver +Cc: Arvind Sankar +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/compiler-clang.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/include/linux/compiler-clang.h ++++ b/include/linux/compiler-clang.h +@@ -41,6 +41,12 @@ + #define __no_sanitize_thread + #endif + ++#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ ++ + #if __has_feature(undefined_behavior_sanitizer) + /* GCC does not have __SANITIZE_UNDEFINED__ */ + #define __no_sanitize_undefined \ diff --git a/queue-5.10/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch b/queue-5.10/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch new file mode 100644 index 00000000000..b03361a70a9 --- /dev/null +++ b/queue-5.10/mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch @@ -0,0 +1,82 @@ +From 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Fri, 12 Mar 2021 21:08:06 -0800 +Subject: mm/madvise: replace ptrace attach requirement for process_madvise + +From: Suren Baghdasaryan + +commit 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e upstream. + +process_madvise currently requires ptrace attach capability. +PTRACE_MODE_ATTACH gives one process complete control over another +process. It effectively removes the security boundary between the two +processes (in one direction). Granting ptrace attach capability even to a +system process is considered dangerous since it creates an attack surface. +This severely limits the usage of this API. + +The operations process_madvise can perform do not affect the correctness +of the operation of the target process; they only affect where the data is +physically located (and therefore, how fast it can be accessed). What we +want is the ability for one process to influence another process in order +to optimize performance across the entire system while leaving the +security boundary intact. + +Replace PTRACE_MODE_ATTACH with a combination of PTRACE_MODE_READ and +CAP_SYS_NICE. PTRACE_MODE_READ to prevent leaking ASLR metadata and +CAP_SYS_NICE for influencing process performance. 
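From the caller's side, the practical effect is that a process no longer needs full ptrace-attach rights over the target; PTRACE_MODE_READ-level access plus CAP_SYS_NICE is enough. A minimal, hypothetical caller sketch (syscall numbers and the target address are illustrative assumptions, not taken from the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    #ifndef __NR_pidfd_open
    #define __NR_pidfd_open      434    /* assumed x86-64 number */
    #endif
    #ifndef __NR_process_madvise
    #define __NR_process_madvise 440    /* assumed x86-64 number */
    #endif
    #ifndef MADV_COLD
    #define MADV_COLD            20     /* non-destructive hint */
    #endif

    int main(int argc, char **argv)
    {
        if (argc < 2)
            return 1;

        pid_t target = (pid_t)atoi(argv[1]);
        int pidfd = (int)syscall(__NR_pidfd_open, target, 0);
        if (pidfd < 0) {
            perror("pidfd_open");
            return 1;
        }

        /* Placeholder address; a real caller would pick a region from the
         * target's /proc/<pid>/maps. */
        struct iovec iov = {
            .iov_base = (void *)0x7f0000000000ull,
            .iov_len  = 1 << 20,
        };
        long ret = syscall(__NR_process_madvise, pidfd, &iov, 1, MADV_COLD, 0);

        if (ret < 0)
            perror("process_madvise");  /* EPERM without CAP_SYS_NICE */
        return ret < 0;
    }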
+ +Link: https://lkml.kernel.org/r/20210303185807.2160264-1-surenb@google.com +Signed-off-by: Suren Baghdasaryan +Reviewed-by: Kees Cook +Acked-by: Minchan Kim +Acked-by: David Rientjes +Cc: Jann Horn +Cc: Jeff Vander Stoep +Cc: Michal Hocko +Cc: Shakeel Butt +Cc: Tim Murray +Cc: Florian Weimer +Cc: Oleg Nesterov +Cc: James Morris +Cc: [5.10+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/madvise.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1202,12 +1202,22 @@ SYSCALL_DEFINE5(process_madvise, int, pi + goto release_task; + } + +- mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + ++ /* ++ * Require CAP_SYS_NICE for influencing process performance. Note that ++ * only non-destructive hints are currently supported. ++ */ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { +@@ -1222,6 +1232,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + ++release_mm: + mmput(mm); + release_task: + put_task_struct(task); diff --git a/queue-5.10/mm-memcg-set-memcg-when-splitting-page.patch b/queue-5.10/mm-memcg-set-memcg-when-splitting-page.patch new file mode 100644 index 00000000000..76d97e05ea9 --- /dev/null +++ b/queue-5.10/mm-memcg-set-memcg-when-splitting-page.patch @@ -0,0 +1,61 @@ +From e1baddf8475b06cc56f4bafecf9a32a124343d9f Mon Sep 17 00:00:00 2001 +From: Zhou Guanghui +Date: Fri, 12 Mar 2021 21:08:33 -0800 +Subject: mm/memcg: set memcg when splitting page + +From: Zhou Guanghui + +commit e1baddf8475b06cc56f4bafecf9a32a124343d9f upstream. + +As described in the split_page() comment, for the non-compound high order +page, the sub-pages must be freed individually. If the memcg of the first +page is valid, the tail pages cannot be uncharged when be freed. + +For example, when alloc_pages_exact is used to allocate 1MB continuous +physical memory, 2MB is charged(kmemcg is enabled and __GFP_ACCOUNT is +set). When make_alloc_exact free the unused 1MB and free_pages_exact free +the applied 1MB, actually, only 4KB(one page) is uncharged. + +Therefore, the memcg of the tail page needs to be set when splitting a +page. + +Michel: + +There are at least two explicit users of __GFP_ACCOUNT with +alloc_exact_pages added recently. See 7efe8ef274024 ("KVM: arm64: +Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT") and c419621873713 +("KVM: s390: Add memcg accounting to KVM allocations"), so this is not +just a theoretical issue. + +Link: https://lkml.kernel.org/r/20210304074053.65527-3-zhouguanghui1@huawei.com +Signed-off-by: Zhou Guanghui +Acked-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Shakeel Butt +Acked-by: Michal Hocko +Cc: Hanjun Guo +Cc: Hugh Dickins +Cc: Kefeng Wang +Cc: "Kirill A. 
Shutemov" +Cc: Nicholas Piggin +Cc: Rui Xiang +Cc: Tianhong Ding +Cc: Weilong Chen +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3272,6 +3272,7 @@ void split_page(struct page *page, unsig + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); + split_page_owner(page, 1 << order); ++ split_page_memcg(page, 1 << order); + } + EXPORT_SYMBOL_GPL(split_page); + diff --git a/queue-5.10/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch b/queue-5.10/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch new file mode 100644 index 00000000000..2ba9894ff13 --- /dev/null +++ b/queue-5.10/mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch @@ -0,0 +1,130 @@ +From 6ce64428d62026a10cb5d80138ff2f90cc21d367 Mon Sep 17 00:00:00 2001 +From: Nadav Amit +Date: Fri, 12 Mar 2021 21:08:17 -0800 +Subject: mm/userfaultfd: fix memory corruption due to writeprotect +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Nadav Amit + +commit 6ce64428d62026a10cb5d80138ff2f90cc21d367 upstream. + +Userfaultfd self-test fails occasionally, indicating a memory corruption. + +Analyzing this problem indicates that there is a real bug since mmap_lock +is only taken for read in mwriteprotect_range() and defers flushes, and +since there is insufficient consideration of concurrent deferred TLB +flushes in wp_page_copy(). Although the PTE is flushed from the TLBs in +wp_page_copy(), this flush takes place after the copy has already been +performed, and therefore changes of the page are possible between the time +of the copy and the time in which the PTE is flushed. + +To make matters worse, memory-unprotection using userfaultfd also poses a +problem. Although memory unprotection is logically a promotion of PTE +permissions, and therefore should not require a TLB flush, the current +userrfaultfd code might actually cause a demotion of the architectural PTE +permission: when userfaultfd_writeprotect() unprotects memory region, it +unintentionally *clears* the RW-bit if it was already set. Note that this +unprotecting a PTE that is not write-protected is a valid use-case: the +userfaultfd monitor might ask to unprotect a region that holds both +write-protected and write-unprotected PTEs. + +The scenario that happens in selftests/vm/userfaultfd is as follows: + +cpu0 cpu1 cpu2 +---- ---- ---- + [ Writable PTE + cached in TLB ] +userfaultfd_writeprotect() +[ write-*unprotect* ] +mwriteprotect_range() +mmap_read_lock() +change_protection() + +change_protection_range() +... +change_pte_range() +[ *clear* “write”-bit ] +[ defer TLB flushes ] + [ page-fault ] + ... + wp_page_copy() + cow_user_page() + [ copy page ] + [ write to old + page ] + ... + set_pte_at_notify() + +A similar scenario can happen: + +cpu0 cpu1 cpu2 cpu3 +---- ---- ---- ---- + [ Writable PTE + cached in TLB ] +userfaultfd_writeprotect() +[ write-protect ] +[ deferred TLB flush ] + userfaultfd_writeprotect() + [ write-unprotect ] + [ deferred TLB flush] + [ page-fault ] + wp_page_copy() + cow_user_page() + [ copy page ] + ... [ write to page ] + set_pte_at_notify() + +This race exists since commit 292924b26024 ("userfaultfd: wp: apply +_PAGE_UFFD_WP bit"). 
Yet, as Yu Zhao pointed, these races became apparent +since commit 09854ba94c6a ("mm: do_wp_page() simplification") which made +wp_page_copy() more likely to take place, specifically if page_count(page) +> 1. + +To resolve the aforementioned races, check whether there are pending +flushes on uffd-write-protected VMAs, and if there are, perform a flush +before doing the COW. + +Further optimizations will follow to avoid during uffd-write-unprotect +unnecassary PTE write-protection and TLB flushes. + +Link: https://lkml.kernel.org/r/20210304095423.3825684-1-namit@vmware.com +Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") +Signed-off-by: Nadav Amit +Suggested-by: Yu Zhao +Reviewed-by: Peter Xu +Tested-by: Peter Xu +Cc: Andrea Arcangeli +Cc: Andy Lutomirski +Cc: Pavel Emelyanov +Cc: Mike Kravetz +Cc: Mike Rapoport +Cc: Minchan Kim +Cc: Will Deacon +Cc: Peter Zijlstra +Cc: [5.9+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3090,6 +3090,14 @@ static vm_fault_t do_wp_page(struct vm_f + return handle_userfault(vmf, VM_UFFD_WP); + } + ++ /* ++ * Userfaultfd write-protect can defer flushes. Ensure the TLB ++ * is flushed in this case before copying. ++ */ ++ if (unlikely(userfaultfd_wp(vmf->vma) && ++ mm_tlb_flush_pending(vmf->vma->vm_mm))) ++ flush_tlb_page(vmf->vma, vmf->address); ++ + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { + /* diff --git a/queue-5.10/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch b/queue-5.10/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch new file mode 100644 index 00000000000..e799db977a6 --- /dev/null +++ b/queue-5.10/powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch @@ -0,0 +1,36 @@ +From cea15316ceee2d4a51dfdecd79e08a438135416c Mon Sep 17 00:00:00 2001 +From: "Naveen N. Rao" +Date: Thu, 4 Mar 2021 07:34:11 +0530 +Subject: powerpc/64s: Fix instruction encoding for lis in ppc_function_entry() + +From: Naveen N. Rao + +commit cea15316ceee2d4a51dfdecd79e08a438135416c upstream. + +'lis r2,N' is 'addis r2,0,N' and the instruction encoding in the macro +LIS_R2 is incorrect (it currently maps to 'addis r0,r2,N'). Fix the +same. + +Fixes: c71b7eff426f ("powerpc: Add ABIv2 support to ppc_function_entry") +Cc: stable@vger.kernel.org # v3.16+ +Reported-by: Jiri Olsa +Signed-off-by: Naveen N. 
Rao +Acked-by: Segher Boessenkool +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210304020411.16796-1-naveen.n.rao@linux.vnet.ibm.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/code-patching.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/powerpc/include/asm/code-patching.h ++++ b/arch/powerpc/include/asm/code-patching.h +@@ -73,7 +73,7 @@ void __patch_exception(int exc, unsigned + #endif + + #define OP_RT_RA_MASK 0xffff0000UL +-#define LIS_R2 0x3c020000UL ++#define LIS_R2 0x3c400000UL + #define ADDIS_R2_R12 0x3c4c0000UL + #define ADDI_R2_R2 0x38420000UL + diff --git a/queue-5.10/powerpc-fix-inverted-set_full_regs-bitop.patch b/queue-5.10/powerpc-fix-inverted-set_full_regs-bitop.patch new file mode 100644 index 00000000000..fcc71550d9c --- /dev/null +++ b/queue-5.10/powerpc-fix-inverted-set_full_regs-bitop.patch @@ -0,0 +1,45 @@ +From 73ac79881804eed2e9d76ecdd1018037f8510cb1 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Mon, 8 Mar 2021 18:55:30 +1000 +Subject: powerpc: Fix inverted SET_FULL_REGS bitop + +From: Nicholas Piggin + +commit 73ac79881804eed2e9d76ecdd1018037f8510cb1 upstream. + +This bit operation was inverted and set the low bit rather than +cleared it, breaking the ability to ptrace non-volatile GPRs after +exec. Fix. + +Only affects 64e and 32-bit. + +Fixes: feb9df3462e6 ("powerpc/64s: Always has full regs, so remove remnant checks") +Cc: stable@vger.kernel.org # v5.8+ +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210308085530.3191843-1-npiggin@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/ptrace.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/include/asm/ptrace.h ++++ b/arch/powerpc/include/asm/ptrace.h +@@ -193,7 +193,7 @@ extern int ptrace_put_reg(struct task_st + #define TRAP_FLAGS_MASK 0x11 + #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) + #define FULL_REGS(regs) (((regs)->trap & 1) == 0) +-#define SET_FULL_REGS(regs) ((regs)->trap |= 1) ++#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) + #endif + #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) + #define NV_REG_POISON 0xdeadbeefdeadbeefUL +@@ -208,7 +208,7 @@ extern int ptrace_put_reg(struct task_st + #define TRAP_FLAGS_MASK 0x1F + #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) + #define FULL_REGS(regs) (((regs)->trap & 1) == 0) +-#define SET_FULL_REGS(regs) ((regs)->trap |= 1) ++#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) + #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) + #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) + #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) diff --git a/queue-5.10/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch b/queue-5.10/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch new file mode 100644 index 00000000000..609935d750d --- /dev/null +++ b/queue-5.10/powerpc-fix-missing-declaration-of-able_kernel_vsx.patch @@ -0,0 +1,83 @@ +From bd73758803c2eedc037c2268b65a19542a832594 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Tue, 9 Mar 2021 08:39:39 +0000 +Subject: powerpc: Fix missing declaration of [en/dis]able_kernel_vsx() + +From: Christophe Leroy + +commit bd73758803c2eedc037c2268b65a19542a832594 upstream. + +Add stub instances of enable_kernel_vsx() and disable_kernel_vsx() +when CONFIG_VSX is not set, to avoid following build failure. 
+ + CC [M] drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o + In file included from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services_types.h:29, + from ./drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services.h:37, + from drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:27: + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c: In function 'dcn_bw_apply_registry_override': + ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:64:3: error: implicit declaration of function 'enable_kernel_vsx'; did you mean 'enable_kernel_fp'? [-Werror=implicit-function-declaration] + 64 | enable_kernel_vsx(); \ + | ^~~~~~~~~~~~~~~~~ + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:640:2: note: in expansion of macro 'DC_FP_START' + 640 | DC_FP_START(); + | ^~~~~~~~~~~ + ./drivers/gpu/drm/amd/amdgpu/../display/dc/os_types.h:75:3: error: implicit declaration of function 'disable_kernel_vsx'; did you mean 'disable_kernel_fp'? [-Werror=implicit-function-declaration] + 75 | disable_kernel_vsx(); \ + | ^~~~~~~~~~~~~~~~~~ + drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.c:676:2: note: in expansion of macro 'DC_FP_END' + 676 | DC_FP_END(); + | ^~~~~~~~~ + cc1: some warnings being treated as errors + make[5]: *** [drivers/gpu/drm/amd/amdgpu/../display/dc/calcs/dcn_calcs.o] Error 1 + +This works because the caller is checking if VSX is available using +cpu_has_feature(): + + #define DC_FP_START() { \ + if (cpu_has_feature(CPU_FTR_VSX_COMP)) { \ + preempt_disable(); \ + enable_kernel_vsx(); \ + } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { \ + preempt_disable(); \ + enable_kernel_fp(); \ + } \ + +When CONFIG_VSX is not selected, cpu_has_feature(CPU_FTR_VSX_COMP) +constant folds to 'false' so the call to enable_kernel_vsx() is +discarded and the build succeeds. + +Fixes: 16a9dea110a6 ("amdgpu: Enable initial DCN support on POWER") +Cc: stable@vger.kernel.org # v5.6+ +Reported-by: Geert Uytterhoeven +Reported-by: kernel test robot +Signed-off-by: Christophe Leroy +[mpe: Incorporate some discussion comments into the change log] +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/8d7d285a027e9d21f5ff7f850fa71a2655b0c4af.1615279170.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/switch_to.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/arch/powerpc/include/asm/switch_to.h ++++ b/arch/powerpc/include/asm/switch_to.h +@@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(vo + { + msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); + } ++#else ++static inline void enable_kernel_vsx(void) ++{ ++ BUILD_BUG(); ++} ++ ++static inline void disable_kernel_vsx(void) ++{ ++ BUILD_BUG(); ++} + #endif + + #ifdef CONFIG_SPE diff --git a/queue-5.10/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch b/queue-5.10/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch new file mode 100644 index 00000000000..af5bfd4d649 --- /dev/null +++ b/queue-5.10/sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch @@ -0,0 +1,41 @@ +From ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 Mon Sep 17 00:00:00 2001 +From: Mathieu Desnoyers +Date: Wed, 17 Feb 2021 11:56:51 -0500 +Subject: sched/membarrier: fix missing local execution of ipi_sync_rq_state() + +From: Mathieu Desnoyers + +commit ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 upstream. 
+ +The function sync_runqueues_membarrier_state() should copy the +membarrier state from the @mm received as parameter to each runqueue +currently running tasks using that mm. + +However, the use of smp_call_function_many() skips the current runqueue, +which is unintended. Replace by a call to on_each_cpu_mask(). + +Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") +Reported-by: Nadav Amit +Signed-off-by: Mathieu Desnoyers +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Ingo Molnar +Cc: stable@vger.kernel.org # 5.4.x+ +Link: https://lore.kernel.org/r/74F1E842-4A84-47BF-B6C2-5407DFDD4A4A@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/membarrier.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/kernel/sched/membarrier.c ++++ b/kernel/sched/membarrier.c +@@ -332,9 +332,7 @@ static int sync_runqueues_membarrier_sta + } + rcu_read_unlock(); + +- preempt_disable(); +- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); +- preempt_enable(); ++ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); diff --git a/queue-5.10/series b/queue-5.10/series index c9631d0cc66..369c42a40ec 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -260,3 +260,27 @@ seqlock-lockdep-fix-seqcount_latch_init.patch stop_machine-mark-helpers-__always_inline.patch include-linux-sched-mm.h-use-rcu_dereference-in-in_v.patch prctl-fix-pr_set_mm_auxv-kernel-stack-leak.patch +zram-fix-return-value-on-writeback_store.patch +linux-compiler-clang.h-define-have_builtin_bswap.patch +sched-membarrier-fix-missing-local-execution-of-ipi_sync_rq_state.patch +efi-stub-omit-setvirtualaddressmap-if-marked-unsupported-in-rt_prop-table.patch +powerpc-64s-fix-instruction-encoding-for-lis-in-ppc_function_entry.patch +powerpc-fix-inverted-set_full_regs-bitop.patch +powerpc-fix-missing-declaration-of-able_kernel_vsx.patch +binfmt_misc-fix-possible-deadlock-in-bm_register_write.patch +x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch +x86-sev-es-introduce-ip_within_syscall_gap-helper.patch +x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch +x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch +x86-sev-es-use-__copy_from_user_inatomic.patch +x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch +kvm-x86-ensure-deadline-timer-has-truly-expired-before-posting-its-irq.patch +kvm-kvmclock-fix-vcpus-64-can-t-be-online-hotpluged.patch +kvm-arm64-fix-range-alignment-when-walking-page-tables.patch +kvm-arm64-avoid-corrupting-vcpu-context-register-in-guest-exit.patch +kvm-arm64-nvhe-save-the-spe-context-early.patch +kvm-arm64-reject-vm-creation-when-the-default-ipa-size-is-unsupported.patch +kvm-arm64-fix-exclusive-limit-for-ipa-size.patch +mm-userfaultfd-fix-memory-corruption-due-to-writeprotect.patch +mm-madvise-replace-ptrace-attach-requirement-for-process_madvise.patch +mm-memcg-set-memcg-when-splitting-page.patch diff --git a/queue-5.10/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch b/queue-5.10/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch new file mode 100644 index 00000000000..919b7e7c1e8 --- /dev/null +++ b/queue-5.10/x86-entry-fix-entry-exit-mismatch-on-failed-fast-32-bit-syscalls.patch @@ -0,0 +1,44 @@ +From 5d5675df792ff67e74a500c4c94db0f99e6a10ef Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 4 Mar 2021 11:05:54 -0800 +Subject: x86/entry: Fix entry/exit mismatch on failed 
fast 32-bit syscalls + +From: Andy Lutomirski + +commit 5d5675df792ff67e74a500c4c94db0f99e6a10ef upstream. + +On a 32-bit fast syscall that fails to read its arguments from user +memory, the kernel currently does syscall exit work but not +syscall entry work. This confuses audit and ptrace. For example: + + $ ./tools/testing/selftests/x86/syscall_arg_fault_32 + ... + strace: pid 264258: entering, ptrace_syscall_info.op == 2 + ... + +This is a minimal fix intended for ease of backporting. A more +complete cleanup is coming. + +Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry") +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.luto@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/entry/common.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32 + regs->ax = -EFAULT; + + instrumentation_end(); +- syscall_exit_to_user_mode(regs); ++ local_irq_disable(); ++ irqentry_exit_to_user_mode(regs); + return false; + } + diff --git a/queue-5.10/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch b/queue-5.10/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch new file mode 100644 index 00000000000..3633af970e0 --- /dev/null +++ b/queue-5.10/x86-sev-es-check-regs-sp-is-trusted-before-adjusting-vc-ist-stack.patch @@ -0,0 +1,65 @@ +From 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:13 +0100 +Subject: x86/sev-es: Check regs->sp is trusted before adjusting #VC IST stack + +From: Joerg Roedel + +commit 545ac14c16b5dbd909d5a90ddf5b5a629a40fa94 upstream. + +The code in the NMI handler to adjust the #VC handler IST stack is +needed in case an NMI hits when the #VC handler is still using its IST +stack. + +But the check for this condition also needs to look if the regs->sp +value is trusted, meaning it was not set by user-space. Extend the check +to not use regs->sp when the NMI interrupted user-space code or the +SYSCALL gap. 
+ +Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") +Reported-by: Andy Lutomirski +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # 5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-3-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/sev-es.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c +index 84c1821819af..301f20f6d4dd 100644 +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + } + +-static __always_inline bool on_vc_stack(unsigned long sp) ++static __always_inline bool on_vc_stack(struct pt_regs *regs) + { ++ unsigned long sp = regs->sp; ++ ++ /* User-mode RSP is not trusted */ ++ if (user_mode(regs)) ++ return false; ++ ++ /* SYSCALL gap still has user-mode RSP */ ++ if (ip_within_syscall_gap(regs)) ++ return false; ++ + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); + } + +@@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) + old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* Make room on the IST stack */ +- if (on_vc_stack(regs->sp)) ++ if (on_vc_stack(regs)) + new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); + else + new_ist = old_ist - sizeof(old_ist); +-- +2.30.2 + diff --git a/queue-5.10/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch b/queue-5.10/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch new file mode 100644 index 00000000000..39a31936ef9 --- /dev/null +++ b/queue-5.10/x86-sev-es-correctly-track-irq-states-in-runtime-vc-handler.patch @@ -0,0 +1,57 @@ +From 62441a1fb53263bda349b6e5997c3cc5c120d89e Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:15 +0100 +Subject: x86/sev-es: Correctly track IRQ states in runtime #VC handler + +From: Joerg Roedel + +commit 62441a1fb53263bda349b6e5997c3cc5c120d89e upstream. + +Call irqentry_nmi_enter()/irqentry_nmi_exit() in the #VC handler to +correctly track the IRQ state during its execution. + +Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler") +Reported-by: Andy Lutomirski +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # v5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-5-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/sev-es.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -1258,13 +1258,12 @@ static __always_inline bool on_vc_fallba + DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) + { + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); ++ irqentry_state_t irq_state; + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + +- lockdep_assert_irqs_disabled(); +- + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. 
+ */ +@@ -1273,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co + return; + } + ++ irq_state = irqentry_nmi_enter(regs); ++ lockdep_assert_irqs_disabled(); + instrumentation_begin(); + + /* +@@ -1335,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_co + + out: + instrumentation_end(); ++ irqentry_nmi_exit(regs, irq_state); + + return; + diff --git a/queue-5.10/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch b/queue-5.10/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch new file mode 100644 index 00000000000..fbc8badf2d8 --- /dev/null +++ b/queue-5.10/x86-sev-es-introduce-ip_within_syscall_gap-helper.patch @@ -0,0 +1,90 @@ +From 78a81d88f60ba773cbe890205e1ee67f00502948 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:12 +0100 +Subject: x86/sev-es: Introduce ip_within_syscall_gap() helper + +From: Joerg Roedel + +commit 78a81d88f60ba773cbe890205e1ee67f00502948 upstream. + +Introduce a helper to check whether an exception came from the syscall +gap and use it in the SEV-ES code. Extend the check to also cover the +compatibility SYSCALL entry path. + +Fixes: 315562c9af3d5 ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # 5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-2-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/entry/entry_64_compat.S | 2 ++ + arch/x86/include/asm/proto.h | 1 + + arch/x86/include/asm/ptrace.h | 15 +++++++++++++++ + arch/x86/kernel/traps.c | 3 +-- + 4 files changed, 19 insertions(+), 2 deletions(-) + +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) + /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ++SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ++ + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ +--- a/arch/x86/include/asm/proto.h ++++ b/arch/x86/include/asm/proto.h +@@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(vo + void entry_SYSENTER_compat(void); + void __end_entry_SYSENTER_compat(void); + void entry_SYSCALL_compat(void); ++void entry_SYSCALL_compat_safe_stack(void); + void entry_INT80_compat(void); + #ifdef CONFIG_XEN_PV + void xen_entry_INT80_compat(void); +--- a/arch/x86/include/asm/ptrace.h ++++ b/arch/x86/include/asm/ptrace.h +@@ -94,6 +94,8 @@ struct pt_regs { + #include + #endif + ++#include ++ + struct cpuinfo_x86; + struct task_struct; + +@@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct + #ifdef CONFIG_X86_64 + #define current_user_stack_pointer() current_pt_regs()->sp + #define compat_user_stack_pointer() current_pt_regs()->sp ++ ++static inline bool ip_within_syscall_gap(struct pt_regs *regs) ++{ ++ bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && ++ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); ++ ++#ifdef CONFIG_IA32_EMULATION ++ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && ++ regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); ++#endif ++ ++ return ret; ++} + #endif + + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -686,8 +686,7 @@ asmlinkage __visible noinstr struct pt_r + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ +- if (regs->ip 
>= (unsigned long)entry_SYSCALL_64 && +- regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { ++ if (ip_within_syscall_gap(regs)) { + sp = this_cpu_read(cpu_current_top_of_stack); + goto sync; + } diff --git a/queue-5.10/x86-sev-es-use-__copy_from_user_inatomic.patch b/queue-5.10/x86-sev-es-use-__copy_from_user_inatomic.patch new file mode 100644 index 00000000000..6e6731d1cef --- /dev/null +++ b/queue-5.10/x86-sev-es-use-__copy_from_user_inatomic.patch @@ -0,0 +1,146 @@ +From bffe30dd9f1f3b2608a87ac909a224d6be472485 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Wed, 3 Mar 2021 15:17:16 +0100 +Subject: x86/sev-es: Use __copy_from_user_inatomic() + +From: Joerg Roedel + +commit bffe30dd9f1f3b2608a87ac909a224d6be472485 upstream. + +The #VC handler must run in atomic context and cannot sleep. This is a +problem when it tries to fetch instruction bytes from user-space via +copy_from_user(). + +Introduce a insn_fetch_from_user_inatomic() helper which uses +__copy_from_user_inatomic() to safely copy the instruction bytes to +kernel memory in the #VC handler. + +Fixes: 5e3427a7bc432 ("x86/sev-es: Handle instruction fetches from user-space") +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org # v5.10+ +Link: https://lkml.kernel.org/r/20210303141716.29223-6-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/insn-eval.h | 2 + + arch/x86/kernel/sev-es.c | 2 +- + arch/x86/lib/insn-eval.c | 66 +++++++++++++++++++++++++------- + 3 files changed, 55 insertions(+), 15 deletions(-) + +diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h +index a0f839aa144d..98b4dae5e8bc 100644 +--- a/arch/x86/include/asm/insn-eval.h ++++ b/arch/x86/include/asm/insn-eval.h +@@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); + int insn_get_code_seg_params(struct pt_regs *regs); + int insn_fetch_from_user(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); ++int insn_fetch_from_user_inatomic(struct pt_regs *regs, ++ unsigned char buf[MAX_INSN_SIZE]); + bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size); + +diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c +index c3fd8fa79838..04a780abb512 100644 +--- a/arch/x86/kernel/sev-es.c ++++ b/arch/x86/kernel/sev-es.c +@@ -258,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) + int res; + + if (user_mode(ctxt->regs)) { +- res = insn_fetch_from_user(ctxt->regs, buffer); ++ res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; +diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c +index 4229950a5d78..bb0b3fe1e0a0 100644 +--- a/arch/x86/lib/insn-eval.c ++++ b/arch/x86/lib/insn-eval.c +@@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) + } + } + ++static unsigned long insn_get_effective_ip(struct pt_regs *regs) ++{ ++ unsigned long seg_base = 0; ++ ++ /* ++ * If not in user-space long mode, a custom code segment could be in ++ * use. This is true in protected mode (if the process defined a local ++ * descriptor table), or virtual-8086 mode. In most of the cases ++ * seg_base will be zero as in USER_CS. 
++ */ ++ if (!user_64bit_mode(regs)) { ++ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); ++ if (seg_base == -1L) ++ return 0; ++ } ++ ++ return seg_base + regs->ip; ++} ++ + /** + * insn_fetch_from_user() - Copy instruction bytes from user-space memory + * @regs: Structure with register values as seen when entering kernel mode +@@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) + */ + int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) + { +- unsigned long seg_base = 0; ++ unsigned long ip; + int not_copied; + +- /* +- * If not in user-space long mode, a custom code segment could be in +- * use. This is true in protected mode (if the process defined a local +- * descriptor table), or virtual-8086 mode. In most of the cases +- * seg_base will be zero as in USER_CS. +- */ +- if (!user_64bit_mode(regs)) { +- seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); +- if (seg_base == -1L) +- return 0; +- } ++ ip = insn_get_effective_ip(regs); ++ if (!ip) ++ return 0; ++ ++ not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + ++ return MAX_INSN_SIZE - not_copied; ++} ++ ++/** ++ * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory ++ * while in atomic code ++ * @regs: Structure with register values as seen when entering kernel mode ++ * @buf: Array to store the fetched instruction ++ * ++ * Gets the linear address of the instruction and copies the instruction bytes ++ * to the buf. This function must be used in atomic context. ++ * ++ * Returns: ++ * ++ * Number of instruction bytes copied. ++ * ++ * 0 if nothing was copied. ++ */ ++int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) ++{ ++ unsigned long ip; ++ int not_copied; ++ ++ ip = insn_get_effective_ip(regs); ++ if (!ip) ++ return 0; + +- not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), +- MAX_INSN_SIZE); ++ not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); + + return MAX_INSN_SIZE - not_copied; + } +-- +2.30.2 + diff --git a/queue-5.10/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch b/queue-5.10/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch new file mode 100644 index 00000000000..b830b893914 --- /dev/null +++ b/queue-5.10/x86-unwind-orc-disable-kasan-checking-in-the-orc-unwinder-part-2.patch @@ -0,0 +1,87 @@ +From e504e74cc3a2c092b05577ce3e8e013fae7d94e6 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 5 Feb 2021 08:24:02 -0600 +Subject: x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2 + +From: Josh Poimboeuf + +commit e504e74cc3a2c092b05577ce3e8e013fae7d94e6 upstream. + +KASAN reserves "redzone" areas between stack frames in order to detect +stack overruns. A read or write to such an area triggers a KASAN +"stack-out-of-bounds" BUG. + +Normally, the ORC unwinder stays in-bounds and doesn't access the +redzone. But sometimes it can't find ORC metadata for a given +instruction. This can happen for code which is missing ORC metadata, or +for generated code. In such cases, the unwinder attempts to fall back +to frame pointers, as a best-effort type thing. + +This fallback often works, but when it doesn't, the unwinder can get +confused and go off into the weeds into the KASAN redzone, triggering +the aforementioned KASAN BUG. + +But in this case, the unwinder's confusion is actually harmless and +working as designed. 
It already has checks in place to prevent +off-stack accesses, but those checks get short-circuited by the KASAN +BUG. And a BUG is a lot more disruptive than a harmless unwinder +warning. + +Disable the KASAN checks by using READ_ONCE_NOCHECK() for all stack +accesses. This finishes the job started by commit 881125bfe65b +("x86/unwind: Disable KASAN checking in the ORC unwinder"), which only +partially fixed the issue. + +Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") +Reported-by: Ivan Babrou +Signed-off-by: Josh Poimboeuf +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Borislav Petkov +Reviewed-by: Steven Rostedt (VMware) +Tested-by: Ivan Babrou +Cc: stable@kernel.org +Link: https://lkml.kernel.org/r/9583327904ebbbeda399eca9c56d6c7085ac20fe.1612534649.git.jpoimboe@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/unwind_orc.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwi + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) + return false; + +- *ip = regs->ip; +- *sp = regs->sp; ++ *ip = READ_ONCE_NOCHECK(regs->ip); ++ *sp = READ_ONCE_NOCHECK(regs->sp); + return true; + } + +@@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + +- *ip = regs->ip; +- *sp = regs->sp; ++ *ip = READ_ONCE_NOCHECK(regs->ip); ++ *sp = READ_ONCE_NOCHECK(regs->sp); + return true; + } + +@@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state + return false; + + if (state->full_regs) { +- *val = ((unsigned long *)state->regs)[reg]; ++ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); + return true; + } + + if (state->prev_regs) { +- *val = ((unsigned long *)state->prev_regs)[reg]; ++ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); + return true; + } + diff --git a/queue-5.10/zram-fix-return-value-on-writeback_store.patch b/queue-5.10/zram-fix-return-value-on-writeback_store.patch new file mode 100644 index 00000000000..e0a476fe428 --- /dev/null +++ b/queue-5.10/zram-fix-return-value-on-writeback_store.patch @@ -0,0 +1,60 @@ +From 57e0076e6575a7b7cef620a0bd2ee2549ef77818 Mon Sep 17 00:00:00 2001 +From: Minchan Kim +Date: Fri, 12 Mar 2021 21:08:38 -0800 +Subject: zram: fix return value on writeback_store + +From: Minchan Kim + +commit 57e0076e6575a7b7cef620a0bd2ee2549ef77818 upstream. + +writeback_store's return value is overwritten by submit_bio_wait's return +value. Thus, writeback_store will return zero since there was no IO +error. In the end, write syscall from userspace will see the zero as +return value, which could make the process stall to keep trying the write +until it will succeed. 
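
[Editor's note, not part of the patch: a tiny user-space model of the return-value handling the fix below adopts in writeback_store(). do_one_io() and writeback_all() are invented stand-ins for submit_bio_wait() and the writeback loop, and -5 is just an example error value (-EIO on Linux); the point is only that per-iteration errors land in a separate variable and are copied into the value eventually returned, so a later successful iteration cannot reset it back to "success".]

#include <stdio.h>

/* Invented stand-in for submit_bio_wait(): every third request "fails". */
static int do_one_io(int idx)
{
	return (idx % 3 == 2) ? -5 : 0;
}

/* Invented stand-in for the writeback_store() loop. */
static long writeback_all(int nr_pages, long len)
{
	long ret = len;		/* optimistic: report the whole write as consumed */
	int err;

	for (int i = 0; i < nr_pages; i++) {
		err = do_one_io(i);
		if (err) {
			ret = err;	/* remember the failure ...       */
			continue;	/* ... but keep draining the rest */
		}
		/* success path leaves ret alone, so an earlier error survives */
	}
	return ret;
}

int main(void)
{
	/* prints "-5", not "4096": the caller sees the I/O error */
	printf("%ld\n", writeback_all(6, 4096));
	return 0;
}
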
+ +Link: https://lkml.kernel.org/r/20210312173949.2197662-1-minchan@kernel.org +Fixes: 3b82a051c101("drivers/block/zram/zram_drv.c: fix error return codes not being returned in writeback_store") +Signed-off-by: Minchan Kim +Cc: Sergey Senozhatsky +Cc: Colin Ian King +Cc: John Dias +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/zram/zram_drv.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -633,7 +633,7 @@ static ssize_t writeback_store(struct de + struct bio_vec bio_vec; + struct page *page; + ssize_t ret = len; +- int mode; ++ int mode, err; + unsigned long blk_idx = 0; + + if (sysfs_streq(buf, "idle")) +@@ -725,12 +725,17 @@ static ssize_t writeback_store(struct de + * XXX: A single page IO would be inefficient for write + * but it would be not bad as starter. + */ +- ret = submit_bio_wait(&bio); +- if (ret) { ++ err = submit_bio_wait(&bio); ++ if (err) { + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_slot_unlock(zram, index); ++ /* ++ * Return last IO error unless every IO were ++ * not suceeded. ++ */ ++ ret = err; + continue; + } +