Fixes for 5.10
author    Sasha Levin <sashal@kernel.org>  Mon, 6 Dec 2021 04:03:03 +0000 (23:03 -0500)
committer Sasha Levin <sashal@kernel.org>  Mon, 6 Dec 2021 04:03:03 +0000 (23:03 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.10/kvm-vmx-set-failure-code-in-prepare_vmcs02.patch [new file with mode: 0644]
queue-5.10/kvm-x86-pmu-fix-reserved-bits-for-amd-perfevtseln-re.patch [new file with mode: 0644]
queue-5.10/sched-uclamp-fix-rq-uclamp_max-not-set-on-first-enqu.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/x86-entry-add-a-fence-for-kernel-entry-swapgs-in-par.patch [new file with mode: 0644]
queue-5.10/x86-entry-use-the-correct-fence-macro-after-swapgs-i.patch [new file with mode: 0644]
queue-5.10/x86-pv-switch-swapgs-to-alternative.patch [new file with mode: 0644]
queue-5.10/x86-sev-fix-sev-es-ins-outs-instructions-for-word-dw.patch [new file with mode: 0644]
queue-5.10/x86-xen-add-xenpv_restore_regs_and_return_to_usermod.patch [new file with mode: 0644]

diff --git a/queue-5.10/kvm-vmx-set-failure-code-in-prepare_vmcs02.patch b/queue-5.10/kvm-vmx-set-failure-code-in-prepare_vmcs02.patch
new file mode 100644 (file)
index 0000000..8ea8f00
--- /dev/null
@@ -0,0 +1,41 @@
+From 7a256e68fa685bbbc887f3c77d10bd0c2f49004d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 30 Nov 2021 15:53:37 +0300
+Subject: KVM: VMX: Set failure code in prepare_vmcs02()
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+[ Upstream commit bfbb307c628676929c2d329da0daf9d22afa8ad2 ]
+
+The error paths in the prepare_vmcs02() function are supposed to set
+*entry_failure_code but this path does not.  It leads to using an
+uninitialized variable in the caller.
+
+Fixes: 71f7347025bf ("KVM: nVMX: Load GUEST_IA32_PERF_GLOBAL_CTRL MSR on VM-Entry")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Message-Id: <20211130125337.GB24578@kili>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 257ec2cbf69a4..36661b15c3d04 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2619,8 +2619,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+           WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+-                                   vmcs12->guest_ia32_perf_global_ctrl)))
++                                   vmcs12->guest_ia32_perf_global_ctrl))) {
++              *entry_failure_code = ENTRY_FAIL_DEFAULT;
+               return -EINVAL;
++      }
+       kvm_rsp_write(vcpu, vmcs12->guest_rsp);
+       kvm_rip_write(vcpu, vmcs12->guest_rip);
+-- 
+2.33.0
+
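
The bug class fixed above is an error path that returns without filling its out-parameter, leaving the caller to read an uninitialized variable. A minimal stand-alone C sketch of that pattern (hypothetical names, not the actual KVM code; the constant 7 merely stands in for ENTRY_FAIL_DEFAULT):

  #include <stdio.h>

  /* Broken shape: the early return skips the out-parameter write. */
  int prepare_broken(int fail, int *failure_code)
  {
          if (fail)
                  return -1;              /* *failure_code left uninitialized */
          return 0;
  }

  /* Fixed shape, mirroring the hunk above: set the code before returning. */
  int prepare_fixed(int fail, int *failure_code)
  {
          if (fail) {
                  *failure_code = 7;      /* stand-in for ENTRY_FAIL_DEFAULT */
                  return -1;
          }
          return 0;
  }

  int main(void)
  {
          int code;                       /* uninitialized, like the caller's variable */

          /* With prepare_broken() the read of 'code' below would be undefined. */
          if (prepare_fixed(1, &code))
                  printf("entry failed, code = %d\n", code);
          return 0;
  }
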
diff --git a/queue-5.10/kvm-x86-pmu-fix-reserved-bits-for-amd-perfevtseln-re.patch b/queue-5.10/kvm-x86-pmu-fix-reserved-bits-for-amd-perfevtseln-re.patch
new file mode 100644 (file)
index 0000000..242f620
--- /dev/null
@@ -0,0 +1,55 @@
+From 980b7c2fb5a6612ec1c2edea008f19b54b17e92f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Nov 2021 21:03:20 +0800
+Subject: KVM: x86/pmu: Fix reserved bits for AMD PerfEvtSeln register
+
+From: Like Xu <likexu@tencent.com>
+
+[ Upstream commit cb1d220da0faa5ca0deb93449aff953f0c2cce6d ]
+
+If we run the following perf command in an AMD Milan guest:
+
+  perf stat \
+  -e cpu/event=0x1d0/ \
+  -e cpu/event=0x1c7/ \
+  -e cpu/umask=0x1f,event=0x18e/ \
+  -e cpu/umask=0x7,event=0x18e/ \
+  -e cpu/umask=0x18,event=0x18e/ \
+  ./workload
+
+dmesg will report a #GP warning from an unchecked MSR access
+error on MSR_F15H_PERF_CTLx.
+
+This is because, according to APM (Revision: 4.03) Figure 13-7,
+bits [35:32] of the AMD PerfEvtSeln register are part of the
+event select encoding, which extends the EVENT_SELECT field
+from 8 bits to 12 bits.
+
+Opportunistically update pmu->reserved_bits for reserved bit 19.
+
+Reported-by: Jim Mattson <jmattson@google.com>
+Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM")
+Signed-off-by: Like Xu <likexu@tencent.com>
+Message-Id: <20211118130320.95997-1-likexu@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/pmu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
+index 035da07500e8b..5a5c165a30ed1 100644
+--- a/arch/x86/kvm/svm/pmu.c
++++ b/arch/x86/kvm/svm/pmu.c
+@@ -274,7 +274,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+               pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+       pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+-      pmu->reserved_bits = 0xffffffff00200000ull;
++      pmu->reserved_bits = 0xfffffff000280000ull;
+       pmu->version = 1;
+       /* not applicable to AMD; but clean them to prevent any fall out */
+       pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+-- 
+2.33.0
+
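
The one-line mask change above encodes two facts from the commit message: bits [35:32] stop being reserved (they are the extended EVENT_SELECT[11:8] field) and bit 19 becomes reserved. A small stand-alone sketch of that bit arithmetic, using the two constants from the hunk:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t old_mask = 0xffffffff00200000ull;      /* pre-patch reserved_bits */
          uint64_t new_mask = 0xfffffff000280000ull;      /* post-patch reserved_bits */

          /* Bits 35:32 cleared: now valid event-select bits, not reserved. */
          printf("no longer reserved: %#llx\n",
                 (unsigned long long)(old_mask & ~new_mask));   /* 0xf00000000 */

          /* Bit 19 set: newly treated as reserved. */
          printf("newly reserved:     %#llx\n",
                 (unsigned long long)(new_mask & ~old_mask));   /* 0x80000 */
          return 0;
  }
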
diff --git a/queue-5.10/sched-uclamp-fix-rq-uclamp_max-not-set-on-first-enqu.patch b/queue-5.10/sched-uclamp-fix-rq-uclamp_max-not-set-on-first-enqu.patch
new file mode 100644 (file)
index 0000000..cb1ceca
--- /dev/null
@@ -0,0 +1,66 @@
+From 6ede1aa40599f3abdcb4af51a07e4dcf7d058ba9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 2 Dec 2021 11:20:33 +0000
+Subject: sched/uclamp: Fix rq->uclamp_max not set on first enqueue
+
+From: Qais Yousef <qais.yousef@arm.com>
+
+[ Upstream commit 315c4f884800c45cb6bd8c90422fad554a8b9588 ]
+
+Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct
+uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to
+match the woken up task's uclamp_max when the rq is idle.
+
+The code was relying on rq->uclamp_max initialized to zero, so on first
+enqueue
+
+       static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+                                           enum uclamp_id clamp_id)
+       {
+               ...
+
+               if (uc_se->value > READ_ONCE(uc_rq->value))
+                       WRITE_ONCE(uc_rq->value, uc_se->value);
+       }
+
+was actually resetting it. But since commit d81ae8aac85c changed the
+default to 1024, this no longer works. And since rq->uclamp_flags is
+also initialized to 0, neither above code path nor uclamp_idle_reset()
+update the rq->uclamp_max on first wake up from idle.
+
+This is only visible from first wake up(s) until the first dequeue to
+idle after enabling the static key. And it only matters if the
+uclamp_max of this task is < 1024 since only then its uclamp_max will be
+effectively ignored.
+
+Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to
+ensure uclamp_idle_reset() is called which then will update the rq
+uclamp_max value as expected.
+
+Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq")
+Signed-off-by: Qais Yousef <qais.yousef@arm.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Valentin Schneider <Valentin.Schneider@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/core.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 304aad997da11..0a5f9fad45e4b 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1526,7 +1526,7 @@ static void __init init_uclamp_rq(struct rq *rq)
+               };
+       }
+-      rq->uclamp_flags = 0;
++      rq->uclamp_flags = UCLAMP_FLAG_IDLE;
+ }
+ static void __init init_uclamp(void)
+-- 
+2.33.0
+
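
A simplified model of the interaction the message describes (hedged sketch, not the real scheduler code): uclamp_idle_reset() only overwrites the rq clamp when UCLAMP_FLAG_IDLE is set, and the max-aggregation path only ever raises the value, so a freshly initialized rq with flags == 0 keeps uclamp_max at 1024 no matter what the first woken task requests:

  #include <stdio.h>

  #define UCLAMP_FLAG_IDLE        0x01
  #define SCHED_CAPACITY_SCALE    1024

  struct rq_model {
          unsigned int uclamp_max;
          unsigned int uclamp_flags;
  };

  static void uclamp_idle_reset(struct rq_model *rq, unsigned int task_clamp)
  {
          if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
                  return;                         /* no reset when the flag is clear */
          rq->uclamp_max = task_clamp;
  }

  static void uclamp_rq_inc_id(struct rq_model *rq, unsigned int task_clamp)
  {
          uclamp_idle_reset(rq, task_clamp);
          if (task_clamp > rq->uclamp_max)        /* max aggregation: only raises */
                  rq->uclamp_max = task_clamp;
  }

  int main(void)
  {
          /* Task with uclamp_max = 512 woken on a freshly initialized, idle rq. */
          struct rq_model broken = { SCHED_CAPACITY_SCALE, 0 };
          struct rq_model fixed  = { SCHED_CAPACITY_SCALE, UCLAMP_FLAG_IDLE };

          uclamp_rq_inc_id(&broken, 512);
          uclamp_rq_inc_id(&fixed, 512);

          printf("flags = 0:                rq uclamp_max = %u (task cap ignored)\n", broken.uclamp_max);
          printf("flags = UCLAMP_FLAG_IDLE: rq uclamp_max = %u\n", fixed.uclamp_max);
          return 0;
  }
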
diff --git a/queue-5.10/series b/queue-5.10/series
index 7aa8c8af0f380eb98374dc1a1b6ad250d843a936..95fbe65086f9cf094b3be058e542215e170df2fd 100644 (file)
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -100,3 +100,11 @@ atlantic-add-missing-dids-and-fix-115c.patch
 remove-half-duplex-mode-speed-capabilities.patch
 atlantic-fix-statistics-logic-for-production-hardware.patch
 atlantic-remove-warn-trace-message.patch
+kvm-x86-pmu-fix-reserved-bits-for-amd-perfevtseln-re.patch
+kvm-vmx-set-failure-code-in-prepare_vmcs02.patch
+x86-sev-fix-sev-es-ins-outs-instructions-for-word-dw.patch
+x86-entry-use-the-correct-fence-macro-after-swapgs-i.patch
+x86-xen-add-xenpv_restore_regs_and_return_to_usermod.patch
+sched-uclamp-fix-rq-uclamp_max-not-set-on-first-enqu.patch
+x86-pv-switch-swapgs-to-alternative.patch
+x86-entry-add-a-fence-for-kernel-entry-swapgs-in-par.patch
diff --git a/queue-5.10/x86-entry-add-a-fence-for-kernel-entry-swapgs-in-par.patch b/queue-5.10/x86-entry-add-a-fence-for-kernel-entry-swapgs-in-par.patch
new file mode 100644 (file)
index 0000000..786c64d
--- /dev/null
@@ -0,0 +1,80 @@
+From d5a0fcd335821d7ec7e27acf6ae625a3ae41ba7b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Nov 2021 18:11:21 +0800
+Subject: x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()
+
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+
+[ Upstream commit c07e45553da1808aa802e9f0ffa8108cfeaf7a17 ]
+
+Commit
+
+  18ec54fdd6d18 ("x86/speculation: Prepare entry code for Spectre v1 swapgs mitigations")
+
+added FENCE_SWAPGS_{KERNEL|USER}_ENTRY for conditional SWAPGS. In
+paranoid_entry(), it uses only FENCE_SWAPGS_KERNEL_ENTRY for both
+branches. This is because the fence is required for both cases since the
+CR3 write is conditional even when PTI is enabled.
+
+But
+
+  96b2371413e8f ("x86/entry/64: Switch CR3 before SWAPGS in paranoid entry")
+
+changed the order of SWAPGS and the CR3 write. And it missed the needed
+FENCE_SWAPGS_KERNEL_ENTRY for the user gsbase case.
+
+Add it back by changing the branches so that FENCE_SWAPGS_KERNEL_ENTRY
+can cover both branches.
+
+  [ bp: Massage, fix typos, remove obsolete comment while at it. ]
+
+Fixes: 96b2371413e8f ("x86/entry/64: Switch CR3 before SWAPGS in paranoid entry")
+Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Link: https://lkml.kernel.org/r/20211126101209.8613-2-jiangshanlai@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/entry/entry_64.S | 16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index 166554a109aeb..a24ce5905ab82 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -936,6 +936,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
+ .Lparanoid_entry_checkgs:
+       /* EBX = 1 -> kernel GSBASE active, no restore required */
+       movl    $1, %ebx
++
+       /*
+        * The kernel-enforced convention is a negative GSBASE indicates
+        * a kernel value. No SWAPGS needed on entry and exit.
+@@ -943,21 +944,14 @@ SYM_CODE_START_LOCAL(paranoid_entry)
+       movl    $MSR_GS_BASE, %ecx
+       rdmsr
+       testl   %edx, %edx
+-      jns     .Lparanoid_entry_swapgs
+-      ret
++      js      .Lparanoid_kernel_gsbase
+-.Lparanoid_entry_swapgs:
++      /* EBX = 0 -> SWAPGS required on exit */
++      xorl    %ebx, %ebx
+       swapgs
++.Lparanoid_kernel_gsbase:
+-      /*
+-       * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
+-       * unconditional CR3 write, even in the PTI case.  So do an lfence
+-       * to prevent GS speculation, regardless of whether PTI is enabled.
+-       */
+       FENCE_SWAPGS_KERNEL_ENTRY
+-
+-      /* EBX = 0 -> SWAPGS required on exit */
+-      xorl    %ebx, %ebx
+       ret
+ SYM_CODE_END(paranoid_entry)
+-- 
+2.33.0
+
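
The control-flow change above is easier to see side by side: before, the kernel-GSBASE branch returned without any fence; after, both branches fall through the same FENCE_SWAPGS_KERNEL_ENTRY. A hedged C rendering of the two shapes (stub functions stand in for the swapgs instruction and the LFENCE; the return value models EBX):

  #include <stdbool.h>
  #include <stdio.h>

  void swapgs(void) { }   /* stand-in for the SWAPGS instruction */
  void lfence(void) { }   /* stand-in for FENCE_SWAPGS_KERNEL_ENTRY */

  /* Pre-patch shape: the kernel-GSBASE path returns with no fence at all. */
  int paranoid_entry_old(bool kernel_gsbase)
  {
          if (kernel_gsbase)
                  return 1;       /* EBX = 1, but no fence on this path */
          swapgs();
          lfence();
          return 0;               /* EBX = 0 -> SWAPGS required on exit */
  }

  /* Post-patch shape: both paths reach the fence before returning. */
  int paranoid_entry_new(bool kernel_gsbase)
  {
          int ebx = 1;            /* EBX = 1 -> kernel GSBASE, no restore required */

          if (!kernel_gsbase) {
                  ebx = 0;        /* EBX = 0 -> SWAPGS required on exit */
                  swapgs();
          }
          lfence();               /* fence now covers both branches */
          return ebx;
  }

  int main(void)
  {
          printf("old: %d/%d  new: %d/%d\n",
                 paranoid_entry_old(true), paranoid_entry_old(false),
                 paranoid_entry_new(true), paranoid_entry_new(false));
          return 0;
  }
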
diff --git a/queue-5.10/x86-entry-use-the-correct-fence-macro-after-swapgs-i.patch b/queue-5.10/x86-entry-use-the-correct-fence-macro-after-swapgs-i.patch
new file mode 100644 (file)
index 0000000..9e2e2d1
--- /dev/null
@@ -0,0 +1,70 @@
+From e3b6d4da55bc8a47d47384b83d435c32eb21e1fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Nov 2021 18:11:22 +0800
+Subject: x86/entry: Use the correct fence macro after swapgs in kernel CR3
+
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+
+[ Upstream commit 1367afaa2ee90d1c956dfc224e199fcb3ff3f8cc ]
+
+The commit
+
+  c75890700455 ("x86/entry/64: Remove unneeded kernel CR3 switching")
+
+removed a CR3 write in the faulting path of load_gs_index().
+
+But the path's FENCE_SWAPGS_USER_ENTRY has no fence operation if PTI is
+enabled, see spectre_v1_select_mitigation().
+
+Rather, it depended on the serializing CR3 write of SWITCH_TO_KERNEL_CR3
+and since it got removed, add a FENCE_SWAPGS_KERNEL_ENTRY call to make
+sure speculation is blocked.
+
+ [ bp: Massage commit message and comment. ]
+
+Fixes: c75890700455 ("x86/entry/64: Remove unneeded kernel CR3 switching")
+Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20211126101209.8613-3-jiangshanlai@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/entry/entry_64.S | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index f18f3932e971a..a806d68b96990 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1035,11 +1035,6 @@ SYM_CODE_START_LOCAL(error_entry)
+       pushq   %r12
+       ret
+-.Lerror_entry_done_lfence:
+-      FENCE_SWAPGS_KERNEL_ENTRY
+-.Lerror_entry_done:
+-      ret
+-
+       /*
+        * There are two places in the kernel that can potentially fault with
+        * usergs. Handle them here.  B stepping K8s sometimes report a
+@@ -1062,8 +1057,14 @@ SYM_CODE_START_LOCAL(error_entry)
+        * .Lgs_change's error handler with kernel gsbase.
+        */
+       SWAPGS
+-      FENCE_SWAPGS_USER_ENTRY
+-      jmp .Lerror_entry_done
++
++      /*
++       * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
++       * kernel or user gsbase.
++       */
++.Lerror_entry_done_lfence:
++      FENCE_SWAPGS_KERNEL_ENTRY
++      ret
+ .Lbstep_iret:
+       /* Fix truncated RIP */
+-- 
+2.33.0
+
diff --git a/queue-5.10/x86-pv-switch-swapgs-to-alternative.patch b/queue-5.10/x86-pv-switch-swapgs-to-alternative.patch
new file mode 100644 (file)
index 0000000..ada5aa0
--- /dev/null
@@ -0,0 +1,237 @@
+From 2ef3d718e2c4ab4483e0b62e56fcc3bb83fa400f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Jan 2021 14:55:44 +0100
+Subject: x86/pv: Switch SWAPGS to ALTERNATIVE
+
+From: Juergen Gross <jgross@suse.com>
+
+[ Upstream commit 53c9d9240944088274aadbbbafc6138ca462db4f ]
+
+SWAPGS is used only for interrupts coming from user mode or for
+returning to user mode. So there is no reason to use the PARAVIRT
+framework, as it can easily be replaced by an ALTERNATIVE depending
+on X86_FEATURE_XENPV.
+
+There are several instances using the PV-aware SWAPGS macro in paths
+which are never executed in a Xen PV guest. Replace those with the
+plain swapgs instruction. For SWAPGS_UNSAFE_STACK the same applies.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Andy Lutomirski <luto@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20210120135555.32594-5-jgross@suse.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/entry/entry_64.S             | 10 +++++-----
+ arch/x86/include/asm/irqflags.h       | 20 ++++++++------------
+ arch/x86/include/asm/paravirt.h       | 20 --------------------
+ arch/x86/include/asm/paravirt_types.h |  2 --
+ arch/x86/kernel/asm-offsets_64.c      |  1 -
+ arch/x86/kernel/paravirt.c            |  1 -
+ arch/x86/kernel/paravirt_patch.c      |  3 ---
+ arch/x86/xen/enlighten_pv.c           |  3 ---
+ 8 files changed, 13 insertions(+), 47 deletions(-)
+
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index de541ea2788eb..166554a109aeb 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -673,7 +673,7 @@ native_irq_return_ldt:
+        */
+       pushq   %rdi                            /* Stash user RDI */
+-      SWAPGS                                  /* to kernel GS */
++      swapgs                                  /* to kernel GS */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi   /* to kernel CR3 */
+       movq    PER_CPU_VAR(espfix_waddr), %rdi
+@@ -703,7 +703,7 @@ native_irq_return_ldt:
+       orq     PER_CPU_VAR(espfix_stack), %rax
+       SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+-      SWAPGS                                  /* to user GS */
++      swapgs                                  /* to user GS */
+       popq    %rdi                            /* Restore user RDI */
+       movq    %rax, %rsp
+@@ -947,7 +947,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
+       ret
+ .Lparanoid_entry_swapgs:
+-      SWAPGS
++      swapgs
+       /*
+        * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
+@@ -1005,7 +1005,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
+       jnz             restore_regs_and_return_to_kernel
+       /* We are returning to a context with user GSBASE */
+-      SWAPGS_UNSAFE_STACK
++      swapgs
+       jmp             restore_regs_and_return_to_kernel
+ SYM_CODE_END(paranoid_exit)
+@@ -1431,7 +1431,7 @@ nmi_no_fsgsbase:
+       jnz     nmi_restore
+ nmi_swapgs:
+-      SWAPGS_UNSAFE_STACK
++      swapgs
+ nmi_restore:
+       POP_REGS
+diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
+index 2dfc8d380dab1..8c86edefa1150 100644
+--- a/arch/x86/include/asm/irqflags.h
++++ b/arch/x86/include/asm/irqflags.h
+@@ -131,18 +131,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
+ #define SAVE_FLAGS(x)         pushfq; popq %rax
+ #endif
+-#define SWAPGS        swapgs
+-/*
+- * Currently paravirt can't handle swapgs nicely when we
+- * don't have a stack we can rely on (such as a user space
+- * stack).  So we either find a way around these or just fault
+- * and emulate if a guest tries to call swapgs directly.
+- *
+- * Either way, this is a good way to document that we don't
+- * have a reliable stack. x86_64 only.
+- */
+-#define SWAPGS_UNSAFE_STACK   swapgs
+-
+ #define INTERRUPT_RETURN      jmp native_iret
+ #define USERGS_SYSRET64                               \
+       swapgs;                                 \
+@@ -170,6 +158,14 @@ static __always_inline int arch_irqs_disabled(void)
+       return arch_irqs_disabled_flags(flags);
+ }
++#else
++#ifdef CONFIG_X86_64
++#ifdef CONFIG_XEN_PV
++#define SWAPGS        ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
++#else
++#define SWAPGS        swapgs
++#endif
++#endif
+ #endif /* !__ASSEMBLY__ */
+ #endif
+diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
+index d25cc6830e895..5647bcdba776e 100644
+--- a/arch/x86/include/asm/paravirt.h
++++ b/arch/x86/include/asm/paravirt.h
+@@ -776,26 +776,6 @@ extern void default_banner(void);
+ #ifdef CONFIG_X86_64
+ #ifdef CONFIG_PARAVIRT_XXL
+-/*
+- * If swapgs is used while the userspace stack is still current,
+- * there's no way to call a pvop.  The PV replacement *must* be
+- * inlined, or the swapgs instruction must be trapped and emulated.
+- */
+-#define SWAPGS_UNSAFE_STACK                                           \
+-      PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
+-
+-/*
+- * Note: swapgs is very special, and in practise is either going to be
+- * implemented with a single "swapgs" instruction or something very
+- * special.  Either way, we don't need to save any registers for
+- * it.
+- */
+-#define SWAPGS                                                                \
+-      PARA_SITE(PARA_PATCH(PV_CPU_swapgs),                            \
+-                ANNOTATE_RETPOLINE_SAFE;                              \
+-                call PARA_INDIRECT(pv_ops+PV_CPU_swapgs);             \
+-               )
+-
+ #define USERGS_SYSRET64                                                       \
+       PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),                   \
+                 ANNOTATE_RETPOLINE_SAFE;                              \
+diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
+index 0fad9f61c76ab..903d71884fa25 100644
+--- a/arch/x86/include/asm/paravirt_types.h
++++ b/arch/x86/include/asm/paravirt_types.h
+@@ -169,8 +169,6 @@ struct pv_cpu_ops {
+          frame set up. */
+       void (*iret)(void);
+-      void (*swapgs)(void);
+-
+       void (*start_context_switch)(struct task_struct *prev);
+       void (*end_context_switch)(struct task_struct *next);
+ #endif
+diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
+index 828be792231e9..1354bc30614d7 100644
+--- a/arch/x86/kernel/asm-offsets_64.c
++++ b/arch/x86/kernel/asm-offsets_64.c
+@@ -15,7 +15,6 @@ int main(void)
+ #ifdef CONFIG_PARAVIRT_XXL
+       OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
+              cpu.usergs_sysret64);
+-      OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
+ #ifdef CONFIG_DEBUG_ENTRY
+       OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
+ #endif
+diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
+index 6c3407ba6ee98..5e5fcf5c376de 100644
+--- a/arch/x86/kernel/paravirt.c
++++ b/arch/x86/kernel/paravirt.c
+@@ -312,7 +312,6 @@ struct paravirt_patch_template pv_ops = {
+       .cpu.usergs_sysret64    = native_usergs_sysret64,
+       .cpu.iret               = native_iret,
+-      .cpu.swapgs             = native_swapgs,
+ #ifdef CONFIG_X86_IOPL_IOPERM
+       .cpu.invalidate_io_bitmap       = native_tss_invalidate_io_bitmap,
+diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
+index ace6e334cb393..7c518b08aa3c5 100644
+--- a/arch/x86/kernel/paravirt_patch.c
++++ b/arch/x86/kernel/paravirt_patch.c
+@@ -28,7 +28,6 @@ struct patch_xxl {
+       const unsigned char     irq_restore_fl[2];
+       const unsigned char     cpu_wbinvd[2];
+       const unsigned char     cpu_usergs_sysret64[6];
+-      const unsigned char     cpu_swapgs[3];
+       const unsigned char     mov64[3];
+ };
+@@ -43,7 +42,6 @@ static const struct patch_xxl patch_data_xxl = {
+       .cpu_wbinvd             = { 0x0f, 0x09 },       // wbinvd
+       .cpu_usergs_sysret64    = { 0x0f, 0x01, 0xf8,
+                                   0x48, 0x0f, 0x07 }, // swapgs; sysretq
+-      .cpu_swapgs             = { 0x0f, 0x01, 0xf8 }, // swapgs
+       .mov64                  = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
+ };
+@@ -86,7 +84,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
+       PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
+       PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
+-      PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
+       PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
+ #endif
+diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
+index 5af0421ef74ba..16ff25d6935e7 100644
+--- a/arch/x86/xen/enlighten_pv.c
++++ b/arch/x86/xen/enlighten_pv.c
+@@ -1083,9 +1083,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
+ #endif
+       .io_delay = xen_io_delay,
+-      /* Xen takes care of %gs when switching to usermode for us */
+-      .swapgs = paravirt_nop,
+-
+       .start_context_switch = paravirt_start_context_switch,
+       .end_context_switch = xen_end_context_switch,
+ };
+-- 
+2.33.0
+
diff --git a/queue-5.10/x86-sev-fix-sev-es-ins-outs-instructions-for-word-dw.patch b/queue-5.10/x86-sev-fix-sev-es-ins-outs-instructions-for-word-dw.patch
new file mode 100644 (file)
index 0000000..e0c02ee
--- /dev/null
@@ -0,0 +1,157 @@
+From f384c2b7d5113952e57e614a17b06cabcb21f8fd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Nov 2021 15:27:57 -0800
+Subject: x86/sev: Fix SEV-ES INS/OUTS instructions for word, dword, and qword
+
+From: Michael Sterritt <sterritt@google.com>
+
+[ Upstream commit 1d5379d0475419085d3575bd9155f2e558e96390 ]
+
+Properly type the operands being passed to __put_user()/__get_user().
+Otherwise, these routines truncate data for dependent instructions
+(e.g., INSW) and only read/write one byte.
+
+This has been tested by sending a string with REP OUTSW to a port and
+then reading it back in with REP INSW on the same port.
+
+Previous behavior was to only send and receive the first char of the
+size. For example, word operations for "abcd" would only read/write
+"ac". With change, the full string is now written and read back.
+
+Fixes: f980f9c31a923 ("x86/sev-es: Compile early handler code into kernel image")
+Signed-off-by: Michael Sterritt <sterritt@google.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Marc Orr <marcorr@google.com>
+Reviewed-by: Peter Gonda <pgonda@google.com>
+Reviewed-by: Joerg Roedel <jroedel@suse.de>
+Link: https://lkml.kernel.org/r/20211119232757.176201-1-sterritt@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/sev-es.c | 57 +++++++++++++++++++++++++++-------------
+ 1 file changed, 39 insertions(+), 18 deletions(-)
+
+diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
+index 865e234ea24bd..c222fab112cbd 100644
+--- a/arch/x86/kernel/sev-es.c
++++ b/arch/x86/kernel/sev-es.c
+@@ -260,11 +260,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+                                  char *dst, char *buf, size_t size)
+ {
+       unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+-      char __user *target = (char __user *)dst;
+-      u64 d8;
+-      u32 d4;
+-      u16 d2;
+-      u8  d1;
+       /*
+        * This function uses __put_user() independent of whether kernel or user
+@@ -286,26 +281,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+        * instructions here would cause infinite nesting.
+        */
+       switch (size) {
+-      case 1:
++      case 1: {
++              u8 d1;
++              u8 __user *target = (u8 __user *)dst;
++
+               memcpy(&d1, buf, 1);
+               if (__put_user(d1, target))
+                       goto fault;
+               break;
+-      case 2:
++      }
++      case 2: {
++              u16 d2;
++              u16 __user *target = (u16 __user *)dst;
++
+               memcpy(&d2, buf, 2);
+               if (__put_user(d2, target))
+                       goto fault;
+               break;
+-      case 4:
++      }
++      case 4: {
++              u32 d4;
++              u32 __user *target = (u32 __user *)dst;
++
+               memcpy(&d4, buf, 4);
+               if (__put_user(d4, target))
+                       goto fault;
+               break;
+-      case 8:
++      }
++      case 8: {
++              u64 d8;
++              u64 __user *target = (u64 __user *)dst;
++
+               memcpy(&d8, buf, 8);
+               if (__put_user(d8, target))
+                       goto fault;
+               break;
++      }
+       default:
+               WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+               return ES_UNSUPPORTED;
+@@ -328,11 +339,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+                                 char *src, char *buf, size_t size)
+ {
+       unsigned long error_code = X86_PF_PROT;
+-      char __user *s = (char __user *)src;
+-      u64 d8;
+-      u32 d4;
+-      u16 d2;
+-      u8  d1;
+       /*
+        * This function uses __get_user() independent of whether kernel or user
+@@ -354,26 +360,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+        * instructions here would cause infinite nesting.
+        */
+       switch (size) {
+-      case 1:
++      case 1: {
++              u8 d1;
++              u8 __user *s = (u8 __user *)src;
++
+               if (__get_user(d1, s))
+                       goto fault;
+               memcpy(buf, &d1, 1);
+               break;
+-      case 2:
++      }
++      case 2: {
++              u16 d2;
++              u16 __user *s = (u16 __user *)src;
++
+               if (__get_user(d2, s))
+                       goto fault;
+               memcpy(buf, &d2, 2);
+               break;
+-      case 4:
++      }
++      case 4: {
++              u32 d4;
++              u32 __user *s = (u32 __user *)src;
++
+               if (__get_user(d4, s))
+                       goto fault;
+               memcpy(buf, &d4, 4);
+               break;
+-      case 8:
++      }
++      case 8: {
++              u64 d8;
++              u64 __user *s = (u64 __user *)src;
+               if (__get_user(d8, s))
+                       goto fault;
+               memcpy(buf, &d8, 8);
+               break;
++      }
+       default:
+               WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+               return ES_UNSUPPORTED;
+-- 
+2.33.0
+
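
The underlying pitfall is that __put_user()/__get_user() derive the access width from the pointer's pointee type, not from the value variable, so the old char __user * casts reduced every access to a single byte. A hedged user-space analogue of that size inference (a toy macro, not the kernel's uaccess implementation; assumes a little-endian host):

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  /* Toy model of __put_user(): the store width is sizeof(*(ptr)). */
  #define put_user_model(val, ptr)                        \
  ({                                                      \
          __typeof__(*(ptr)) __v = (val);                 \
          memcpy((ptr), &__v, sizeof(*(ptr)));            \
          0;                                              \
  })

  int main(void)
  {
          uint8_t dst[2];
          uint16_t d2 = 0x6261;                   /* the two bytes 'a' and 'b' */

          memset(dst, 0, sizeof(dst));
          put_user_model(d2, (char *)dst);        /* old typing: only 1 byte lands */
          printf("char * target:     %02x %02x\n", dst[0], dst[1]);       /* 61 00 */

          memset(dst, 0, sizeof(dst));
          put_user_model(d2, (uint16_t *)dst);    /* fixed typing: full word lands */
          printf("uint16_t * target: %02x %02x\n", dst[0], dst[1]);       /* 61 62 */
          return 0;
  }
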
diff --git a/queue-5.10/x86-xen-add-xenpv_restore_regs_and_return_to_usermod.patch b/queue-5.10/x86-xen-add-xenpv_restore_regs_and_return_to_usermod.patch
new file mode 100644 (file)
index 0000000..91d5dc3
--- /dev/null
@@ -0,0 +1,95 @@
+From 93f5997a2d59d8e4f299ad3143cc57fa9ec7bafb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Nov 2021 18:11:23 +0800
+Subject: x86/xen: Add xenpv_restore_regs_and_return_to_usermode()
+
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+
+[ Upstream commit 5c8f6a2e316efebb3ba93d8c1af258155dcf5632 ]
+
+In the native case, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is the
+trampoline stack. But XEN pv doesn't use trampoline stack, so
+PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is also the kernel stack.
+
+In that case, source and destination stacks are identical, which means
+that reusing swapgs_restore_regs_and_return_to_usermode() in XEN pv
+would cause %rsp to move up to the top of the kernel stack and leave the
+IRET frame below %rsp.
+
+This is dangerous as it can be corrupted if #NMI / #MC hit as either of
+these events occurring in the middle of the stack pushing would clobber
+data on the (original) stack.
+
+And, with XEN pv, swapgs_restore_regs_and_return_to_usermode() pushing
+the IRET frame onto the original address is useless and error-prone
+when there is any future attempt to modify the code.
+
+ [ bp: Massage commit message. ]
+
+Fixes: 7f2590a110b8 ("x86/entry/64: Use a per-CPU trampoline stack for IDT entries")
+Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Link: https://lkml.kernel.org/r/20211126101209.8613-4-jiangshanlai@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/entry/entry_64.S |  4 ++++
+ arch/x86/xen/xen-asm.S    | 20 ++++++++++++++++++++
+ 2 files changed, 24 insertions(+)
+
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index a806d68b96990..de541ea2788eb 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -575,6 +575,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
+       ud2
+ 1:
+ #endif
++#ifdef CONFIG_XEN_PV
++      ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
++#endif
++
+       POP_REGS pop_rdi=0
+       /*
+diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
+index 53cf8aa35032d..011ec649f3886 100644
+--- a/arch/x86/xen/xen-asm.S
++++ b/arch/x86/xen/xen-asm.S
+@@ -19,6 +19,7 @@
+ #include <linux/init.h>
+ #include <linux/linkage.h>
++#include <../entry/calling.h>
+ /*
+  * Enable events.  This clears the event mask and tests the pending
+@@ -235,6 +236,25 @@ SYM_CODE_START(xen_sysret64)
+       jmp hypercall_iret
+ SYM_CODE_END(xen_sysret64)
++/*
++ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
++ * also the kernel stack.  Reusing swapgs_restore_regs_and_return_to_usermode()
++ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
++ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
++ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
++ * frame at the same address is useless.
++ */
++SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
++      UNWIND_HINT_REGS
++      POP_REGS
++
++      /* stackleak_erase() can work safely on the kernel stack. */
++      STACKLEAK_ERASE_NOCLOBBER
++
++      addq    $8, %rsp        /* skip regs->orig_ax */
++      jmp xen_iret
++SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
++
+ /*
+  * Xen handles syscall callbacks much like ordinary exceptions, which
+  * means we have:
+-- 
+2.33.0
+