--- /dev/null
+From a908d487709c2741c09a92a9d766774e936aafd8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 17 Jun 2023 08:32:42 +0100
+Subject: irqchip/gic-v4.1: Properly lock VPEs when doing a directLPI
+ invalidation
+
+From: Marc Zyngier <maz@kernel.org>
+
+[ Upstream commit 926846a703cbf5d0635cc06e67d34b228746554b ]
+
+We normally rely on the irq_to_cpuid_[un]lock() primitives to make
+sure nothing will change col->idx while performing a LPI invalidation.
+
+However, these primitives do not cover VPE doorbells, and we have
+some open-coded locking for that. Unfortunately, this locking is
+pretty bogus.
+
+Instead, extend the above primitives to cover VPE doorbells and
+convert the whole thing to it.
+
+Fixes: f3a059219bc7 ("irqchip/gic-v4.1: Ensure mutual exclusion between vPE affinity change and RD access")
+Reported-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: Zenghui Yu <yuzenghui@huawei.com>
+Cc: wanghaibin.wang@huawei.com
+Tested-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
+Link: https://lore.kernel.org/r/20230617073242.3199746-1-maz@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/irqchip/irq-gic-v3-its.c | 75 ++++++++++++++++++++------------
+ 1 file changed, 46 insertions(+), 29 deletions(-)
+
+diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
+index 59a5d06b2d3e4..490e6cfe510e6 100644
+--- a/drivers/irqchip/irq-gic-v3-its.c
++++ b/drivers/irqchip/irq-gic-v3-its.c
+@@ -267,13 +267,23 @@ static void vpe_to_cpuid_unlock(struct its_vpe *vpe, unsigned long flags)
+ raw_spin_unlock_irqrestore(&vpe->vpe_lock, flags);
+ }
+
++static struct irq_chip its_vpe_irq_chip;
++
+ static int irq_to_cpuid_lock(struct irq_data *d, unsigned long *flags)
+ {
+- struct its_vlpi_map *map = get_vlpi_map(d);
++ struct its_vpe *vpe = NULL;
+ int cpu;
+
+- if (map) {
+- cpu = vpe_to_cpuid_lock(map->vpe, flags);
++ if (d->chip == &its_vpe_irq_chip) {
++ vpe = irq_data_get_irq_chip_data(d);
++ } else {
++ struct its_vlpi_map *map = get_vlpi_map(d);
++ if (map)
++ vpe = map->vpe;
++ }
++
++ if (vpe) {
++ cpu = vpe_to_cpuid_lock(vpe, flags);
+ } else {
+ /* Physical LPIs are already locked via the irq_desc lock */
+ struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+@@ -287,10 +297,18 @@ static int irq_to_cpuid_lock(struct irq_data *d, unsigned long *flags)
+
+ static void irq_to_cpuid_unlock(struct irq_data *d, unsigned long flags)
+ {
+- struct its_vlpi_map *map = get_vlpi_map(d);
++ struct its_vpe *vpe = NULL;
++
++ if (d->chip == &its_vpe_irq_chip) {
++ vpe = irq_data_get_irq_chip_data(d);
++ } else {
++ struct its_vlpi_map *map = get_vlpi_map(d);
++ if (map)
++ vpe = map->vpe;
++ }
+
+- if (map)
+- vpe_to_cpuid_unlock(map->vpe, flags);
++ if (vpe)
++ vpe_to_cpuid_unlock(vpe, flags);
+ }
+
+ static struct its_collection *valid_col(struct its_collection *col)
+@@ -1427,14 +1445,29 @@ static void wait_for_syncr(void __iomem *rdbase)
+ cpu_relax();
+ }
+
+-static void direct_lpi_inv(struct irq_data *d)
++static void __direct_lpi_inv(struct irq_data *d, u64 val)
+ {
+- struct its_vlpi_map *map = get_vlpi_map(d);
+ void __iomem *rdbase;
+ unsigned long flags;
+- u64 val;
+ int cpu;
+
++ /* Target the redistributor this LPI is currently routed to */
++ cpu = irq_to_cpuid_lock(d, &flags);
++ raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock);
++
++ rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base;
++ gic_write_lpir(val, rdbase + GICR_INVLPIR);
++ wait_for_syncr(rdbase);
++
++ raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock);
++ irq_to_cpuid_unlock(d, flags);
++}
++
++static void direct_lpi_inv(struct irq_data *d)
++{
++ struct its_vlpi_map *map = get_vlpi_map(d);
++ u64 val;
++
+ if (map) {
+ struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+
+@@ -1447,15 +1480,7 @@ static void direct_lpi_inv(struct irq_data *d)
+ val = d->hwirq;
+ }
+
+- /* Target the redistributor this LPI is currently routed to */
+- cpu = irq_to_cpuid_lock(d, &flags);
+- raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock);
+- rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base;
+- gic_write_lpir(val, rdbase + GICR_INVLPIR);
+-
+- wait_for_syncr(rdbase);
+- raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock);
+- irq_to_cpuid_unlock(d, flags);
++ __direct_lpi_inv(d, val);
+ }
+
+ static void lpi_update_config(struct irq_data *d, u8 clr, u8 set)
+@@ -3936,18 +3961,10 @@ static void its_vpe_send_inv(struct irq_data *d)
+ {
+ struct its_vpe *vpe = irq_data_get_irq_chip_data(d);
+
+- if (gic_rdists->has_direct_lpi) {
+- void __iomem *rdbase;
+-
+- /* Target the redistributor this VPE is currently known on */
+- raw_spin_lock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock);
+- rdbase = per_cpu_ptr(gic_rdists->rdist, vpe->col_idx)->rd_base;
+- gic_write_lpir(d->parent_data->hwirq, rdbase + GICR_INVLPIR);
+- wait_for_syncr(rdbase);
+- raw_spin_unlock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock);
+- } else {
++ if (gic_rdists->has_direct_lpi)
++ __direct_lpi_inv(d, d->parent_data->hwirq);
++ else
+ its_vpe_send_cmd(vpe, its_send_inv);
+- }
+ }
+
+ static void its_vpe_mask_irq(struct irq_data *d)
+--
+2.40.1
+
--- /dev/null
+From 8ed6ebc265f28fa856de71c7f16be6e0795b744e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Jun 2023 13:30:35 -0700
+Subject: KVM: x86: Disallow KVM_SET_SREGS{2} if incoming CR0 is invalid
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 26a0652cb453c72f6aab0974bc4939e9b14f886b ]
+
+Reject KVM_SET_SREGS{2} with -EINVAL if the incoming CR0 is invalid,
+e.g. due to setting bits 63:32, illegal combinations, or to a value that
+isn't allowed in VMX (non-)root mode. The VMX checks in particular are
+"fun" as failure to disallow Real Mode for an L2 that is configured with
+unrestricted guest disabled, when KVM itself has unrestricted guest
+enabled, will result in KVM forcing VM86 mode to virtualize Real Mode for
+L2, but then failing to unwind the related metadata when synthesizing a
+nested VM-Exit back to L1 (which has unrestricted guest enabled).
+
+Opportunistically fix a benign typo in the prototype for is_valid_cr4().
+
+Cc: stable@vger.kernel.org
+Reported-by: syzbot+5feef0b9ee9c8e9e5689@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/all/000000000000f316b705fdf6e2b4@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20230613203037.1968489-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 +
+ arch/x86/include/asm/kvm_host.h | 3 ++-
+ arch/x86/kvm/svm/svm.c | 6 ++++++
+ arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++++++------
+ arch/x86/kvm/x86.c | 34 +++++++++++++++++++-----------
+ 5 files changed, 52 insertions(+), 20 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index 23ea8a25cbbeb..4bdcb91478a51 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -34,6 +34,7 @@ KVM_X86_OP(get_segment)
+ KVM_X86_OP(get_cpl)
+ KVM_X86_OP(set_segment)
+ KVM_X86_OP_NULL(get_cs_db_l_bits)
++KVM_X86_OP(is_valid_cr0)
+ KVM_X86_OP(set_cr0)
+ KVM_X86_OP(is_valid_cr4)
+ KVM_X86_OP(set_cr4)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 9e800d4d323c6..08cfc26ee7c67 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1333,8 +1333,9 @@ struct kvm_x86_ops {
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
++ bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+- bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
++ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 0611dac70c25c..302a4669c5a15 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1734,6 +1734,11 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ }
+
++static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
++{
++ return true;
++}
++
+ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+@@ -4596,6 +4601,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
++ .is_valid_cr0 = svm_is_valid_cr0,
+ .set_cr0 = svm_set_cr0,
+ .is_valid_cr4 = svm_is_valid_cr4,
+ .set_cr4 = svm_set_cr4,
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0841f9a34d1c2..89744ee06101a 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2894,6 +2894,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
++ /*
++ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
++ * as using VM86 is unnecessary if unrestricted guest is enabled, and
++ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
++ * should VM-Fail and KVM should reject userspace attempts to stuff
++ * CR0.PG=0 when L2 is active.
++ */
++ WARN_ON_ONCE(is_guest_mode(vcpu));
++
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+@@ -3084,6 +3093,17 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+ #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
+
++static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
++{
++ if (is_guest_mode(vcpu))
++ return nested_guest_cr0_valid(vcpu, cr0);
++
++ if (to_vmx(vcpu)->nested.vmxon)
++ return nested_host_cr0_valid(vcpu, cr0);
++
++ return true;
++}
++
+ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -5027,18 +5047,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+ val = (val & ~vmcs12->cr0_guest_host_mask) |
+ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+- if (!nested_guest_cr0_valid(vcpu, val))
+- return 1;
+-
+ if (kvm_set_cr0(vcpu, val))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, orig_val);
+ return 0;
+ } else {
+- if (to_vmx(vcpu)->nested.vmxon &&
+- !nested_host_cr0_valid(vcpu, val))
+- return 1;
+-
+ return kvm_set_cr0(vcpu, val);
+ }
+ }
+@@ -7744,6 +7757,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
++ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 7e1e3bc745622..285ba12be8ce3 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -876,6 +876,22 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+ }
+ EXPORT_SYMBOL_GPL(load_pdptrs);
+
++static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
++{
++#ifdef CONFIG_X86_64
++ if (cr0 & 0xffffffff00000000UL)
++ return false;
++#endif
++
++ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
++ return false;
++
++ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
++ return false;
++
++ return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
++}
++
+ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+ {
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+@@ -898,20 +914,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
+
+- cr0 |= X86_CR0_ET;
+-
+-#ifdef CONFIG_X86_64
+- if (cr0 & 0xffffffff00000000UL)
++ if (!kvm_is_valid_cr0(vcpu, cr0))
+ return 1;
+-#endif
+-
+- cr0 &= ~CR0_RESERVED_BITS;
+
+- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+- return 1;
++ cr0 |= X86_CR0_ET;
+
+- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+- return 1;
++ /* Write to CR0 reserved bits are ignored, even on Intel. */
++ cr0 &= ~CR0_RESERVED_BITS;
+
+ #ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+@@ -10643,7 +10652,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+ return false;
+ }
+
+- return kvm_is_valid_cr4(vcpu, sregs->cr4);
++ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
++ kvm_is_valid_cr0(vcpu, sregs->cr0);
+ }
+
+ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+--
+2.40.1
+
--- /dev/null
+From bd702e237b948c07f454a125338b2813b785f3d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Jul 2023 16:19:09 +0200
+Subject: locking/rtmutex: Fix task->pi_waiters integrity
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit f7853c34241807bb97673a5e97719123be39a09e ]
+
+Henry reported that rt_mutex_adjust_prio_chain() has an ordering
+problem and puts the lie to the comment in [7]. Sharing the sort key
+between lock->waiters and owner->pi_waiters *does* create problems,
+since unlike what the comment claims, holding [L] is insufficient.
+
+Notably, consider:
+
+ A
+ / \
+ M1 M2
+ | |
+ B C
+
+That is, task A owns both M1 and M2, B and C block on them. In this
+case a concurrent chain walk (B & C) will modify their resp. sort keys
+in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L]
+is meaningless, they're different Ls.
+
+This then gives rise to a race condition between [7] and [11], where
+the requeue of pi_waiters will observe an inconsistent tree order.
+
+ B C
+
+ (holds M1->wait_lock, (holds M2->wait_lock,
+ holds B->pi_lock) holds A->pi_lock)
+
+ [7]
+ waiter_update_prio();
+ ...
+ [8]
+ raw_spin_unlock(B->pi_lock);
+ ...
+ [10]
+ raw_spin_lock(A->pi_lock);
+
+ [11]
+ rt_mutex_enqueue_pi();
+ // observes inconsistent A->pi_waiters
+ // tree order
+
+Fixing this means either extending the range of the owner lock from
+[10-13] to [6-13], with the immediate problem that this means [6-8]
+hold both blocked and owner locks, or duplicating the sort key.
+
+Since the locking in chain walk is horrible enough without having to
+consider pi_lock nesting rules, duplicate the sort key instead.
+
+By giving each tree their own sort key, the above race becomes
+harmless, if C sees B at the old location, then B will correct things
+(if they need correcting) when it walks up the chain and reaches A.
+
+Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree")
+Reported-by: Henry Wu <triangletrap12@gmail.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Henry Wu <triangletrap12@gmail.com>
+Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/locking/rtmutex.c | 170 +++++++++++++++++++++-----------
+ kernel/locking/rtmutex_api.c | 2 +-
+ kernel/locking/rtmutex_common.h | 47 ++++++---
+ kernel/locking/ww_mutex.h | 12 +--
+ 4 files changed, 155 insertions(+), 76 deletions(-)
+
+diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
+index b7fa3ee3aa1de..ee5be1dda0c40 100644
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -331,21 +331,43 @@ static __always_inline int __waiter_prio(struct task_struct *task)
+ return prio;
+ }
+
++/*
++ * Update the waiter->tree copy of the sort keys.
++ */
+ static __always_inline void
+ waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+ {
+- waiter->prio = __waiter_prio(task);
+- waiter->deadline = task->dl.deadline;
++ lockdep_assert_held(&waiter->lock->wait_lock);
++ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
++
++ waiter->tree.prio = __waiter_prio(task);
++ waiter->tree.deadline = task->dl.deadline;
++}
++
++/*
++ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
++ */
++static __always_inline void
++waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
++{
++ lockdep_assert_held(&waiter->lock->wait_lock);
++ lockdep_assert_held(&task->pi_lock);
++ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
++
++ waiter->pi_tree.prio = waiter->tree.prio;
++ waiter->pi_tree.deadline = waiter->tree.deadline;
+ }
+
+ /*
+- * Only use with rt_mutex_waiter_{less,equal}()
++ * Only use with rt_waiter_node_{less,equal}()
+ */
++#define task_to_waiter_node(p) \
++ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+ #define task_to_waiter(p) \
+- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
++ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
+
+-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+- struct rt_mutex_waiter *right)
++static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
++ struct rt_waiter_node *right)
+ {
+ if (left->prio < right->prio)
+ return 1;
+@@ -362,8 +384,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+ return 0;
+ }
+
+-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+- struct rt_mutex_waiter *right)
++static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
++ struct rt_waiter_node *right)
+ {
+ if (left->prio != right->prio)
+ return 0;
+@@ -383,7 +405,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
+ {
+- if (rt_mutex_waiter_less(waiter, top_waiter))
++ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
+ return true;
+
+ #ifdef RT_MUTEX_BUILD_SPINLOCKS
+@@ -391,30 +413,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+- if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
++ if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+ return false;
+
+- return rt_mutex_waiter_equal(waiter, top_waiter);
++ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+ #else
+ return false;
+ #endif
+ }
+
+ #define __node_2_waiter(node) \
+- rb_entry((node), struct rt_mutex_waiter, tree_entry)
++ rb_entry((node), struct rt_mutex_waiter, tree.entry)
+
+ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+- if (rt_mutex_waiter_less(aw, bw))
++ if (rt_waiter_node_less(&aw->tree, &bw->tree))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+- if (rt_mutex_waiter_less(bw, aw))
++ if (rt_waiter_node_less(&bw->tree, &aw->tree))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+@@ -432,48 +454,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
+ static __always_inline void
+ rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+ {
+- rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
++ lockdep_assert_held(&lock->wait_lock);
++
++ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
+ }
+
+ static __always_inline void
+ rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+ {
+- if (RB_EMPTY_NODE(&waiter->tree_entry))
++ lockdep_assert_held(&lock->wait_lock);
++
++ if (RB_EMPTY_NODE(&waiter->tree.entry))
+ return;
+
+- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
+- RB_CLEAR_NODE(&waiter->tree_entry);
++ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
++ RB_CLEAR_NODE(&waiter->tree.entry);
+ }
+
+-#define __node_2_pi_waiter(node) \
+- rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
++#define __node_2_rt_node(node) \
++ rb_entry((node), struct rt_waiter_node, entry)
+
+-static __always_inline bool
+-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
++static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+- return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
++ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
+ }
+
+ static __always_inline void
+ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+ {
+- rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
++ lockdep_assert_held(&task->pi_lock);
++
++ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
+ }
+
+ static __always_inline void
+ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+ {
+- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
++ lockdep_assert_held(&task->pi_lock);
++
++ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
+ return;
+
+- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
+- RB_CLEAR_NODE(&waiter->pi_tree_entry);
++ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
++ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ }
+
+-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
++static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
++ struct task_struct *p)
+ {
+ struct task_struct *pi_task = NULL;
+
++ lockdep_assert_held(&lock->wait_lock);
++ lockdep_assert(rt_mutex_owner(lock) == p);
+ lockdep_assert_held(&p->pi_lock);
+
+ if (task_has_pi_waiters(p))
+@@ -562,9 +594,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+- * [P] task->pi_lock held
++ * [Pn] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
++ * Normal locking order:
++ *
++ * rtmutex->wait_lock
++ * task->pi_lock
++ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+@@ -579,27 +616,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
+ * again:
+ * loop_sanity_check();
+ * retry:
+- * [1] lock(task->pi_lock); [R] acquire [P]
+- * [2] waiter = task->pi_blocked_on; [P]
+- * [3] check_exit_conditions_1(); [P]
+- * [4] lock = waiter->lock; [P]
+- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+- * unlock(task->pi_lock); release [P]
++ * [1] lock(task->pi_lock); [R] acquire [P1]
++ * [2] waiter = task->pi_blocked_on; [P1]
++ * [3] check_exit_conditions_1(); [P1]
++ * [4] lock = waiter->lock; [P1]
++ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
++ * unlock(task->pi_lock); release [P1]
+ * goto retry;
+ * }
+- * [6] check_exit_conditions_2(); [P] + [L]
+- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+- * [8] unlock(task->pi_lock); release [P]
++ * [6] check_exit_conditions_2(); [P1] + [L]
++ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
++ * [8] unlock(task->pi_lock); release [P1]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+- * lock(task->pi_lock); [L] acquire [P]
+- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+- * [12] check_exit_conditions_4(); [P] + [L]
+- * [13] unlock(task->pi_lock); release [P]
++ * lock(task->pi_lock); [L] acquire [P2]
++ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
++ * [12] check_exit_conditions_4(); [P2] + [L]
++ * [13] unlock(task->pi_lock); release [P2]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
++ *
++ * Where P1 is the blocking task and P2 is the lock owner; going up one step
++ * the owner becomes the next blocked task etc..
++ *
++*
+ */
+ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+@@ -747,7 +789,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
+ */
+- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
++ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+@@ -755,13 +797,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ }
+
+ /*
+- * [4] Get the next lock
++ * [4] Get the next lock; per holding task->pi_lock we can't unblock
++ * and guarantee @lock's existence.
+ */
+ lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
++ *
++ * Per the above, holding task->pi_lock guarantees lock exists, so
++ * inverting this lock order is infeasible from a life-time
++ * perspective.
+ */
+ if (!raw_spin_trylock(&lock->wait_lock)) {
+ raw_spin_unlock_irq(&task->pi_lock);
+@@ -865,17 +912,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+- *
+- * Even though pi_waiters also uses these fields, and that tree is only
+- * updated in [11], we can do this here, since we hold [L], which
+- * serializes all pi_waiters access and rb_erase() does not care about
+- * the values of the node being removed.
+ */
+ waiter_update_prio(waiter, task);
+
+ rt_mutex_enqueue(lock, waiter);
+
+- /* [8] Release the task */
++ /*
++ * [8] Release the (blocking) task in preparation for
++ * taking the owner task in [10].
++ *
++	 * Since we hold lock->wait_lock, task cannot unblock, even if we
++ * release task->pi_lock.
++ */
+ raw_spin_unlock(&task->pi_lock);
+ put_task_struct(task);
+
+@@ -899,7 +947,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ return 0;
+ }
+
+- /* [10] Grab the next task, i.e. the owner of @lock */
++ /*
++ * [10] Grab the next task, i.e. the owner of @lock
++ *
++ * Per holding lock->wait_lock and checking for !owner above, there
++ * must be an owner and it cannot go away.
++ */
+ task = get_task_struct(rt_mutex_owner(lock));
+ raw_spin_lock(&task->pi_lock);
+
+@@ -912,8 +965,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
++ waiter_clone_prio(waiter, task);
+ rt_mutex_enqueue_pi(task, waiter);
+- rt_mutex_adjust_prio(task);
++ rt_mutex_adjust_prio(lock, task);
+
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+@@ -928,8 +982,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ */
+ rt_mutex_dequeue_pi(task, waiter);
+ waiter = rt_mutex_top_waiter(lock);
++ waiter_clone_prio(waiter, task);
+ rt_mutex_enqueue_pi(task, waiter);
+- rt_mutex_adjust_prio(task);
++ rt_mutex_adjust_prio(lock, task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+@@ -1142,6 +1197,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ waiter->task = task;
+ waiter->lock = lock;
+ waiter_update_prio(waiter, task);
++ waiter_clone_prio(waiter, task);
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+@@ -1175,7 +1231,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ rt_mutex_dequeue_pi(owner, top_waiter);
+ rt_mutex_enqueue_pi(owner, waiter);
+
+- rt_mutex_adjust_prio(owner);
++ rt_mutex_adjust_prio(lock, owner);
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
+@@ -1222,6 +1278,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ {
+ struct rt_mutex_waiter *waiter;
+
++ lockdep_assert_held(&lock->wait_lock);
++
+ raw_spin_lock(¤t->pi_lock);
+
+ waiter = rt_mutex_top_waiter(lock);
+@@ -1234,7 +1292,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ * task unblocks.
+ */
+ rt_mutex_dequeue_pi(current, waiter);
+- rt_mutex_adjust_prio(current);
++ rt_mutex_adjust_prio(lock, current);
+
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+@@ -1471,7 +1529,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
+
+- rt_mutex_adjust_prio(owner);
++ rt_mutex_adjust_prio(lock, owner);
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
+index a461be2f873db..56d1938cb52a1 100644
+--- a/kernel/locking/rtmutex_api.c
++++ b/kernel/locking/rtmutex_api.c
+@@ -437,7 +437,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
++ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
+index c47e8361bfb5c..1162e07cdaea1 100644
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -17,27 +17,44 @@
+ #include <linux/rtmutex.h>
+ #include <linux/sched/wake_q.h>
+
++
++/*
++ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
++ * separate trees and they need their own copy of the sort keys because of
++ * different locking requirements.
++ *
++ * @entry: rbtree node to enqueue into the waiters tree
++ * @prio: Priority of the waiter
++ * @deadline: Deadline of the waiter if applicable
++ *
++ * See rt_waiter_node_less() and waiter_*_prio().
++ */
++struct rt_waiter_node {
++ struct rb_node entry;
++ int prio;
++ u64 deadline;
++};
++
+ /*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+- * @tree_entry: pi node to enqueue into the mutex waiters tree
+- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
++ * @tree: node to enqueue into the mutex waiters tree
++ * @pi_tree: node to enqueue into the mutex owner waiters tree
+ * @task: task reference to the blocked task
+ * @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
+- * @prio: Priority of the waiter
+- * @deadline: Deadline of the waiter if applicable
+ * @ww_ctx: WW context pointer
++ *
++ * @tree is ordered by @lock->wait_lock
++ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
+ */
+ struct rt_mutex_waiter {
+- struct rb_node tree_entry;
+- struct rb_node pi_tree_entry;
++ struct rt_waiter_node tree;
++ struct rt_waiter_node pi_tree;
+ struct task_struct *task;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
+- int prio;
+- u64 deadline;
+ struct ww_acquire_ctx *ww_ctx;
+ };
+
+@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ {
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+- return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
++ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
+ }
+
+ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
+@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
+
++ lockdep_assert_held(&lock->wait_lock);
++
+ if (leftmost) {
+- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
++ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
+ BUG_ON(w->lock != lock);
+ }
+ return w;
+@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
+
+ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
+ {
++ lockdep_assert_held(&p->pi_lock);
++
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+- pi_tree_entry);
++ pi_tree.entry);
+ }
+
+ #define RT_MUTEX_HAS_WAITERS 1UL
+@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+ static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+ {
+ debug_rt_mutex_init_waiter(waiter);
+- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+- RB_CLEAR_NODE(&waiter->tree_entry);
++ RB_CLEAR_NODE(&waiter->pi_tree.entry);
++ RB_CLEAR_NODE(&waiter->tree.entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+ }
+diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
+index 56f139201f246..3ad2cc4823e59 100644
+--- a/kernel/locking/ww_mutex.h
++++ b/kernel/locking/ww_mutex.h
+@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
++ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+ }
+
+ static inline struct rt_mutex_waiter *
+ __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+ {
+- struct rb_node *n = rb_next(&w->tree_entry);
++ struct rb_node *n = rb_next(&w->tree.entry);
+ if (!n)
+ return NULL;
+- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
++ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+ }
+
+ static inline struct rt_mutex_waiter *
+ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+ {
+- struct rb_node *n = rb_prev(&w->tree_entry);
++ struct rb_node *n = rb_prev(&w->tree.entry);
+ if (!n)
+ return NULL;
+- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
++ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+ }
+
+ static inline struct rt_mutex_waiter *
+@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
++ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+ }
+
+ static inline void
+--
+2.40.1
+