git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.5-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 4 Oct 2023 14:12:52 +0000 (16:12 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 4 Oct 2023 14:12:52 +0000 (16:12 +0200)
added patches:
kvm-svm-fix-tsc_aux-virtualization-setup.patch
kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch
kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch
kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch
kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch
misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch
mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch
mptcp-move-__mptcp_error_report-in-protocol.c.patch
mptcp-process-pending-subflow-error-on-close.patch
nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch
revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch
scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch
serial-8250_port-check-irq-data-before-use.patch
spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch
x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch
x86-srso-add-srso-mitigation-for-hygon-processors.patch

17 files changed:
queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch [new file with mode: 0644]
queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch [new file with mode: 0644]
queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch [new file with mode: 0644]
queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch [new file with mode: 0644]
queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch [new file with mode: 0644]
queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch [new file with mode: 0644]
queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch [new file with mode: 0644]
queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch [new file with mode: 0644]
queue-6.5/mptcp-process-pending-subflow-error-on-close.patch [new file with mode: 0644]
queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch [new file with mode: 0644]
queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch [new file with mode: 0644]
queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch [new file with mode: 0644]
queue-6.5/serial-8250_port-check-irq-data-before-use.patch [new file with mode: 0644]
queue-6.5/series
queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch [new file with mode: 0644]
queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch [new file with mode: 0644]
queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch [new file with mode: 0644]

diff --git a/queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch b/queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch
new file mode 100644 (file)
index 0000000..5994b8f
--- /dev/null
@@ -0,0 +1,124 @@
+From e0096d01c4fcb8c96c05643cfc2c20ab78eae4da Mon Sep 17 00:00:00 2001
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Fri, 15 Sep 2023 15:54:30 -0500
+Subject: KVM: SVM: Fix TSC_AUX virtualization setup
+
+From: Tom Lendacky <thomas.lendacky@amd.com>
+
+commit e0096d01c4fcb8c96c05643cfc2c20ab78eae4da upstream.
+
+The checks for virtualizing TSC_AUX occur during the vCPU reset processing
+path. However, at the time of initial vCPU reset processing, when the vCPU
+is first created, not all of the guest CPUID information has been set. In
+this case the RDTSCP and RDPID feature support for the guest is not in
+place and so TSC_AUX virtualization is not established.
+
+This continues for each vCPU created for the guest. On the first boot of
+an AP, vCPU reset processing is executed as a result of an APIC INIT
+event, this time with all of the guest CPUID information set, resulting
+in TSC_AUX virtualization being enabled, but only for the APs. The BSP
+always sees a TSC_AUX value of 0 which probably went unnoticed because,
+at least for Linux, the BSP TSC_AUX value is 0.
+
+Move the TSC_AUX virtualization enablement out of the init_vmcb() path and
+into the vcpu_after_set_cpuid() path to allow for proper initialization of
+the support after the guest CPUID information has been set.
+
+With the TSC_AUX virtualization support now in the vcpu_after_set_cpuid()
+path, the intercepts must be either cleared or set based on the guest
+CPUID input.
+
+Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts")
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Message-Id: <4137fbcb9008951ab5f0befa74a0399d2cce809a.1694811272.git.thomas.lendacky@amd.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/sev.c |   31 ++++++++++++++++++++++++++-----
+ arch/x86/kvm/svm/svm.c |    9 ++-------
+ arch/x86/kvm/svm/svm.h |    1 +
+ 3 files changed, 29 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -2945,6 +2945,32 @@ int sev_es_string_io(struct vcpu_svm *sv
+                                   count, in);
+ }
++static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
++{
++      struct kvm_vcpu *vcpu = &svm->vcpu;
++
++      if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
++              bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
++                               guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
++
++              set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
++      }
++}
++
++void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
++{
++      struct kvm_vcpu *vcpu = &svm->vcpu;
++      struct kvm_cpuid_entry2 *best;
++
++      /* For sev guests, the memory encryption bit is not reserved in CR3.  */
++      best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
++      if (best)
++              vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
++
++      if (sev_es_guest(svm->vcpu.kvm))
++              sev_es_vcpu_after_set_cpuid(svm);
++}
++
+ static void sev_es_init_vmcb(struct vcpu_svm *svm)
+ {
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+@@ -2991,11 +3017,6 @@ static void sev_es_init_vmcb(struct vcpu
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+-
+-      if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
+-          (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
+-           guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID)))
+-              set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
+ }
+ void sev_init_vmcb(struct vcpu_svm *svm)
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4217,7 +4217,6 @@ static bool svm_has_emulated_msr(struct
+ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+-      struct kvm_cpuid_entry2 *best;
+       vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+                                   boot_cpu_has(X86_FEATURE_XSAVE) &&
+@@ -4252,12 +4251,8 @@ static void svm_vcpu_after_set_cpuid(str
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+                                    !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+-      /* For sev guests, the memory encryption bit is not reserved in CR3.  */
+-      if (sev_guest(vcpu->kvm)) {
+-              best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+-              if (best)
+-                      vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+-      }
++      if (sev_guest(vcpu->kvm))
++              sev_vcpu_after_set_cpuid(svm);
+       init_vmcb_after_set_cpuid(vcpu);
+ }
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -733,6 +733,7 @@ void __init sev_hardware_setup(void);
+ void sev_hardware_unsetup(void);
+ int sev_cpu_init(struct svm_cpu_data *sd);
+ void sev_init_vmcb(struct vcpu_svm *svm);
++void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+ void sev_free_vcpu(struct kvm_vcpu *vcpu);
+ int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
+ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
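
The symptom described above (the BSP reading a TSC_AUX of 0) can be observed from inside an affected guest with a tiny user-space check. A minimal sketch, not part of the patch; the note that the low 12 bits hold the CPU id is a common Linux convention, not an architectural guarantee:

/* Minimal guest-side check: RDTSCP returns IA32_TSC_AUX in ECX, so a stale
 * value of 0 on the boot CPU (the symptom fixed by this patch) is easy to
 * spot.  Build with: gcc -O2 -o tsc_aux tsc_aux.c */
#include <stdint.h>
#include <stdio.h>

static uint32_t read_tsc_aux(void)
{
	uint32_t lo, hi, aux;

	__asm__ volatile("rdtscp" : "=a"(lo), "=d"(hi), "=c"(aux));
	(void)lo;
	(void)hi;
	return aux;
}

int main(void)
{
	uint32_t aux = read_tsc_aux();

	/* On Linux guests the low 12 bits usually hold the CPU number. */
	printf("TSC_AUX = %u (low 12 bits: %u)\n", aux, aux & 0xfff);
	return 0;
}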
diff --git a/queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch b/queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch
new file mode 100644 (file)
index 0000000..38c99a9
--- /dev/null
@@ -0,0 +1,38 @@
+From e8d93d5d93f85949e7299be289c6e7e1154b2f78 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Fri, 22 Sep 2023 17:06:34 -0400
+Subject: KVM: SVM: INTERCEPT_RDTSCP is never intercepted anyway
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit e8d93d5d93f85949e7299be289c6e7e1154b2f78 upstream.
+
+svm_recalc_instruction_intercepts() is always called at least once
+before the vCPU is started, so the setting or clearing of the RDTSCP
+intercept can be dropped from the TSC_AUX virtualization support.
+
+Extracted from a patch by Tom Lendacky.
+
+Cc: stable@vger.kernel.org
+Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/sev.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -2994,11 +2994,8 @@ static void sev_es_init_vmcb(struct vcpu
+       if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
+           (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
+-           guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
++           guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID)))
+               set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
+-              if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
+-                      svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+-      }
+ }
+ void sev_init_vmcb(struct vcpu_svm *svm)
diff --git a/queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch b/queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch
new file mode 100644 (file)
index 0000000..10aecfe
--- /dev/null
@@ -0,0 +1,122 @@
+From 441a5dfcd96854cbcb625709e2694a9c60adfaab Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 21 Sep 2023 05:44:56 -0400
+Subject: KVM: x86/mmu: Do not filter address spaces in for_each_tdp_mmu_root_yield_safe()
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 441a5dfcd96854cbcb625709e2694a9c60adfaab upstream.
+
+All callers except the MMU notifier want to process all address spaces.
+Remove the address space ID argument of for_each_tdp_mmu_root_yield_safe()
+and switch the MMU notifier to use __for_each_tdp_mmu_root_yield_safe().
+
+Extracted out of a patch by Sean Christopherson <seanjc@google.com>
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c     |    8 ++------
+ arch/x86/kvm/mmu/tdp_mmu.c |   22 +++++++++++-----------
+ arch/x86/kvm/mmu/tdp_mmu.h |    3 +--
+ 3 files changed, 14 insertions(+), 19 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -6294,7 +6294,6 @@ static bool kvm_rmap_zap_gfn_range(struc
+ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+ {
+       bool flush;
+-      int i;
+       if (WARN_ON_ONCE(gfn_end <= gfn_start))
+               return;
+@@ -6305,11 +6304,8 @@ void kvm_zap_gfn_range(struct kvm *kvm,
+       flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
+-      if (tdp_mmu_enabled) {
+-              for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+-                      flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
+-                                                    gfn_end, flush);
+-      }
++      if (tdp_mmu_enabled)
++              flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
+       if (flush)
+               kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -211,8 +211,12 @@ static struct kvm_mmu_page *tdp_mmu_next
+ #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)  \
+       __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)                 \
+-      __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
++#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                 \
++      for (_root = tdp_mmu_next_root(_kvm, NULL, false, false);               \
++           _root;                                                             \
++           _root = tdp_mmu_next_root(_kvm, _root, false, false))              \
++              if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) {           \
++              } else
+ /*
+  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
+@@ -877,12 +881,11 @@ static bool tdp_mmu_zap_leafs(struct kvm
+  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
+  * more SPTEs were zapped since the MMU lock was last acquired.
+  */
+-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+-                         bool flush)
++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
+ {
+       struct kvm_mmu_page *root;
+-      for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
++      for_each_tdp_mmu_root_yield_safe(kvm, root)
+               flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+       return flush;
+@@ -891,7 +894,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *k
+ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+ {
+       struct kvm_mmu_page *root;
+-      int i;
+       /*
+        * Zap all roots, including invalid roots, as all SPTEs must be dropped
+@@ -905,10 +907,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm
+        * is being destroyed or the userspace VMM has exited.  In both cases,
+        * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+        */
+-      for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+-              for_each_tdp_mmu_root_yield_safe(kvm, root, i)
+-                      tdp_mmu_zap_root(kvm, root, false);
+-      }
++      for_each_tdp_mmu_root_yield_safe(kvm, root)
++              tdp_mmu_zap_root(kvm, root, false);
+ }
+ /*
+@@ -1148,7 +1148,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct
+ {
+       struct kvm_mmu_page *root;
+-      for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id)
++      __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
+               flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
+                                         range->may_block, flush);
+--- a/arch/x86/kvm/mmu/tdp_mmu.h
++++ b/arch/x86/kvm/mmu/tdp_mmu.h
+@@ -20,8 +20,7 @@ __must_check static inline bool kvm_tdp_
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
+-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+-                         bool flush);
++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
+ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
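
The reworked for_each_tdp_mmu_root_yield_safe() above uses the "empty if / else" macro idiom so that a lockdep-style assertion runs on every iteration while the caller's loop body still binds to the macro's else branch. A stand-alone sketch of the idiom in generic C, with invented names, not KVM code:

#include <stdio.h>

struct node {
	int val;
	struct node *next;
};

/* Stand-in for kvm_lockdep_assert_mmu_lock_held(), evaluated each pass. */
static int lock_held(void)
{
	return 1;
}

/*
 * The empty "if (!...) { } else" keeps the caller's loop body attached to the
 * macro's own else branch: a later `else` written by the caller cannot pair
 * with the hidden if, and the body is skipped if the assertion ever fails.
 */
#define for_each_node(_n, _head)			\
	for (_n = (_head); _n; _n = _n->next)		\
		if (!lock_held()) {			\
		} else

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *n;

	for_each_node(n, &a)
		printf("%d\n", n->val);
	return 0;
}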
diff --git a/queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch b/queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch
new file mode 100644 (file)
index 0000000..0b487b4
--- /dev/null
@@ -0,0 +1,87 @@
+From 50107e8b2a8a59d8cec7e8454e27c1f8e365acdb Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 15 Sep 2023 17:39:14 -0700
+Subject: KVM: x86/mmu: Open code leaf invalidation from mmu_notifier
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 50107e8b2a8a59d8cec7e8454e27c1f8e365acdb upstream.
+
+The mmu_notifier path is a bit of a special snowflake, e.g. it zaps only a
+single address space (because it's per-slot), and can't always yield.
+Because of this, it calls kvm_tdp_mmu_zap_leafs() in ways that no one
+else does.
+
+Iterate manually over the leafs in response to an mmu_notifier
+invalidation, instead of invoking kvm_tdp_mmu_zap_leafs().  Drop the
+@can_yield param from kvm_tdp_mmu_zap_leafs() as its sole remaining
+caller unconditionally passes "true".
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20230916003916.2545000-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c     |    2 +-
+ arch/x86/kvm/mmu/tdp_mmu.c |   13 +++++++++----
+ arch/x86/kvm/mmu/tdp_mmu.h |    4 ++--
+ 3 files changed, 12 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -6308,7 +6308,7 @@ void kvm_zap_gfn_range(struct kvm *kvm,
+       if (tdp_mmu_enabled) {
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
+-                                                    gfn_end, true, flush);
++                                                    gfn_end, flush);
+       }
+       if (flush)
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -878,12 +878,12 @@ static bool tdp_mmu_zap_leafs(struct kvm
+  * more SPTEs were zapped since the MMU lock was last acquired.
+  */
+ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+-                         bool can_yield, bool flush)
++                         bool flush)
+ {
+       struct kvm_mmu_page *root;
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
+-              flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
++              flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+       return flush;
+ }
+@@ -1146,8 +1146,13 @@ retry:
+ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush)
+ {
+-      return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
+-                                   range->end, range->may_block, flush);
++      struct kvm_mmu_page *root;
++
++      for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id)
++              flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
++                                        range->may_block, flush);
++
++      return flush;
+ }
+ typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+--- a/arch/x86/kvm/mmu/tdp_mmu.h
++++ b/arch/x86/kvm/mmu/tdp_mmu.h
+@@ -20,8 +20,8 @@ __must_check static inline bool kvm_tdp_
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
+-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
+-                               gfn_t end, bool can_yield, bool flush);
++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
++                         bool flush);
+ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
diff --git a/queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch b/queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch
new file mode 100644 (file)
index 0000000..27b92e5
--- /dev/null
@@ -0,0 +1,411 @@
+From 0df9dab891ff0d9b646d82e4fe038229e4c02451 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 15 Sep 2023 17:39:15 -0700
+Subject: KVM: x86/mmu: Stop zapping invalidated TDP MMU roots asynchronously
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 0df9dab891ff0d9b646d82e4fe038229e4c02451 upstream.
+
+Stop zapping invalidated TDP MMU roots via work queue now that KVM
+preserves TDP MMU roots until they are explicitly invalidated.  Zapping
+roots asynchronously was effectively a workaround to avoid stalling a vCPU
+for an extended duration if a vCPU unloaded a root, which at the time
+happened whenever the guest toggled CR0.WP (a frequent operation for some
+guest kernels).
+
+While a clever hack, zapping roots via an unbound worker had subtle,
+unintended consequences on host scheduling, especially when zapping
+multiple roots, e.g. as part of a memslot.  Because the work of zapping a
+root is no longer bound to the task that initiated the zap, things like
+the CPU affinity and priority of the original task get lost.  Losing the
+affinity and priority can be especially problematic if unbound workqueues
+aren't affined to a small number of CPUs, as zapping multiple roots can
+cause KVM to heavily utilize the majority of CPUs in the system, *beyond*
+the CPUs KVM is already using to run vCPUs.
+
+When deleting a memslot via KVM_SET_USER_MEMORY_REGION, the async root
+zap can result in KVM occupying all logical CPUs for ~8ms, and result in
+high priority tasks not being scheduled in in a timely manner.  In v5.15,
+which doesn't preserve unloaded roots, the issues were even more noticeable
+as KVM would zap roots more frequently and could occupy all CPUs for 50ms+.
+
+Consuming all CPUs for an extended duration can lead to significant jitter
+throughout the system, e.g. on ChromeOS with virtio-gpu, deleting memslots
+is a semi-frequent operation as memslots are deleted and recreated with
+different host virtual addresses to react to host GPU drivers allocating
+and freeing GPU blobs.  On ChromeOS, the jitter manifests as audio blips
+during games due to the audio server's tasks not getting scheduled in
+promptly, despite the tasks having a high realtime priority.
+
+Deleting memslots isn't exactly a fast path and should be avoided when
+possible, and ChromeOS is working towards utilizing MAP_FIXED to avoid the
+memslot shenanigans, but KVM is squarely in the wrong.  Not to mention
+that removing the async zapping eliminates a non-trivial amount of
+complexity.
+
+Note, one of the subtle behaviors hidden behind the async zapping is that
+KVM would zap invalidated roots only once (ignoring partial zaps from
+things like mmu_notifier events).  Preserve this behavior by adding a flag
+to identify roots that are scheduled to be zapped versus roots that have
+already been zapped but not yet freed.
+
+Add a comment calling out why kvm_tdp_mmu_invalidate_all_roots() can
+encounter invalid roots, as it's not at all obvious why zapping
+invalidated roots shouldn't simply zap all invalid roots.
+
+Reported-by: Pattara Teerapong <pteerapong@google.com>
+Cc: David Stevens <stevensd@google.com>
+Cc: Yiwei Zhang <zzyiwei@google.com>
+Cc: Paul Hsia <paulhsia@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20230916003916.2545000-4-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/kvm_host.h |    3 
+ arch/x86/kvm/mmu/mmu.c          |   12 ---
+ arch/x86/kvm/mmu/mmu_internal.h |   15 ++--
+ arch/x86/kvm/mmu/tdp_mmu.c      |  133 ++++++++++++++++------------------------
+ arch/x86/kvm/mmu/tdp_mmu.h      |    2 
+ arch/x86/kvm/x86.c              |    5 -
+ 6 files changed, 68 insertions(+), 102 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1400,7 +1400,6 @@ struct kvm_arch {
+        * the thread holds the MMU lock in write mode.
+        */
+       spinlock_t tdp_mmu_pages_lock;
+-      struct workqueue_struct *tdp_mmu_zap_wq;
+ #endif /* CONFIG_X86_64 */
+       /*
+@@ -1814,7 +1813,7 @@ void kvm_mmu_vendor_module_exit(void);
+ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+ int kvm_mmu_create(struct kvm_vcpu *vcpu);
+-int kvm_mmu_init_vm(struct kvm *kvm);
++void kvm_mmu_init_vm(struct kvm *kvm);
+ void kvm_mmu_uninit_vm(struct kvm *kvm);
+ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -6206,21 +6206,17 @@ static void kvm_mmu_invalidate_zap_pages
+       kvm_mmu_zap_all_fast(kvm);
+ }
+-int kvm_mmu_init_vm(struct kvm *kvm)
++void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+-      int r;
+       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+       INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+       spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+-      if (tdp_mmu_enabled) {
+-              r = kvm_mmu_init_tdp_mmu(kvm);
+-              if (r < 0)
+-                      return r;
+-      }
++      if (tdp_mmu_enabled)
++              kvm_mmu_init_tdp_mmu(kvm);
+       node->track_write = kvm_mmu_pte_write;
+       node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
+@@ -6233,8 +6229,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
+       kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+       kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+-
+-      return 0;
+ }
+ static void mmu_free_vm_memory_caches(struct kvm *kvm)
+--- a/arch/x86/kvm/mmu/mmu_internal.h
++++ b/arch/x86/kvm/mmu/mmu_internal.h
+@@ -56,7 +56,12 @@ struct kvm_mmu_page {
+       bool tdp_mmu_page;
+       bool unsync;
+-      u8 mmu_valid_gen;
++      union {
++              u8 mmu_valid_gen;
++
++              /* Only accessed under slots_lock.  */
++              bool tdp_mmu_scheduled_root_to_zap;
++      };
+        /*
+         * The shadow page can't be replaced by an equivalent huge page
+@@ -98,13 +103,7 @@ struct kvm_mmu_page {
+               struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+               tdp_ptep_t ptep;
+       };
+-      union {
+-              DECLARE_BITMAP(unsync_child_bitmap, 512);
+-              struct {
+-                      struct work_struct tdp_mmu_async_work;
+-                      void *tdp_mmu_async_data;
+-              };
+-      };
++      DECLARE_BITMAP(unsync_child_bitmap, 512);
+       /*
+        * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -12,18 +12,10 @@
+ #include <trace/events/kvm.h>
+ /* Initializes the TDP MMU for the VM, if enabled. */
+-int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
++void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+ {
+-      struct workqueue_struct *wq;
+-
+-      wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
+-      if (!wq)
+-              return -ENOMEM;
+-
+       INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+       spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
+-      kvm->arch.tdp_mmu_zap_wq = wq;
+-      return 1;
+ }
+ /* Arbitrarily returns true so that this may be used in if statements. */
+@@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *
+        * ultimately frees all roots.
+        */
+       kvm_tdp_mmu_invalidate_all_roots(kvm);
+-
+-      /*
+-       * Destroying a workqueue also first flushes the workqueue, i.e. no
+-       * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
+-       */
+-      destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
++      kvm_tdp_mmu_zap_invalidated_roots(kvm);
+       WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+       WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+       /*
+        * Ensure that all the outstanding RCU callbacks to free shadow pages
+-       * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
+-       * can call kvm_tdp_mmu_put_root and create new callbacks.
++       * can run before the VM is torn down.  Putting the last reference to
++       * zapped roots will create new callbacks.
+        */
+       rcu_barrier();
+ }
+@@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback
+       tdp_mmu_free_sp(sp);
+ }
+-static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+-                           bool shared);
+-
+-static void tdp_mmu_zap_root_work(struct work_struct *work)
+-{
+-      struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
+-                                               tdp_mmu_async_work);
+-      struct kvm *kvm = root->tdp_mmu_async_data;
+-
+-      read_lock(&kvm->mmu_lock);
+-
+-      /*
+-       * A TLB flush is not necessary as KVM performs a local TLB flush when
+-       * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
+-       * to a different pCPU.  Note, the local TLB flush on reuse also
+-       * invalidates any paging-structure-cache entries, i.e. TLB entries for
+-       * intermediate paging structures, that may be zapped, as such entries
+-       * are associated with the ASID on both VMX and SVM.
+-       */
+-      tdp_mmu_zap_root(kvm, root, true);
+-
+-      /*
+-       * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
+-       * avoiding an infinite loop.  By design, the root is reachable while
+-       * it's being asynchronously zapped, thus a different task can put its
+-       * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
+-       * asynchronously zapped root is unavoidable.
+-       */
+-      kvm_tdp_mmu_put_root(kvm, root, true);
+-
+-      read_unlock(&kvm->mmu_lock);
+-}
+-
+-static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
+-{
+-      root->tdp_mmu_async_data = kvm;
+-      INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
+-      queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+-}
+-
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
+ {
+@@ -211,11 +158,11 @@ static struct kvm_mmu_page *tdp_mmu_next
+ #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)  \
+       __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                 \
+-      for (_root = tdp_mmu_next_root(_kvm, NULL, false, false);               \
++#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared)                        \
++      for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false);             \
+            _root;                                                             \
+-           _root = tdp_mmu_next_root(_kvm, _root, false, false))              \
+-              if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) {           \
++           _root = tdp_mmu_next_root(_kvm, _root, _shared, false))            \
++              if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) {         \
+               } else
+ /*
+@@ -296,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(stru
+        * by a memslot update or by the destruction of the VM.  Initialize the
+        * refcount to two; one reference for the vCPU, and one reference for
+        * the TDP MMU itself, which is held until the root is invalidated and
+-       * is ultimately put by tdp_mmu_zap_root_work().
++       * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
+        */
+       refcount_set(&root->tdp_mmu_root_count, 2);
+@@ -885,7 +832,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *k
+ {
+       struct kvm_mmu_page *root;
+-      for_each_tdp_mmu_root_yield_safe(kvm, root)
++      for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+               flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+       return flush;
+@@ -907,7 +854,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm
+        * is being destroyed or the userspace VMM has exited.  In both cases,
+        * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+        */
+-      for_each_tdp_mmu_root_yield_safe(kvm, root)
++      for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+               tdp_mmu_zap_root(kvm, root, false);
+ }
+@@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm
+  */
+ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+ {
+-      flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
++      struct kvm_mmu_page *root;
++
++      read_lock(&kvm->mmu_lock);
++
++      for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
++              if (!root->tdp_mmu_scheduled_root_to_zap)
++                      continue;
++
++              root->tdp_mmu_scheduled_root_to_zap = false;
++              KVM_BUG_ON(!root->role.invalid, kvm);
++
++              /*
++               * A TLB flush is not necessary as KVM performs a local TLB
++               * flush when allocating a new root (see kvm_mmu_load()), and
++               * when migrating a vCPU to a different pCPU.  Note, the local
++               * TLB flush on reuse also invalidates paging-structure-cache
++               * entries, i.e. TLB entries for intermediate paging structures,
++               * that may be zapped, as such entries are associated with the
++               * ASID on both VMX and SVM.
++               */
++              tdp_mmu_zap_root(kvm, root, true);
++
++              /*
++               * The reference needs to be put *after* zapping the root, as
++               * the root must be reachable by mmu_notifiers while it's being
++               * zapped
++               */
++              kvm_tdp_mmu_put_root(kvm, root, true);
++      }
++
++      read_unlock(&kvm->mmu_lock);
+ }
+ /*
+  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+  * is about to be zapped, e.g. in response to a memslots update.  The actual
+- * zapping is performed asynchronously.  Using a separate workqueue makes it
+- * easy to ensure that the destruction is performed before the "fast zap"
+- * completes, without keeping a separate list of invalidated roots; the list is
+- * effectively the list of work items in the workqueue.
++ * zapping is done separately so that it happens with mmu_lock held for read,
++ * whereas invalidating roots must be done with mmu_lock held for write (unless
++ * the VM is being destroyed).
+  *
+- * Note, the asynchronous worker is gifted the TDP MMU's reference.
++ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
+  * See kvm_tdp_mmu_get_vcpu_root_hpa().
+  */
+ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+@@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(st
+       /*
+        * As above, mmu_lock isn't held when destroying the VM!  There can't
+        * be other references to @kvm, i.e. nothing else can invalidate roots
+-       * or be consuming roots, but walking the list of roots does need to be
+-       * guarded against roots being deleted by the asynchronous zap worker.
++       * or get/put references to roots.
+        */
+-      rcu_read_lock();
+-
+-      list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
++      list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
++              /*
++               * Note, invalid roots can outlive a memslot update!  Invalid
++               * roots must be *zapped* before the memslot update completes,
++               * but a different task can acquire a reference and keep the
++               * root alive after it's been zapped.
++               */
+               if (!root->role.invalid) {
++                      root->tdp_mmu_scheduled_root_to_zap = true;
+                       root->role.invalid = true;
+-                      tdp_mmu_schedule_zap_root(kvm, root);
+               }
+       }
+-
+-      rcu_read_unlock();
+ }
+ /*
+--- a/arch/x86/kvm/mmu/tdp_mmu.h
++++ b/arch/x86/kvm/mmu/tdp_mmu.h
+@@ -7,7 +7,7 @@
+ #include "spte.h"
+-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
++void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -12302,9 +12302,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un
+       if (ret)
+               goto out;
+-      ret = kvm_mmu_init_vm(kvm);
+-      if (ret)
+-              goto out_page_track;
++      kvm_mmu_init_vm(kvm);
+       ret = static_call(kvm_x86_vm_init)(kvm);
+       if (ret)
+@@ -12349,7 +12347,6 @@ int kvm_arch_init_vm(struct kvm *kvm, un
+ out_uninit_mmu:
+       kvm_mmu_uninit_vm(kvm);
+-out_page_track:
+       kvm_page_track_cleanup(kvm);
+ out:
+       return ret;
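
Stripped of the KVM specifics, the patch above replaces "queue the zap on an unbound workqueue" with "flag the root, then reap it synchronously in the task that did the invalidation". A compressed sketch of that two-phase pattern, using invented names and a pthread rwlock in place of mmu_lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct root {
	struct root *next;
	bool invalid;             /* visible to readers: never reuse this root */
	bool scheduled_to_zap;    /* set once; consumed by the synchronous reaper */
};

struct mmu {
	pthread_rwlock_t lock;
	struct root *roots;
};

/* Phase 1: invalidation is quick and runs with the lock held exclusively. */
static void invalidate_all_roots(struct mmu *mmu)
{
	pthread_rwlock_wrlock(&mmu->lock);
	for (struct root *r = mmu->roots; r; r = r->next) {
		if (!r->invalid) {
			r->scheduled_to_zap = true;   /* zap it exactly once later */
			r->invalid = true;
		}
	}
	pthread_rwlock_unlock(&mmu->lock);
}

/* Phase 2: the task that triggered the invalidation reaps synchronously under
 * the shared lock, so its own CPU affinity and priority are respected. */
static void zap_invalidated_roots(struct mmu *mmu)
{
	pthread_rwlock_rdlock(&mmu->lock);
	for (struct root *r = mmu->roots; r; r = r->next) {
		if (!r->scheduled_to_zap)
			continue;
		r->scheduled_to_zap = false;
		printf("zapping root %p\n", (void *)r);   /* stand-in for the real zap */
	}
	pthread_rwlock_unlock(&mmu->lock);
}

int main(void)
{
	struct root b = { NULL, false, false }, a = { &b, false, false };
	struct mmu mmu = { PTHREAD_RWLOCK_INITIALIZER, &a };

	invalidate_all_roots(&mmu);
	zap_invalidated_roots(&mmu);
	return 0;
}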
diff --git a/queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch b/queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch
new file mode 100644 (file)
index 0000000..caf3e4b
--- /dev/null
@@ -0,0 +1,517 @@
+From 0e4cac557531a4c93de108d9ff11329fcad482ff Mon Sep 17 00:00:00 2001
+From: Ricky WU <ricky_wu@realtek.com>
+Date: Wed, 20 Sep 2023 09:11:19 +0000
+Subject: misc: rtsx: Fix some platforms can not boot and move the l1ss judgment to probe
+
+From: Ricky WU <ricky_wu@realtek.com>
+
+commit 0e4cac557531a4c93de108d9ff11329fcad482ff upstream.
+
+Since commit 101bd907b424 ("misc: rtsx: judge ASPM Mode to set PETXCFG Reg"),
+some readers no longer force #CLKREQ low when the system needs to enter
+ASPM. However, some platforms may not implement ASPM completely, which
+causes those platforms to fail to boot.
+
+As in the past, only release #CLKREQ when the platform supports L1SS.
+Move the L1SS judgment to probe; reading the config space once when the
+driver starts is enough.
+
+Fixes: 101bd907b424 ("misc: rtsx: judge ASPM Mode to set PETXCFG Reg")
+Cc: stable <stable@kernel.org>
+Reported-by: Paul Grandperrin <paul.grandperrin@gmail.com>
+Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
+Tested-By: Jade Lovelace <lists@jade.fyi>
+Link: https://lore.kernel.org/r/37b1afb997f14946a8784c73d1f9a4f5@realtek.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/misc/cardreader/rts5227.c  |   55 +++--------------------------------
+ drivers/misc/cardreader/rts5228.c  |   57 +++++++++++--------------------------
+ drivers/misc/cardreader/rts5249.c  |   56 ++++--------------------------------
+ drivers/misc/cardreader/rts5260.c  |   43 ++++++++-------------------
+ drivers/misc/cardreader/rts5261.c  |   52 ++++++++-------------------------
+ drivers/misc/cardreader/rtsx_pcr.c |   51 +++++++++++++++++++++++++++++----
+ 6 files changed, 102 insertions(+), 212 deletions(-)
+
+--- a/drivers/misc/cardreader/rts5227.c
++++ b/drivers/misc/cardreader/rts5227.c
+@@ -83,63 +83,20 @@ static void rts5227_fetch_vendor_setting
+ static void rts5227_init_from_cfg(struct rtsx_pcr *pcr)
+ {
+-      struct pci_dev *pdev = pcr->pci;
+-      int l1ss;
+-      u32 lval;
+       struct rtsx_cr_option *option = &pcr->option;
+-      l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
+-      if (!l1ss)
+-              return;
+-
+-      pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval);
+-
+       if (CHK_PCI_PID(pcr, 0x522A)) {
+-              if (0 == (lval & 0x0F))
+-                      rtsx_pci_enable_oobs_polling(pcr);
+-              else
++              if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
++                              | PM_L1_1_EN | PM_L1_2_EN))
+                       rtsx_pci_disable_oobs_polling(pcr);
++              else
++                      rtsx_pci_enable_oobs_polling(pcr);
+       }
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
+-              rtsx_set_dev_flag(pcr, PM_L1_1_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, PM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
+-              rtsx_set_dev_flag(pcr, PM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, PM_L1_2_EN);
+-
+       if (option->ltr_en) {
+-              u16 val;
+-
+-              pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &val);
+-              if (val & PCI_EXP_DEVCTL2_LTR_EN) {
+-                      option->ltr_enabled = true;
+-                      option->ltr_active = true;
++              if (option->ltr_enabled)
+                       rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+-              } else {
+-                      option->ltr_enabled = false;
+-              }
+       }
+-
+-      if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
+-                              | PM_L1_1_EN | PM_L1_2_EN))
+-              option->force_clkreq_0 = false;
+-      else
+-              option->force_clkreq_0 = true;
+-
+ }
+ static int rts5227_extra_init_hw(struct rtsx_pcr *pcr)
+@@ -195,7 +152,7 @@ static int rts5227_extra_init_hw(struct
+               }
+       }
+-      if (option->force_clkreq_0 && pcr->aspm_mode == ASPM_MODE_CFG)
++      if (option->force_clkreq_0)
+               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG,
+                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
+       else
+--- a/drivers/misc/cardreader/rts5228.c
++++ b/drivers/misc/cardreader/rts5228.c
+@@ -386,59 +386,25 @@ static void rts5228_process_ocp(struct r
+ static void rts5228_init_from_cfg(struct rtsx_pcr *pcr)
+ {
+-      struct pci_dev *pdev = pcr->pci;
+-      int l1ss;
+-      u32 lval;
+       struct rtsx_cr_option *option = &pcr->option;
+-      l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
+-      if (!l1ss)
+-              return;
+-
+-      pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval);
+-
+-      if (0 == (lval & 0x0F))
+-              rtsx_pci_enable_oobs_polling(pcr);
+-      else
++      if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
++                              | PM_L1_1_EN | PM_L1_2_EN))
+               rtsx_pci_disable_oobs_polling(pcr);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
+-              rtsx_set_dev_flag(pcr, PM_L1_1_EN);
+       else
+-              rtsx_clear_dev_flag(pcr, PM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
+-              rtsx_set_dev_flag(pcr, PM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, PM_L1_2_EN);
++              rtsx_pci_enable_oobs_polling(pcr);
+       rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0xFF, 0);
+-      if (option->ltr_en) {
+-              u16 val;
+-              pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &val);
+-              if (val & PCI_EXP_DEVCTL2_LTR_EN) {
+-                      option->ltr_enabled = true;
+-                      option->ltr_active = true;
++      if (option->ltr_en) {
++              if (option->ltr_enabled)
+                       rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+-              } else {
+-                      option->ltr_enabled = false;
+-              }
+       }
+ }
+ static int rts5228_extra_init_hw(struct rtsx_pcr *pcr)
+ {
++      struct rtsx_cr_option *option = &pcr->option;
+       rtsx_pci_write_register(pcr, RTS5228_AUTOLOAD_CFG1,
+                       CD_RESUME_EN_MASK, CD_RESUME_EN_MASK);
+@@ -469,6 +435,17 @@ static int rts5228_extra_init_hw(struct
+       else
+               rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00);
++      /*
++       * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced
++       * to drive low, and we forcibly request clock.
++       */
++      if (option->force_clkreq_0)
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
++      else
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH);
++
+       rtsx_pci_write_register(pcr, PWD_SUSPEND_EN, 0xFF, 0xFB);
+       if (pcr->rtd3_en) {
+--- a/drivers/misc/cardreader/rts5249.c
++++ b/drivers/misc/cardreader/rts5249.c
+@@ -86,64 +86,22 @@ static void rtsx_base_fetch_vendor_setti
+ static void rts5249_init_from_cfg(struct rtsx_pcr *pcr)
+ {
+-      struct pci_dev *pdev = pcr->pci;
+-      int l1ss;
+       struct rtsx_cr_option *option = &(pcr->option);
+-      u32 lval;
+-
+-      l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
+-      if (!l1ss)
+-              return;
+-
+-      pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval);
+       if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) {
+-              if (0 == (lval & 0x0F))
+-                      rtsx_pci_enable_oobs_polling(pcr);
+-              else
++              if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
++                              | PM_L1_1_EN | PM_L1_2_EN))
+                       rtsx_pci_disable_oobs_polling(pcr);
++              else
++                      rtsx_pci_enable_oobs_polling(pcr);
+       }
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
+-              rtsx_set_dev_flag(pcr, PM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
+-              rtsx_set_dev_flag(pcr, PM_L1_2_EN);
+-
+       if (option->ltr_en) {
+-              u16 val;
+-
+-              pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val);
+-              if (val & PCI_EXP_DEVCTL2_LTR_EN) {
+-                      option->ltr_enabled = true;
+-                      option->ltr_active = true;
++              if (option->ltr_enabled)
+                       rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+-              } else {
+-                      option->ltr_enabled = false;
+-              }
+       }
+ }
+-static int rts5249_init_from_hw(struct rtsx_pcr *pcr)
+-{
+-      struct rtsx_cr_option *option = &(pcr->option);
+-
+-      if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
+-                              | PM_L1_1_EN | PM_L1_2_EN))
+-              option->force_clkreq_0 = false;
+-      else
+-              option->force_clkreq_0 = true;
+-
+-      return 0;
+-}
+-
+ static void rts52xa_force_power_down(struct rtsx_pcr *pcr, u8 pm_state, bool runtime)
+ {
+       /* Set relink_time to 0 */
+@@ -276,7 +234,6 @@ static int rts5249_extra_init_hw(struct
+       struct rtsx_cr_option *option = &(pcr->option);
+       rts5249_init_from_cfg(pcr);
+-      rts5249_init_from_hw(pcr);
+       rtsx_pci_init_cmd(pcr);
+@@ -327,11 +284,12 @@ static int rts5249_extra_init_hw(struct
+               }
+       }
++
+       /*
+        * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced
+        * to drive low, and we forcibly request clock.
+        */
+-      if (option->force_clkreq_0 && pcr->aspm_mode == ASPM_MODE_CFG)
++      if (option->force_clkreq_0)
+               rtsx_pci_write_register(pcr, PETXCFG,
+                       FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
+       else
+--- a/drivers/misc/cardreader/rts5260.c
++++ b/drivers/misc/cardreader/rts5260.c
+@@ -480,47 +480,19 @@ static void rts5260_pwr_saving_setting(s
+ static void rts5260_init_from_cfg(struct rtsx_pcr *pcr)
+ {
+-      struct pci_dev *pdev = pcr->pci;
+-      int l1ss;
+       struct rtsx_cr_option *option = &pcr->option;
+-      u32 lval;
+-
+-      l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
+-      if (!l1ss)
+-              return;
+-
+-      pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
+-              rtsx_set_dev_flag(pcr, PM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
+-              rtsx_set_dev_flag(pcr, PM_L1_2_EN);
+       rts5260_pwr_saving_setting(pcr);
+       if (option->ltr_en) {
+-              u16 val;
+-
+-              pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val);
+-              if (val & PCI_EXP_DEVCTL2_LTR_EN) {
+-                      option->ltr_enabled = true;
+-                      option->ltr_active = true;
++              if (option->ltr_enabled)
+                       rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+-              } else {
+-                      option->ltr_enabled = false;
+-              }
+       }
+ }
+ static int rts5260_extra_init_hw(struct rtsx_pcr *pcr)
+ {
++      struct rtsx_cr_option *option = &pcr->option;
+       /* Set mcu_cnt to 7 to ensure data can be sampled properly */
+       rtsx_pci_write_register(pcr, 0xFC03, 0x7F, 0x07);
+@@ -539,6 +511,17 @@ static int rts5260_extra_init_hw(struct
+       rts5260_init_hw(pcr);
++      /*
++       * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced
++       * to drive low, and we forcibly request clock.
++       */
++      if (option->force_clkreq_0)
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
++      else
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH);
++
+       rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x10, 0x00);
+       return 0;
+--- a/drivers/misc/cardreader/rts5261.c
++++ b/drivers/misc/cardreader/rts5261.c
+@@ -454,54 +454,17 @@ static void rts5261_init_from_hw(struct
+ static void rts5261_init_from_cfg(struct rtsx_pcr *pcr)
+ {
+-      struct pci_dev *pdev = pcr->pci;
+-      int l1ss;
+-      u32 lval;
+       struct rtsx_cr_option *option = &pcr->option;
+-      l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
+-      if (!l1ss)
+-              return;
+-
+-      pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
+-              rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
+-              rtsx_set_dev_flag(pcr, PM_L1_1_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, PM_L1_1_EN);
+-
+-      if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
+-              rtsx_set_dev_flag(pcr, PM_L1_2_EN);
+-      else
+-              rtsx_clear_dev_flag(pcr, PM_L1_2_EN);
+-
+-      rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0xFF, 0);
+       if (option->ltr_en) {
+-              u16 val;
+-
+-              pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val);
+-              if (val & PCI_EXP_DEVCTL2_LTR_EN) {
+-                      option->ltr_enabled = true;
+-                      option->ltr_active = true;
++              if (option->ltr_enabled)
+                       rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+-              } else {
+-                      option->ltr_enabled = false;
+-              }
+       }
+ }
+ static int rts5261_extra_init_hw(struct rtsx_pcr *pcr)
+ {
++      struct rtsx_cr_option *option = &pcr->option;
+       u32 val;
+       rtsx_pci_write_register(pcr, RTS5261_AUTOLOAD_CFG1,
+@@ -547,6 +510,17 @@ static int rts5261_extra_init_hw(struct
+       else
+               rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00);
++      /*
++       * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced
++       * to drive low, and we forcibly request clock.
++       */
++      if (option->force_clkreq_0)
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
++      else
++              rtsx_pci_write_register(pcr, PETXCFG,
++                               FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH);
++
+       rtsx_pci_write_register(pcr, PWD_SUSPEND_EN, 0xFF, 0xFB);
+       if (pcr->rtd3_en) {
+--- a/drivers/misc/cardreader/rtsx_pcr.c
++++ b/drivers/misc/cardreader/rtsx_pcr.c
+@@ -1326,11 +1326,8 @@ static int rtsx_pci_init_hw(struct rtsx_
+                       return err;
+       }
+-      if (pcr->aspm_mode == ASPM_MODE_REG) {
++      if (pcr->aspm_mode == ASPM_MODE_REG)
+               rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0x30, 0x30);
+-              rtsx_pci_write_register(pcr, PETXCFG,
+-                              FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH);
+-      }
+       /* No CD interrupt if probing driver with card inserted.
+        * So we need to initialize pcr->card_exist here.
+@@ -1345,7 +1342,9 @@ static int rtsx_pci_init_hw(struct rtsx_
+ static int rtsx_pci_init_chip(struct rtsx_pcr *pcr)
+ {
+-      int err;
++      struct rtsx_cr_option *option = &(pcr->option);
++      int err, l1ss;
++      u32 lval;
+       u16 cfg_val;
+       u8 val;
+@@ -1430,6 +1429,48 @@ static int rtsx_pci_init_chip(struct rts
+                       pcr->aspm_enabled = true;
+       }
++      l1ss = pci_find_ext_capability(pcr->pci, PCI_EXT_CAP_ID_L1SS);
++      if (l1ss) {
++              pci_read_config_dword(pcr->pci, l1ss + PCI_L1SS_CTL1, &lval);
++
++              if (lval & PCI_L1SS_CTL1_ASPM_L1_1)
++                      rtsx_set_dev_flag(pcr, ASPM_L1_1_EN);
++              else
++                      rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN);
++
++              if (lval & PCI_L1SS_CTL1_ASPM_L1_2)
++                      rtsx_set_dev_flag(pcr, ASPM_L1_2_EN);
++              else
++                      rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN);
++
++              if (lval & PCI_L1SS_CTL1_PCIPM_L1_1)
++                      rtsx_set_dev_flag(pcr, PM_L1_1_EN);
++              else
++                      rtsx_clear_dev_flag(pcr, PM_L1_1_EN);
++
++              if (lval & PCI_L1SS_CTL1_PCIPM_L1_2)
++                      rtsx_set_dev_flag(pcr, PM_L1_2_EN);
++              else
++                      rtsx_clear_dev_flag(pcr, PM_L1_2_EN);
++
++              pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &cfg_val);
++              if (cfg_val & PCI_EXP_DEVCTL2_LTR_EN) {
++                      option->ltr_enabled = true;
++                      option->ltr_active = true;
++              } else {
++                      option->ltr_enabled = false;
++              }
++
++              if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
++                              | PM_L1_1_EN | PM_L1_2_EN))
++                      option->force_clkreq_0 = false;
++              else
++                      option->force_clkreq_0 = true;
++      } else {
++              option->ltr_enabled = false;
++              option->force_clkreq_0 = true;
++      }
++
+       if (pcr->ops->fetch_vendor_settings)
+               pcr->ops->fetch_vendor_settings(pcr);
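
The driver-side change above amounts to reading the L1SS capability once at probe time and caching the result, rather than consulting config space again on every ASPM transition. A hedged sketch of that probe-time caching pattern for a hypothetical PCI driver (names invented, not the rtsx code; the real driver also records the PCI-PM L1 substates and LTR state):

/* Hypothetical driver: cache the L1SS state once at probe.  The flag names
 * and the priv structure are invented for illustration. */
#include <linux/pci.h>

struct mydrv_priv {
	bool aspm_l1_1;
	bool aspm_l1_2;
	bool force_clkreq_low;
};

static void mydrv_cache_l1ss(struct pci_dev *pdev, struct mydrv_priv *priv)
{
	int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
	u32 ctl1 = 0;

	if (pos)
		pci_read_config_dword(pdev, pos + PCI_L1SS_CTL1, &ctl1);

	priv->aspm_l1_1 = ctl1 & PCI_L1SS_CTL1_ASPM_L1_1;
	priv->aspm_l1_2 = ctl1 & PCI_L1SS_CTL1_ASPM_L1_2;

	/* Only release CLKREQ# if some L1 substate is actually enabled. */
	priv->force_clkreq_low = !(priv->aspm_l1_1 || priv->aspm_l1_2);
}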
diff --git a/queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch b/queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch
new file mode 100644 (file)
index 0000000..55dced9
--- /dev/null
@@ -0,0 +1,50 @@
+From 6bec041147a2a64a490d1f813e8a004443061b38 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Sat, 16 Sep 2023 12:52:45 +0200
+Subject: mptcp: fix bogus receive window shrinkage with multiple subflows
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 6bec041147a2a64a490d1f813e8a004443061b38 upstream.
+
+In case multiple subflows race to update the mptcp-level receive
+window, the subflow losing the race should use the window value
+provided by the "winning" subflow to update it's own tcp-level
+rcv_wnd.
+
+To that end, the current code bogusly uses the mptcp-level rcv_wnd
+value as observed before the update attempt. In unlucky circumstances
+that may lead to TCP-level window shrinkage and stall the other end.
+
+Address the issue by feeding the correct value into the rcv_wnd update.
+
+Fixes: f3589be0c420 ("mptcp: never shrink offered window")
+Cc: stable@vger.kernel.org
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/427
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/options.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/net/mptcp/options.c
++++ b/net/mptcp/options.c
+@@ -1269,12 +1269,13 @@ static void mptcp_set_rwin(struct tcp_so
+                       if (rcv_wnd == rcv_wnd_old)
+                               break;
+-                      if (before64(rcv_wnd_new, rcv_wnd)) {
++
++                      rcv_wnd_old = rcv_wnd;
++                      if (before64(rcv_wnd_new, rcv_wnd_old)) {
+                               MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
+                               goto raise_win;
+                       }
+                       MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
+-                      rcv_wnd_old = rcv_wnd;
+               }
+               return;
+       }
diff --git a/queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch b/queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch
new file mode 100644 (file)
index 0000000..5c01d7e
--- /dev/null
@@ -0,0 +1,115 @@
+From d5fbeff1ab812b6c473b6924bee8748469462e2c Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Sat, 16 Sep 2023 12:52:46 +0200
+Subject: mptcp: move __mptcp_error_report in protocol.c
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit d5fbeff1ab812b6c473b6924bee8748469462e2c upstream.
+
+This will simplify the next patch ("mptcp: process pending subflow error
+on close").
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org # v5.12+
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |   36 ++++++++++++++++++++++++++++++++++++
+ net/mptcp/subflow.c  |   36 ------------------------------------
+ 2 files changed, 36 insertions(+), 36 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -772,6 +772,42 @@ static bool __mptcp_ofo_queue(struct mpt
+       return moved;
+ }
++void __mptcp_error_report(struct sock *sk)
++{
++      struct mptcp_subflow_context *subflow;
++      struct mptcp_sock *msk = mptcp_sk(sk);
++
++      mptcp_for_each_subflow(msk, subflow) {
++              struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
++              int err = sock_error(ssk);
++              int ssk_state;
++
++              if (!err)
++                      continue;
++
++              /* only propagate errors on fallen-back sockets or
++               * on MPC connect
++               */
++              if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
++                      continue;
++
++              /* We need to propagate only transition to CLOSE state.
++               * Orphaned socket will see such state change via
++               * subflow_sched_work_if_closed() and that path will properly
++               * destroy the msk as needed.
++               */
++              ssk_state = inet_sk_state_load(ssk);
++              if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
++                      inet_sk_state_store(sk, ssk_state);
++              WRITE_ONCE(sk->sk_err, -err);
++
++              /* This barrier is coupled with smp_rmb() in mptcp_poll() */
++              smp_wmb();
++              sk_error_report(sk);
++              break;
++      }
++}
++
+ /* In most cases we will be able to lock the mptcp socket.  If its already
+  * owned, we need to defer to the work queue to avoid ABBA deadlock.
+  */
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk,
+       *full_space = tcp_full_space(sk);
+ }
+-void __mptcp_error_report(struct sock *sk)
+-{
+-      struct mptcp_subflow_context *subflow;
+-      struct mptcp_sock *msk = mptcp_sk(sk);
+-
+-      mptcp_for_each_subflow(msk, subflow) {
+-              struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+-              int err = sock_error(ssk);
+-              int ssk_state;
+-
+-              if (!err)
+-                      continue;
+-
+-              /* only propagate errors on fallen-back sockets or
+-               * on MPC connect
+-               */
+-              if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
+-                      continue;
+-
+-              /* We need to propagate only transition to CLOSE state.
+-               * Orphaned socket will see such state change via
+-               * subflow_sched_work_if_closed() and that path will properly
+-               * destroy the msk as needed.
+-               */
+-              ssk_state = inet_sk_state_load(ssk);
+-              if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+-                      inet_sk_state_store(sk, ssk_state);
+-              WRITE_ONCE(sk->sk_err, -err);
+-
+-              /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+-              smp_wmb();
+-              sk_error_report(sk);
+-              break;
+-      }
+-}
+-
+ static void subflow_error_report(struct sock *ssk)
+ {
+       struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
diff --git a/queue-6.5/mptcp-process-pending-subflow-error-on-close.patch b/queue-6.5/mptcp-process-pending-subflow-error-on-close.patch
new file mode 100644 (file)
index 0000000..ca96115
--- /dev/null
@@ -0,0 +1,113 @@
+From 9f1a98813b4b686482e5ef3c9d998581cace0ba6 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Sat, 16 Sep 2023 12:52:47 +0200
+Subject: mptcp: process pending subflow error on close
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 9f1a98813b4b686482e5ef3c9d998581cace0ba6 upstream.
+
+On an incoming TCP reset, subflow closing could happen before error
+propagation. That in turn could cause the socket error to be ignored
+and the socket state transition to be missed, as reported by Daire-Byrne.
+
+Address the issues by explicitly checking for a subflow socket error at
+close time. To avoid code duplication, factor out of __mptcp_error_report()
+a new helper implementing the relevant bits.
+
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/429
+Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |   63 +++++++++++++++++++++++++++------------------------
+ 1 file changed, 34 insertions(+), 29 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -772,40 +772,44 @@ static bool __mptcp_ofo_queue(struct mpt
+       return moved;
+ }
+-void __mptcp_error_report(struct sock *sk)
++static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
+ {
+-      struct mptcp_subflow_context *subflow;
+-      struct mptcp_sock *msk = mptcp_sk(sk);
++      int err = sock_error(ssk);
++      int ssk_state;
+-      mptcp_for_each_subflow(msk, subflow) {
+-              struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+-              int err = sock_error(ssk);
+-              int ssk_state;
++      if (!err)
++              return false;
+-              if (!err)
+-                      continue;
++      /* only propagate errors on fallen-back sockets or
++       * on MPC connect
++       */
++      if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
++              return false;
+-              /* only propagate errors on fallen-back sockets or
+-               * on MPC connect
+-               */
+-              if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
+-                      continue;
++      /* We need to propagate only transition to CLOSE state.
++       * Orphaned socket will see such state change via
++       * subflow_sched_work_if_closed() and that path will properly
++       * destroy the msk as needed.
++       */
++      ssk_state = inet_sk_state_load(ssk);
++      if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
++              inet_sk_state_store(sk, ssk_state);
++      WRITE_ONCE(sk->sk_err, -err);
++
++      /* This barrier is coupled with smp_rmb() in mptcp_poll() */
++      smp_wmb();
++      sk_error_report(sk);
++      return true;
++}
+-              /* We need to propagate only transition to CLOSE state.
+-               * Orphaned socket will see such state change via
+-               * subflow_sched_work_if_closed() and that path will properly
+-               * destroy the msk as needed.
+-               */
+-              ssk_state = inet_sk_state_load(ssk);
+-              if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+-                      inet_sk_state_store(sk, ssk_state);
+-              WRITE_ONCE(sk->sk_err, -err);
+-
+-              /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+-              smp_wmb();
+-              sk_error_report(sk);
+-              break;
+-      }
++void __mptcp_error_report(struct sock *sk)
++{
++      struct mptcp_subflow_context *subflow;
++      struct mptcp_sock *msk = mptcp_sk(sk);
++
++      mptcp_for_each_subflow(msk, subflow)
++              if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
++                      break;
+ }
+ /* In most cases we will be able to lock the mptcp socket.  If its already
+@@ -2417,6 +2421,7 @@ static void __mptcp_close_ssk(struct soc
+       }
+ out_release:
++      __mptcp_subflow_error_report(sk, ssk);
+       release_sock(ssk);
+       sock_put(ssk);
diff --git a/queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch b/queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch
new file mode 100644 (file)
index 0000000..095d33a
--- /dev/null
@@ -0,0 +1,61 @@
+From 7ee29facd8a9c5a26079148e36bcf07141b3a6bc Mon Sep 17 00:00:00 2001
+From: Pan Bian <bianpan2016@163.com>
+Date: Thu, 21 Sep 2023 23:17:31 +0900
+Subject: nilfs2: fix potential use after free in nilfs_gccache_submit_read_data()
+
+From: Pan Bian <bianpan2016@163.com>
+
+commit 7ee29facd8a9c5a26079148e36bcf07141b3a6bc upstream.
+
+In nilfs_gccache_submit_read_data(), brelse(bh) is called to drop the
+reference count of bh when the call to nilfs_dat_translate() fails.  If
+the reference count hits 0 and its owner page gets unlocked, bh may be
+freed.  However, bh->b_page is dereferenced to put the page after that,
+which may result in a use-after-free bug.  This patch moves the release
+operation after unlocking and putting the page.
+
+NOTE: The function in question is only called in GC, and in combination
+with current userland tools, address translation using DAT does not occur
+in that function, so the code path that causes this issue will not be
+executed.  However, it is possible to run that code path by intentionally
+modifying the userland GC library or by calling the GC ioctl directly.
+
+[konishi.ryusuke@gmail.com: NOTE added to the commit log]
+Link: https://lkml.kernel.org/r/1543201709-53191-1-git-send-email-bianpan2016@163.com
+Link: https://lkml.kernel.org/r/20230921141731.10073-1-konishi.ryusuke@gmail.com
+Fixes: a3d93f709e89 ("nilfs2: block cache for garbage collection")
+Signed-off-by: Pan Bian <bianpan2016@163.com>
+Reported-by: Ferry Meng <mengferry@linux.alibaba.com>
+Closes: https://lkml.kernel.org/r/20230818092022.111054-1-mengferry@linux.alibaba.com
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/gcinode.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/nilfs2/gcinode.c
++++ b/fs/nilfs2/gcinode.c
+@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struc
+               struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+               err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
+-              if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
+-                      brelse(bh);
++              if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */
+                       goto failed;
+-              }
+       }
+       lock_buffer(bh);
+@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struc
+  failed:
+       unlock_page(bh->b_page);
+       put_page(bh->b_page);
++      if (unlikely(err))
++              brelse(bh);
+       return err;
+ }
diff --git a/queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch b/queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch
new file mode 100644 (file)
index 0000000..1dde906
--- /dev/null
@@ -0,0 +1,68 @@
+From 29346e217b8ab8a52889b88f00b268278d6b7668 Mon Sep 17 00:00:00 2001
+From: Daniel Starke <daniel.starke@siemens.com>
+Date: Thu, 14 Sep 2023 07:15:07 +0200
+Subject: Revert "tty: n_gsm: fix UAF in gsm_cleanup_mux"
+
+From: Daniel Starke <daniel.starke@siemens.com>
+
+commit 29346e217b8ab8a52889b88f00b268278d6b7668 upstream.
+
+This reverts commit 9b9c8195f3f0d74a826077fc1c01b9ee74907239.
+
+The commit above is reverted as it did not solve the original issue.
+
+gsm_cleanup_mux() tries to free up the virtual ttys by calling
+gsm_dlci_release() for each available DLCI. There, dlci_put() is called to
+decrease the reference counter for the DLCI via tty_port_put() which
+finally calls gsm_dlci_free(). This already clears the pointer which is
+being checked in gsm_cleanup_mux() before calling gsm_dlci_release().
+Therefore, it is not necessary to clear this pointer in gsm_cleanup_mux()
+as done in the reverted commit. The commit introduces a null pointer
+dereference:
+ <TASK>
+ ? __die+0x1f/0x70
+ ? page_fault_oops+0x156/0x420
+ ? search_exception_tables+0x37/0x50
+ ? fixup_exception+0x21/0x310
+ ? exc_page_fault+0x69/0x150
+ ? asm_exc_page_fault+0x26/0x30
+ ? tty_port_put+0x19/0xa0
+ gsmtty_cleanup+0x29/0x80 [n_gsm]
+ release_one_tty+0x37/0xe0
+ process_one_work+0x1e6/0x3e0
+ worker_thread+0x4c/0x3d0
+ ? __pfx_worker_thread+0x10/0x10
+ kthread+0xe1/0x110
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork+0x2f/0x50
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork_asm+0x1b/0x30
+ </TASK>
+
+The actual issue is that nothing guards dlci_put() from being called
+multiple times while the tty driver has been triggered but has not yet
+finished calling gsm_dlci_free().
+
+Fixes: 9b9c8195f3f0 ("tty: n_gsm: fix UAF in gsm_cleanup_mux")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Daniel Starke <daniel.starke@siemens.com>
+Link: https://lore.kernel.org/r/20230914051507.3240-1-daniel.starke@siemens.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/n_gsm.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/drivers/tty/n_gsm.c
++++ b/drivers/tty/n_gsm.c
+@@ -3071,10 +3071,8 @@ static void gsm_cleanup_mux(struct gsm_m
+               gsm->has_devices = false;
+       }
+       for (i = NUM_DLCI - 1; i >= 0; i--)
+-              if (gsm->dlci[i]) {
++              if (gsm->dlci[i])
+                       gsm_dlci_release(gsm->dlci[i]);
+-                      gsm->dlci[i] = NULL;
+-              }
+       mutex_unlock(&gsm->mutex);
+       /* Now wipe the queues */
+       tty_ldisc_flush(gsm->tty);
diff --git a/queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch b/queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch
new file mode 100644 (file)
index 0000000..ef3bc5f
--- /dev/null
@@ -0,0 +1,101 @@
+From 2132df16f53b4f01ab25f5d404f36a22244ae342 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <dlemoal@kernel.org>
+Date: Fri, 15 Sep 2023 11:20:34 +0900
+Subject: scsi: core: ata: Do no try to probe for CDL on old drives
+
+From: Damien Le Moal <dlemoal@kernel.org>
+
+commit 2132df16f53b4f01ab25f5d404f36a22244ae342 upstream.
+
+Some old drives (e.g. an Ultra320 SCSI disk as reported by John) do not
+seem to execute MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES
+commands correctly and hang when a non-zero service action is specified
+(the "one command format with service action" case in scsi_report_opcode()).
+
+Currently, CDL probing with scsi_cdl_check_cmd() is the only caller using a
+non-zero service action for scsi_report_opcode(). To avoid issues with
+these old drives, do not attempt CDL probe if the device reports support
+for an SPC version lower than 5 (CDL was introduced in SPC-5). To keep
+things working with ATA devices which probe for the CDL T2A and T2B pages
+introduced with SPC-6, modify ata_scsiop_inq_std() to claim SPC-6 version
+compatibility for ATA drives supporting CDL.
+
+SPC-6 standard version number is defined as Dh (= 13) in SPC-6 r09. Fix
+scsi_probe_lun() to correctly capture this value by changing the bit mask
+for the second byte of the INQUIRY response from 0x7 to 0xf.
+include/scsi/scsi.h is modified to add the definition SCSI_SPC_6 with the
+value 14 (Dh + 1). The missing definitions for the SCSI_SPC_4 and
+SCSI_SPC_5 versions are also added.
+
+Reported-by: John David Anglin <dave.anglin@bell.net>
+Fixes: 624885209f31 ("scsi: core: Detect support for command duration limits")
+Cc: stable@vger.kernel.org
+Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
+Link: https://lore.kernel.org/r/20230915022034.678121-1-dlemoal@kernel.org
+Tested-by: David Gow <david@davidgow.net>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Reviewed-by: Niklas Cassel <niklas.cassel@wdc.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/ata/libata-scsi.c |    3 +++
+ drivers/scsi/scsi.c       |   11 +++++++++++
+ drivers/scsi/scsi_scan.c  |    2 +-
+ include/scsi/scsi.h       |    3 +++
+ 4 files changed, 18 insertions(+), 1 deletion(-)
+
+--- a/drivers/ata/libata-scsi.c
++++ b/drivers/ata/libata-scsi.c
+@@ -1892,6 +1892,9 @@ static unsigned int ata_scsiop_inq_std(s
+               hdr[2] = 0x7; /* claim SPC-5 version compatibility */
+       }
++      if (args->dev->flags & ATA_DFLAG_CDL)
++              hdr[2] = 0xd; /* claim SPC-6 version compatibility */
++
+       memcpy(rbuf, hdr, sizeof(hdr));
+       memcpy(&rbuf[8], "ATA     ", 8);
+       ata_id_string(args->id, &rbuf[16], ATA_ID_PROD, 16);
+--- a/drivers/scsi/scsi.c
++++ b/drivers/scsi/scsi.c
+@@ -613,6 +613,17 @@ void scsi_cdl_check(struct scsi_device *
+       bool cdl_supported;
+       unsigned char *buf;
++      /*
++       * Support for CDL was defined in SPC-5. Ignore devices reporting an
++       * lower SPC version. This also avoids problems with old drives choking
++       * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a
++       * service action specified, as done in scsi_cdl_check_cmd().
++       */
++      if (sdev->scsi_level < SCSI_SPC_5) {
++              sdev->cdl_supported = 0;
++              return;
++      }
++
+       buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL);
+       if (!buf) {
+               sdev->cdl_supported = 0;
+--- a/drivers/scsi/scsi_scan.c
++++ b/drivers/scsi/scsi_scan.c
+@@ -822,7 +822,7 @@ static int scsi_probe_lun(struct scsi_de
+        * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so
+        * non-zero LUNs can be scanned.
+        */
+-      sdev->scsi_level = inq_result[2] & 0x07;
++      sdev->scsi_level = inq_result[2] & 0x0f;
+       if (sdev->scsi_level >= 2 ||
+           (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))
+               sdev->scsi_level++;
+--- a/include/scsi/scsi.h
++++ b/include/scsi/scsi.h
+@@ -157,6 +157,9 @@ enum scsi_disposition {
+ #define SCSI_3          4        /* SPC */
+ #define SCSI_SPC_2      5
+ #define SCSI_SPC_3      6
++#define SCSI_SPC_4    7
++#define SCSI_SPC_5    8
++#define SCSI_SPC_6    14
+ /*
+  * INQ PERIPHERAL QUALIFIERS
diff --git a/queue-6.5/serial-8250_port-check-irq-data-before-use.patch b/queue-6.5/serial-8250_port-check-irq-data-before-use.patch
new file mode 100644 (file)
index 0000000..7f79fbf
--- /dev/null
@@ -0,0 +1,49 @@
+From cce7fc8b29961b64fadb1ce398dc5ff32a79643b Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Date: Fri, 1 Sep 2023 01:25:55 +0300
+Subject: serial: 8250_port: Check IRQ data before use
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+commit cce7fc8b29961b64fadb1ce398dc5ff32a79643b upstream.
+
+In case the leaf driver wants to use IRQ polling (irq = 0) and the
+IIR register shows that an interrupt happened in the 8250 hardware,
+the IRQ data can be NULL. In such a case we need to skip the wake
+event, as we came to this path from the timer interrupt and quite
+likely the system is already awake.
+
+Without this fix we have got an Oops:
+
+    serial8250: ttyS0 at I/O 0x3f8 (irq = 0, base_baud = 115200) is a 16550A
+    ...
+    BUG: kernel NULL pointer dereference, address: 0000000000000010
+    RIP: 0010:serial8250_handle_irq+0x7c/0x240
+    Call Trace:
+     ? serial8250_handle_irq+0x7c/0x240
+     ? __pfx_serial8250_timeout+0x10/0x10
+
+Fixes: 0ba9e3a13c6a ("serial: 8250: Add missing wakeup event reporting")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Link: https://lore.kernel.org/r/20230831222555.614426-1-andriy.shevchenko@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/8250/8250_port.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/tty/serial/8250/8250_port.c
++++ b/drivers/tty/serial/8250/8250_port.c
+@@ -1929,7 +1929,10 @@ int serial8250_handle_irq(struct uart_po
+               skip_rx = true;
+       if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) {
+-              if (irqd_is_wakeup_set(irq_get_irq_data(port->irq)))
++              struct irq_data *d;
++
++              d = irq_get_irq_data(port->irq);
++              if (d && irqd_is_wakeup_set(d))
+                       pm_wakeup_event(tport->tty->dev, 0);
+               if (!up->dma || handle_rx_dma(up, iir))
+                       status = serial8250_rx_chars(up, status);
index 35d11bb8091272f9e05ad21a2095dfbd0b406e5d..4dd58acffa9a04ed6b3f4e4b56fc344c9c201ec9 100644 (file)
@@ -237,3 +237,19 @@ fbdev-sh7760fb-depend-on-fb-y.patch
 perf-build-define-yynomem-as-yynoabort-for-bison-3.8.patch
 asoc-cs35l56-call-pm_runtime_dont_use_autosuspend.patch
 iommu-arm-smmu-v3-fix-soft-lockup-triggered-by-arm_s.patch
+spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch
+x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch
+x86-srso-add-srso-mitigation-for-hygon-processors.patch
+kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch
+kvm-svm-fix-tsc_aux-virtualization-setup.patch
+kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch
+kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch
+kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch
+mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch
+mptcp-move-__mptcp_error_report-in-protocol.c.patch
+mptcp-process-pending-subflow-error-on-close.patch
+misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch
+revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch
+scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch
+serial-8250_port-check-irq-data-before-use.patch
+nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch
diff --git a/queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch b/queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch
new file mode 100644 (file)
index 0000000..7a1ba81
--- /dev/null
@@ -0,0 +1,57 @@
+From 1527b076ae2cb6a9c590a02725ed39399fcad1cf Mon Sep 17 00:00:00 2001
+From: Johan Hovold <johan+linaro@kernel.org>
+Date: Thu, 22 Jun 2023 10:24:35 +0200
+Subject: spi: zynqmp-gqspi: fix clock imbalance on probe failure
+
+From: Johan Hovold <johan+linaro@kernel.org>
+
+commit 1527b076ae2cb6a9c590a02725ed39399fcad1cf upstream.
+
+Make sure that the device is not runtime suspended before explicitly
+disabling the clocks on probe failure and on driver unbind to avoid a
+clock enable-count imbalance.
+
+Fixes: 9e3a000362ae ("spi: zynqmp: Add pm runtime support")
+Cc: stable@vger.kernel.org     # 4.19
+Cc: Naga Sureshkumar Relli <naga.sureshkumar.relli@xilinx.com>
+Cc: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
+Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
+Link: https://lore.kernel.org/r/Message-Id: <20230622082435.7873-1-johan+linaro@kernel.org>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/spi/spi-zynqmp-gqspi.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/drivers/spi/spi-zynqmp-gqspi.c
++++ b/drivers/spi/spi-zynqmp-gqspi.c
+@@ -1342,9 +1342,9 @@ static int zynqmp_qspi_probe(struct plat
+       return 0;
+ clk_dis_all:
+-      pm_runtime_put_sync(&pdev->dev);
+-      pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
++      pm_runtime_put_noidle(&pdev->dev);
++      pm_runtime_set_suspended(&pdev->dev);
+       clk_disable_unprepare(xqspi->refclk);
+ clk_dis_pclk:
+       clk_disable_unprepare(xqspi->pclk);
+@@ -1368,11 +1368,15 @@ static void zynqmp_qspi_remove(struct pl
+ {
+       struct zynqmp_qspi *xqspi = platform_get_drvdata(pdev);
++      pm_runtime_get_sync(&pdev->dev);
++
+       zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0);
++
++      pm_runtime_disable(&pdev->dev);
++      pm_runtime_put_noidle(&pdev->dev);
++      pm_runtime_set_suspended(&pdev->dev);
+       clk_disable_unprepare(xqspi->refclk);
+       clk_disable_unprepare(xqspi->pclk);
+-      pm_runtime_set_suspended(&pdev->dev);
+-      pm_runtime_disable(&pdev->dev);
+ }
+ MODULE_DEVICE_TABLE(of, zynqmp_qspi_of_match);
diff --git a/queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch b/queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch
new file mode 100644 (file)
index 0000000..bda57a0
--- /dev/null
@@ -0,0 +1,112 @@
+From c6c2adcba50c2622ed25ba5d5e7f05f584711358 Mon Sep 17 00:00:00 2001
+From: Haitao Huang <haitao.huang@linux.intel.com>
+Date: Thu, 27 Jul 2023 22:10:24 -0700
+Subject: x86/sgx: Resolves SECS reclaim vs. page fault for EAUG race
+
+From: Haitao Huang <haitao.huang@linux.intel.com>
+
+commit c6c2adcba50c2622ed25ba5d5e7f05f584711358 upstream.
+
+The SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an
+enclave and set secs.epc_page to NULL. The SECS page is used for EAUG
+and ELDU in the SGX page fault handler. However, the NULL check for
+secs.epc_page is only done for ELDU, not for EAUG, before the page is used.
+
+Fix this by doing the same NULL check and reloading of the SECS page as
+needed for both EAUG and ELDU.
+
+The SECS page holds global enclave metadata. It can only be reclaimed
+when there are no other enclave pages remaining. At that point,
+virtually nothing can be done with the enclave until the SECS page is
+paged back in.
+
+An enclave cannot run or generate page faults without a resident SECS
+page. But it is still possible for a #PF for a non-SECS page to race
+with paging out the SECS page: the last resident non-SECS page A
+triggers a #PF in a non-resident page B, and then both page A and the
+SECS page are paged out before the #PF on B is handled.
+
+Hitting this bug requires that the race be triggered by a #PF for EAUG.
+The following is a trace captured when it happens.
+
+BUG: kernel NULL pointer dereference, address: 0000000000000000
+RIP: 0010:sgx_encl_eaug_page+0xc7/0x210
+Call Trace:
+ ? __kmem_cache_alloc_node+0x16a/0x440
+ ? xa_load+0x6e/0xa0
+ sgx_vma_fault+0x119/0x230
+ __do_fault+0x36/0x140
+ do_fault+0x12f/0x400
+ __handle_mm_fault+0x728/0x1110
+ handle_mm_fault+0x105/0x310
+ do_user_addr_fault+0x1ee/0x750
+ ? __this_cpu_preempt_check+0x13/0x20
+ exc_page_fault+0x76/0x180
+ asm_exc_page_fault+0x27/0x30
+
+Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave")
+Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Acked-by: Reinette Chatre <reinette.chatre@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20230728051024.33063-1-haitao.huang%40linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/sgx/encl.c |   30 +++++++++++++++++++++++++-----
+ 1 file changed, 25 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/kernel/cpu/sgx/encl.c
++++ b/arch/x86/kernel/cpu/sgx/encl.c
+@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eld
+       return epc_page;
+ }
++/*
++ * Ensure the SECS page is not swapped out.  Must be called with encl->lock
++ * to protect the enclave states including SECS and ensure the SECS page is
++ * not swapped out again while being used.
++ */
++static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
++{
++      struct sgx_epc_page *epc_page = encl->secs.epc_page;
++
++      if (!epc_page)
++              epc_page = sgx_encl_eldu(&encl->secs, NULL);
++
++      return epc_page;
++}
++
+ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+                                                 struct sgx_encl_page *entry)
+ {
+@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_
+               return entry;
+       }
+-      if (!(encl->secs.epc_page)) {
+-              epc_page = sgx_encl_eldu(&encl->secs, NULL);
+-              if (IS_ERR(epc_page))
+-                      return ERR_CAST(epc_page);
+-      }
++      epc_page = sgx_encl_load_secs(encl);
++      if (IS_ERR(epc_page))
++              return ERR_CAST(epc_page);
+       epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+       if (IS_ERR(epc_page))
+@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(str
+       mutex_lock(&encl->lock);
++      epc_page = sgx_encl_load_secs(encl);
++      if (IS_ERR(epc_page)) {
++              if (PTR_ERR(epc_page) == -EBUSY)
++                      vmret = VM_FAULT_NOPAGE;
++              goto err_out_unlock;
++      }
++
+       epc_page = sgx_alloc_epc_page(encl_page, false);
+       if (IS_ERR(epc_page)) {
+               if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch b/queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch
new file mode 100644 (file)
index 0000000..d8b8ac0
--- /dev/null
@@ -0,0 +1,33 @@
+From a5ef7d68cea1344cf524f04981c2b3f80bedbb0d Mon Sep 17 00:00:00 2001
+From: Pu Wen <puwen@hygon.cn>
+Date: Thu, 28 Sep 2023 14:59:16 +0800
+Subject: x86/srso: Add SRSO mitigation for Hygon processors
+
+From: Pu Wen <puwen@hygon.cn>
+
+commit a5ef7d68cea1344cf524f04981c2b3f80bedbb0d upstream.
+
+Add mitigation for the speculative return stack overflow vulnerability
+which exists on Hygon processors too.
+
+Signed-off-by: Pu Wen <puwen@hygon.cn>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/tencent_4A14812842F104E93AA722EC939483CEFF05@qq.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/common.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1288,7 +1288,7 @@ static const struct x86_cpu_id cpu_vuln_
+       VULNBL_AMD(0x15, RETBLEED),
+       VULNBL_AMD(0x16, RETBLEED),
+       VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
+-      VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
++      VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
+       VULNBL_AMD(0x19, SRSO),
+       {}
+ };