From: Greg Kroah-Hartman Date: Wed, 4 Oct 2023 14:12:52 +0000 (+0200) Subject: 6.5-stable patches X-Git-Tag: v6.5.6~24 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1fe16cec387852b1eecf1d07ea5c928f496c335e;p=thirdparty%2Fkernel%2Fstable-queue.git 6.5-stable patches added patches: kvm-svm-fix-tsc_aux-virtualization-setup.patch kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch mptcp-move-__mptcp_error_report-in-protocol.c.patch mptcp-process-pending-subflow-error-on-close.patch nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch serial-8250_port-check-irq-data-before-use.patch spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch x86-srso-add-srso-mitigation-for-hygon-processors.patch --- diff --git a/queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch b/queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch new file mode 100644 index 00000000000..5994b8f787f --- /dev/null +++ b/queue-6.5/kvm-svm-fix-tsc_aux-virtualization-setup.patch @@ -0,0 +1,124 @@ +From e0096d01c4fcb8c96c05643cfc2c20ab78eae4da Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Fri, 15 Sep 2023 15:54:30 -0500 +Subject: KVM: SVM: Fix TSC_AUX virtualization setup + +From: Tom Lendacky + +commit e0096d01c4fcb8c96c05643cfc2c20ab78eae4da upstream. + +The checks for virtualizing TSC_AUX occur during the vCPU reset processing +path. However, at the time of initial vCPU reset processing, when the vCPU +is first created, not all of the guest CPUID information has been set. In +this case the RDTSCP and RDPID feature support for the guest is not in +place and so TSC_AUX virtualization is not established. + +This continues for each vCPU created for the guest. On the first boot of +an AP, vCPU reset processing is executed as a result of an APIC INIT +event, this time with all of the guest CPUID information set, resulting +in TSC_AUX virtualization being enabled, but only for the APs. The BSP +always sees a TSC_AUX value of 0 which probably went unnoticed because, +at least for Linux, the BSP TSC_AUX value is 0. + +Move the TSC_AUX virtualization enablement out of the init_vmcb() path and +into the vcpu_after_set_cpuid() path to allow for proper initialization of +the support after the guest CPUID information has been set. + +With the TSC_AUX virtualization support now in the vcpu_set_after_cpuid() +path, the intercepts must be either cleared or set based on the guest +CPUID input. 
+ +Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts") +Signed-off-by: Tom Lendacky +Message-Id: <4137fbcb9008951ab5f0befa74a0399d2cce809a.1694811272.git.thomas.lendacky@amd.com> +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/sev.c | 31 ++++++++++++++++++++++++++----- + arch/x86/kvm/svm/svm.c | 9 ++------- + arch/x86/kvm/svm/svm.h | 1 + + 3 files changed, 29 insertions(+), 12 deletions(-) + +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2945,6 +2945,32 @@ int sev_es_string_io(struct vcpu_svm *sv + count, in); + } + ++static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) ++{ ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ ++ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { ++ bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || ++ guest_cpuid_has(vcpu, X86_FEATURE_RDPID); ++ ++ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); ++ } ++} ++ ++void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) ++{ ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm_cpuid_entry2 *best; ++ ++ /* For sev guests, the memory encryption bit is not reserved in CR3. */ ++ best = kvm_find_cpuid_entry(vcpu, 0x8000001F); ++ if (best) ++ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); ++ ++ if (sev_es_guest(svm->vcpu.kvm)) ++ sev_es_vcpu_after_set_cpuid(svm); ++} ++ + static void sev_es_init_vmcb(struct vcpu_svm *svm) + { + struct kvm_vcpu *vcpu = &svm->vcpu; +@@ -2991,11 +3017,6 @@ static void sev_es_init_vmcb(struct vcpu + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +- +- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && +- (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || +- guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) +- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); + } + + void sev_init_vmcb(struct vcpu_svm *svm) +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4217,7 +4217,6 @@ static bool svm_has_emulated_msr(struct + static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); +- struct kvm_cpuid_entry2 *best; + + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVE) && +@@ -4252,12 +4251,8 @@ static void svm_vcpu_after_set_cpuid(str + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, + !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + +- /* For sev guests, the memory encryption bit is not reserved in CR3. 
*/ +- if (sev_guest(vcpu->kvm)) { +- best = kvm_find_cpuid_entry(vcpu, 0x8000001F); +- if (best) +- vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); +- } ++ if (sev_guest(vcpu->kvm)) ++ sev_vcpu_after_set_cpuid(svm); + + init_vmcb_after_set_cpuid(vcpu); + } +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -733,6 +733,7 @@ void __init sev_hardware_setup(void); + void sev_hardware_unsetup(void); + int sev_cpu_init(struct svm_cpu_data *sd); + void sev_init_vmcb(struct vcpu_svm *svm); ++void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); + void sev_free_vcpu(struct kvm_vcpu *vcpu); + int sev_handle_vmgexit(struct kvm_vcpu *vcpu); + int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); diff --git a/queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch b/queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch new file mode 100644 index 00000000000..38c99a9fca1 --- /dev/null +++ b/queue-6.5/kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch @@ -0,0 +1,38 @@ +From e8d93d5d93f85949e7299be289c6e7e1154b2f78 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 22 Sep 2023 17:06:34 -0400 +Subject: KVM: SVM: INTERCEPT_RDTSCP is never intercepted anyway + +From: Paolo Bonzini + +commit e8d93d5d93f85949e7299be289c6e7e1154b2f78 upstream. + +svm_recalc_instruction_intercepts() is always called at least once +before the vCPU is started, so the setting or clearing of the RDTSCP +intercept can be dropped from the TSC_AUX virtualization support. + +Extracted from a patch by Tom Lendacky. + +Cc: stable@vger.kernel.org +Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts") +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/sev.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2994,11 +2994,8 @@ static void sev_es_init_vmcb(struct vcpu + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && + (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || +- guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { ++ guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); +- if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) +- svm_clr_intercept(svm, INTERCEPT_RDTSCP); +- } + } + + void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch b/queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch new file mode 100644 index 00000000000..10aecfe7443 --- /dev/null +++ b/queue-6.5/kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch @@ -0,0 +1,122 @@ +From 441a5dfcd96854cbcb625709e2694a9c60adfaab Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 21 Sep 2023 05:44:56 -0400 +Subject: KVM: x86/mmu: Do not filter address spaces in for_each_tdp_mmu_root_yield_safe() + +From: Paolo Bonzini + +commit 441a5dfcd96854cbcb625709e2694a9c60adfaab upstream. + +All callers except the MMU notifier want to process all address spaces. +Remove the address space ID argument of for_each_tdp_mmu_root_yield_safe() +and switch the MMU notifier to use __for_each_tdp_mmu_root_yield_safe(). 
+ +Extracted out of a patch by Sean Christopherson + +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 8 ++------ + arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++++++----------- + arch/x86/kvm/mmu/tdp_mmu.h | 3 +-- + 3 files changed, 14 insertions(+), 19 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6294,7 +6294,6 @@ static bool kvm_rmap_zap_gfn_range(struc + void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) + { + bool flush; +- int i; + + if (WARN_ON_ONCE(gfn_end <= gfn_start)) + return; +@@ -6305,11 +6304,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, + + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); + +- if (tdp_mmu_enabled) { +- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) +- flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, +- gfn_end, flush); +- } ++ if (tdp_mmu_enabled) ++ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); + + if (flush) + kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -211,8 +211,12 @@ static struct kvm_mmu_page *tdp_mmu_next + #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) + +-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ +- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) ++#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ ++ for (_root = tdp_mmu_next_root(_kvm, NULL, false, false); \ ++ _root; \ ++ _root = tdp_mmu_next_root(_kvm, _root, false, false)) \ ++ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) { \ ++ } else + + /* + * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, +@@ -877,12 +881,11 @@ static bool tdp_mmu_zap_leafs(struct kvm + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. + */ +-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, +- bool flush) ++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) + { + struct kvm_mmu_page *root; + +- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) ++ for_each_tdp_mmu_root_yield_safe(kvm, root) + flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); + + return flush; +@@ -891,7 +894,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *k + void kvm_tdp_mmu_zap_all(struct kvm *kvm) + { + struct kvm_mmu_page *root; +- int i; + + /* + * Zap all roots, including invalid roots, as all SPTEs must be dropped +@@ -905,10 +907,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm + * is being destroyed or the userspace VMM has exited. In both cases, + * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 
+ */ +- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { +- for_each_tdp_mmu_root_yield_safe(kvm, root, i) +- tdp_mmu_zap_root(kvm, root, false); +- } ++ for_each_tdp_mmu_root_yield_safe(kvm, root) ++ tdp_mmu_zap_root(kvm, root, false); + } + + /* +@@ -1148,7 +1148,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct + { + struct kvm_mmu_page *root; + +- for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id) ++ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false) + flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, + range->may_block, flush); + +--- a/arch/x86/kvm/mmu/tdp_mmu.h ++++ b/arch/x86/kvm/mmu/tdp_mmu.h +@@ -20,8 +20,7 @@ __must_check static inline bool kvm_tdp_ + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); + +-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, +- bool flush); ++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); + bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); + void kvm_tdp_mmu_zap_all(struct kvm *kvm); + void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); diff --git a/queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch b/queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch new file mode 100644 index 00000000000..0b487b4ca6f --- /dev/null +++ b/queue-6.5/kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch @@ -0,0 +1,87 @@ +From 50107e8b2a8a59d8cec7e8454e27c1f8e365acdb Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 15 Sep 2023 17:39:14 -0700 +Subject: KVM: x86/mmu: Open code leaf invalidation from mmu_notifier + +From: Sean Christopherson + +commit 50107e8b2a8a59d8cec7e8454e27c1f8e365acdb upstream. + +The mmu_notifier path is a bit of a special snowflake, e.g. it zaps only a +single address space (because it's per-slot), and can't always yield. +Because of this, it calls kvm_tdp_mmu_zap_leafs() in ways that no one +else does. + +Iterate manually over the leafs in response to an mmu_notifier +invalidation, instead of invoking kvm_tdp_mmu_zap_leafs(). Drop the +@can_yield param from kvm_tdp_mmu_zap_leafs() as its sole remaining +caller unconditionally passes "true". + +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20230916003916.2545000-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 2 +- + arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++++---- + arch/x86/kvm/mmu/tdp_mmu.h | 4 ++-- + 3 files changed, 12 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6308,7 +6308,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, + if (tdp_mmu_enabled) { + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, +- gfn_end, true, flush); ++ gfn_end, flush); + } + + if (flush) +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -878,12 +878,12 @@ static bool tdp_mmu_zap_leafs(struct kvm + * more SPTEs were zapped since the MMU lock was last acquired. 
+ */ + bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, +- bool can_yield, bool flush) ++ bool flush) + { + struct kvm_mmu_page *root; + + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) +- flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); ++ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); + + return flush; + } +@@ -1146,8 +1146,13 @@ retry: + bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush) + { +- return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, +- range->end, range->may_block, flush); ++ struct kvm_mmu_page *root; ++ ++ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id) ++ flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, ++ range->may_block, flush); ++ ++ return flush; + } + + typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, +--- a/arch/x86/kvm/mmu/tdp_mmu.h ++++ b/arch/x86/kvm/mmu/tdp_mmu.h +@@ -20,8 +20,8 @@ __must_check static inline bool kvm_tdp_ + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); + +-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, +- gfn_t end, bool can_yield, bool flush); ++bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, ++ bool flush); + bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); + void kvm_tdp_mmu_zap_all(struct kvm *kvm); + void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); diff --git a/queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch b/queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch new file mode 100644 index 00000000000..27b92e514cc --- /dev/null +++ b/queue-6.5/kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch @@ -0,0 +1,411 @@ +From 0df9dab891ff0d9b646d82e4fe038229e4c02451 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 15 Sep 2023 17:39:15 -0700 +Subject: KVM: x86/mmu: Stop zapping invalidated TDP MMU roots asynchronously + +From: Sean Christopherson + +commit 0df9dab891ff0d9b646d82e4fe038229e4c02451 upstream. + +Stop zapping invalidate TDP MMU roots via work queue now that KVM +preserves TDP MMU roots until they are explicitly invalidated. Zapping +roots asynchronously was effectively a workaround to avoid stalling a vCPU +for an extended during if a vCPU unloaded a root, which at the time +happened whenever the guest toggled CR0.WP (a frequent operation for some +guest kernels). + +While a clever hack, zapping roots via an unbound worker had subtle, +unintended consequences on host scheduling, especially when zapping +multiple roots, e.g. as part of a memslot. Because the work of zapping a +root is no longer bound to the task that initiated the zap, things like +the CPU affinity and priority of the original task get lost. Losing the +affinity and priority can be especially problematic if unbound workqueues +aren't affined to a small number of CPUs, as zapping multiple roots can +cause KVM to heavily utilize the majority of CPUs in the system, *beyond* +the CPUs KVM is already using to run vCPUs. + +When deleting a memslot via KVM_SET_USER_MEMORY_REGION, the async root +zap can result in KVM occupying all logical CPUs for ~8ms, and result in +high priority tasks not being scheduled in in a timely manner. In v5.15, +which doesn't preserve unloaded roots, the issues were even more noticeable +as KVM would zap roots more frequently and could occupy all CPUs for 50ms+. 
+ +Consuming all CPUs for an extended duration can lead to significant jitter +throughout the system, e.g. on ChromeOS with virtio-gpu, deleting memslots +is a semi-frequent operation as memslots are deleted and recreated with +different host virtual addresses to react to host GPU drivers allocating +and freeing GPU blobs. On ChromeOS, the jitter manifests as audio blips +during games due to the audio server's tasks not getting scheduled in +promptly, despite the tasks having a high realtime priority. + +Deleting memslots isn't exactly a fast path and should be avoided when +possible, and ChromeOS is working towards utilizing MAP_FIXED to avoid the +memslot shenanigans, but KVM is squarely in the wrong. Not to mention +that removing the async zapping eliminates a non-trivial amount of +complexity. + +Note, one of the subtle behaviors hidden behind the async zapping is that +KVM would zap invalidated roots only once (ignoring partial zaps from +things like mmu_notifier events). Preserve this behavior by adding a flag +to identify roots that are scheduled to be zapped versus roots that have +already been zapped but not yet freed. + +Add a comment calling out why kvm_tdp_mmu_invalidate_all_roots() can +encounter invalid roots, as it's not at all obvious why zapping +invalidated roots shouldn't simply zap all invalid roots. + +Reported-by: Pattara Teerapong +Cc: David Stevens +Cc: Yiwei Zhang +Cc: Paul Hsia +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20230916003916.2545000-4-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/mmu/mmu.c | 12 --- + arch/x86/kvm/mmu/mmu_internal.h | 15 ++-- + arch/x86/kvm/mmu/tdp_mmu.c | 133 ++++++++++++++++------------------------ + arch/x86/kvm/mmu/tdp_mmu.h | 2 + arch/x86/kvm/x86.c | 5 - + 6 files changed, 68 insertions(+), 102 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1400,7 +1400,6 @@ struct kvm_arch { + * the thread holds the MMU lock in write mode. 
+ */ + spinlock_t tdp_mmu_pages_lock; +- struct workqueue_struct *tdp_mmu_zap_wq; + #endif /* CONFIG_X86_64 */ + + /* +@@ -1814,7 +1813,7 @@ void kvm_mmu_vendor_module_exit(void); + + void kvm_mmu_destroy(struct kvm_vcpu *vcpu); + int kvm_mmu_create(struct kvm_vcpu *vcpu); +-int kvm_mmu_init_vm(struct kvm *kvm); ++void kvm_mmu_init_vm(struct kvm *kvm); + void kvm_mmu_uninit_vm(struct kvm *kvm); + + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6206,21 +6206,17 @@ static void kvm_mmu_invalidate_zap_pages + kvm_mmu_zap_all_fast(kvm); + } + +-int kvm_mmu_init_vm(struct kvm *kvm) ++void kvm_mmu_init_vm(struct kvm *kvm) + { + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; +- int r; + + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + +- if (tdp_mmu_enabled) { +- r = kvm_mmu_init_tdp_mmu(kvm); +- if (r < 0) +- return r; +- } ++ if (tdp_mmu_enabled) ++ kvm_mmu_init_tdp_mmu(kvm); + + node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; +@@ -6233,8 +6229,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; +- +- return 0; + } + + static void mmu_free_vm_memory_caches(struct kvm *kvm) +--- a/arch/x86/kvm/mmu/mmu_internal.h ++++ b/arch/x86/kvm/mmu/mmu_internal.h +@@ -56,7 +56,12 @@ struct kvm_mmu_page { + + bool tdp_mmu_page; + bool unsync; +- u8 mmu_valid_gen; ++ union { ++ u8 mmu_valid_gen; ++ ++ /* Only accessed under slots_lock. */ ++ bool tdp_mmu_scheduled_root_to_zap; ++ }; + + /* + * The shadow page can't be replaced by an equivalent huge page +@@ -98,13 +103,7 @@ struct kvm_mmu_page { + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + tdp_ptep_t ptep; + }; +- union { +- DECLARE_BITMAP(unsync_child_bitmap, 512); +- struct { +- struct work_struct tdp_mmu_async_work; +- void *tdp_mmu_async_data; +- }; +- }; ++ DECLARE_BITMAP(unsync_child_bitmap, 512); + + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -12,18 +12,10 @@ + #include + + /* Initializes the TDP MMU for the VM, if enabled. */ +-int kvm_mmu_init_tdp_mmu(struct kvm *kvm) ++void kvm_mmu_init_tdp_mmu(struct kvm *kvm) + { +- struct workqueue_struct *wq; +- +- wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); +- if (!wq) +- return -ENOMEM; +- + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); +- kvm->arch.tdp_mmu_zap_wq = wq; +- return 1; + } + + /* Arbitrarily returns true so that this may be used in if statements. */ +@@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm * + * ultimately frees all roots. + */ + kvm_tdp_mmu_invalidate_all_roots(kvm); +- +- /* +- * Destroying a workqueue also first flushes the workqueue, i.e. no +- * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). +- */ +- destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); ++ kvm_tdp_mmu_zap_invalidated_roots(kvm); + + WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages +- * can run before the VM is torn down. 
Work items on tdp_mmu_zap_wq +- * can call kvm_tdp_mmu_put_root and create new callbacks. ++ * can run before the VM is torn down. Putting the last reference to ++ * zapped roots will create new callbacks. + */ + rcu_barrier(); + } +@@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback + tdp_mmu_free_sp(sp); + } + +-static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, +- bool shared); +- +-static void tdp_mmu_zap_root_work(struct work_struct *work) +-{ +- struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, +- tdp_mmu_async_work); +- struct kvm *kvm = root->tdp_mmu_async_data; +- +- read_lock(&kvm->mmu_lock); +- +- /* +- * A TLB flush is not necessary as KVM performs a local TLB flush when +- * allocating a new root (see kvm_mmu_load()), and when migrating vCPU +- * to a different pCPU. Note, the local TLB flush on reuse also +- * invalidates any paging-structure-cache entries, i.e. TLB entries for +- * intermediate paging structures, that may be zapped, as such entries +- * are associated with the ASID on both VMX and SVM. +- */ +- tdp_mmu_zap_root(kvm, root, true); +- +- /* +- * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for +- * avoiding an infinite loop. By design, the root is reachable while +- * it's being asynchronously zapped, thus a different task can put its +- * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an +- * asynchronously zapped root is unavoidable. +- */ +- kvm_tdp_mmu_put_root(kvm, root, true); +- +- read_unlock(&kvm->mmu_lock); +-} +- +-static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) +-{ +- root->tdp_mmu_async_data = kvm; +- INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); +- queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); +-} +- + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) + { +@@ -211,11 +158,11 @@ static struct kvm_mmu_page *tdp_mmu_next + #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) + +-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ +- for (_root = tdp_mmu_next_root(_kvm, NULL, false, false); \ ++#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ ++ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ + _root; \ +- _root = tdp_mmu_next_root(_kvm, _root, false, false)) \ +- if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) { \ ++ _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ ++ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ + } else + + /* +@@ -296,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(stru + * by a memslot update or by the destruction of the VM. Initialize the + * refcount to two; one reference for the vCPU, and one reference for + * the TDP MMU itself, which is held until the root is invalidated and +- * is ultimately put by tdp_mmu_zap_root_work(). ++ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). + */ + refcount_set(&root->tdp_mmu_root_count, 2); + +@@ -885,7 +832,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *k + { + struct kvm_mmu_page *root; + +- for_each_tdp_mmu_root_yield_safe(kvm, root) ++ for_each_tdp_mmu_root_yield_safe(kvm, root, false) + flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); + + return flush; +@@ -907,7 +854,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm + * is being destroyed or the userspace VMM has exited. In both cases, + * KVM_RUN is unreachable, i.e. 
no vCPUs will ever service the request. + */ +- for_each_tdp_mmu_root_yield_safe(kvm, root) ++ for_each_tdp_mmu_root_yield_safe(kvm, root, false) + tdp_mmu_zap_root(kvm, root, false); + } + +@@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm + */ + void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) + { +- flush_workqueue(kvm->arch.tdp_mmu_zap_wq); ++ struct kvm_mmu_page *root; ++ ++ read_lock(&kvm->mmu_lock); ++ ++ for_each_tdp_mmu_root_yield_safe(kvm, root, true) { ++ if (!root->tdp_mmu_scheduled_root_to_zap) ++ continue; ++ ++ root->tdp_mmu_scheduled_root_to_zap = false; ++ KVM_BUG_ON(!root->role.invalid, kvm); ++ ++ /* ++ * A TLB flush is not necessary as KVM performs a local TLB ++ * flush when allocating a new root (see kvm_mmu_load()), and ++ * when migrating a vCPU to a different pCPU. Note, the local ++ * TLB flush on reuse also invalidates paging-structure-cache ++ * entries, i.e. TLB entries for intermediate paging structures, ++ * that may be zapped, as such entries are associated with the ++ * ASID on both VMX and SVM. ++ */ ++ tdp_mmu_zap_root(kvm, root, true); ++ ++ /* ++ * The referenced needs to be put *after* zapping the root, as ++ * the root must be reachable by mmu_notifiers while it's being ++ * zapped ++ */ ++ kvm_tdp_mmu_put_root(kvm, root, true); ++ } ++ ++ read_unlock(&kvm->mmu_lock); + } + + /* + * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that + * is about to be zapped, e.g. in response to a memslots update. The actual +- * zapping is performed asynchronously. Using a separate workqueue makes it +- * easy to ensure that the destruction is performed before the "fast zap" +- * completes, without keeping a separate list of invalidated roots; the list is +- * effectively the list of work items in the workqueue. ++ * zapping is done separately so that it happens with mmu_lock with read, ++ * whereas invalidating roots must be done with mmu_lock held for write (unless ++ * the VM is being destroyed). + * +- * Note, the asynchronous worker is gifted the TDP MMU's reference. ++ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. + * See kvm_tdp_mmu_get_vcpu_root_hpa(). + */ + void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +@@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(st + /* + * As above, mmu_lock isn't held when destroying the VM! There can't + * be other references to @kvm, i.e. nothing else can invalidate roots +- * or be consuming roots, but walking the list of roots does need to be +- * guarded against roots being deleted by the asynchronous zap worker. ++ * or get/put references to roots. + */ +- rcu_read_lock(); +- +- list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { ++ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { ++ /* ++ * Note, invalid roots can outlive a memslot update! Invalid ++ * roots must be *zapped* before the memslot update completes, ++ * but a different task can acquire a reference and keep the ++ * root alive after its been zapped. 
++ */ + if (!root->role.invalid) { ++ root->tdp_mmu_scheduled_root_to_zap = true; + root->role.invalid = true; +- tdp_mmu_schedule_zap_root(kvm, root); + } + } +- +- rcu_read_unlock(); + } + + /* +--- a/arch/x86/kvm/mmu/tdp_mmu.h ++++ b/arch/x86/kvm/mmu/tdp_mmu.h +@@ -7,7 +7,7 @@ + + #include "spte.h" + +-int kvm_mmu_init_tdp_mmu(struct kvm *kvm); ++void kvm_mmu_init_tdp_mmu(struct kvm *kvm); + void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -12302,9 +12302,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un + if (ret) + goto out; + +- ret = kvm_mmu_init_vm(kvm); +- if (ret) +- goto out_page_track; ++ kvm_mmu_init_vm(kvm); + + ret = static_call(kvm_x86_vm_init)(kvm); + if (ret) +@@ -12349,7 +12347,6 @@ int kvm_arch_init_vm(struct kvm *kvm, un + + out_uninit_mmu: + kvm_mmu_uninit_vm(kvm); +-out_page_track: + kvm_page_track_cleanup(kvm); + out: + return ret; diff --git a/queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch b/queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch new file mode 100644 index 00000000000..caf3e4be5c9 --- /dev/null +++ b/queue-6.5/misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch @@ -0,0 +1,517 @@ +From 0e4cac557531a4c93de108d9ff11329fcad482ff Mon Sep 17 00:00:00 2001 +From: Ricky WU +Date: Wed, 20 Sep 2023 09:11:19 +0000 +Subject: misc: rtsx: Fix some platforms can not boot and move the l1ss judgment to probe + +From: Ricky WU + +commit 0e4cac557531a4c93de108d9ff11329fcad482ff upstream. + +commit 101bd907b424 ("misc: rtsx: judge ASPM Mode to set PETXCFG Reg") +some readers no longer force #CLKREQ to low +when the system need to enter ASPM. +But some platform maybe not implement complete ASPM? +it causes some platforms can not boot + +Like in the past only the platform support L1ss we release the #CLKREQ. 
+Move the judgment (L1ss) to probe, +we think read config space one time when the driver start is enough + +Fixes: 101bd907b424 ("misc: rtsx: judge ASPM Mode to set PETXCFG Reg") +Cc: stable +Reported-by: Paul Grandperrin +Signed-off-by: Ricky Wu +Tested-By: Jade Lovelace +Link: https://lore.kernel.org/r/37b1afb997f14946a8784c73d1f9a4f5@realtek.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/misc/cardreader/rts5227.c | 55 +++-------------------------------- + drivers/misc/cardreader/rts5228.c | 57 +++++++++++-------------------------- + drivers/misc/cardreader/rts5249.c | 56 ++++-------------------------------- + drivers/misc/cardreader/rts5260.c | 43 ++++++++------------------- + drivers/misc/cardreader/rts5261.c | 52 ++++++++------------------------- + drivers/misc/cardreader/rtsx_pcr.c | 51 +++++++++++++++++++++++++++++---- + 6 files changed, 102 insertions(+), 212 deletions(-) + +--- a/drivers/misc/cardreader/rts5227.c ++++ b/drivers/misc/cardreader/rts5227.c +@@ -83,63 +83,20 @@ static void rts5227_fetch_vendor_setting + + static void rts5227_init_from_cfg(struct rtsx_pcr *pcr) + { +- struct pci_dev *pdev = pcr->pci; +- int l1ss; +- u32 lval; + struct rtsx_cr_option *option = &pcr->option; + +- l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +- if (!l1ss) +- return; +- +- pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval); +- + if (CHK_PCI_PID(pcr, 0x522A)) { +- if (0 == (lval & 0x0F)) +- rtsx_pci_enable_oobs_polling(pcr); +- else ++ if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN ++ | PM_L1_1_EN | PM_L1_2_EN)) + rtsx_pci_disable_oobs_polling(pcr); ++ else ++ rtsx_pci_enable_oobs_polling(pcr); + } + +- if (lval & PCI_L1SS_CTL1_ASPM_L1_1) +- rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_2) +- rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) +- rtsx_set_dev_flag(pcr, PM_L1_1_EN); +- else +- rtsx_clear_dev_flag(pcr, PM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) +- rtsx_set_dev_flag(pcr, PM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, PM_L1_2_EN); +- + if (option->ltr_en) { +- u16 val; +- +- pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &val); +- if (val & PCI_EXP_DEVCTL2_LTR_EN) { +- option->ltr_enabled = true; +- option->ltr_active = true; ++ if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); +- } else { +- option->ltr_enabled = false; +- } + } +- +- if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN +- | PM_L1_1_EN | PM_L1_2_EN)) +- option->force_clkreq_0 = false; +- else +- option->force_clkreq_0 = true; +- + } + + static int rts5227_extra_init_hw(struct rtsx_pcr *pcr) +@@ -195,7 +152,7 @@ static int rts5227_extra_init_hw(struct + } + } + +- if (option->force_clkreq_0 && pcr->aspm_mode == ASPM_MODE_CFG) ++ if (option->force_clkreq_0) + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, + FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); + else +--- a/drivers/misc/cardreader/rts5228.c ++++ b/drivers/misc/cardreader/rts5228.c +@@ -386,59 +386,25 @@ static void rts5228_process_ocp(struct r + + static void rts5228_init_from_cfg(struct rtsx_pcr *pcr) + { +- struct pci_dev *pdev = pcr->pci; +- int l1ss; +- u32 lval; + struct rtsx_cr_option *option = &pcr->option; + +- l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +- if (!l1ss) +- return; +- +- pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval); +- +- if (0 == (lval & 0x0F)) +- 
rtsx_pci_enable_oobs_polling(pcr); +- else ++ if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN ++ | PM_L1_1_EN | PM_L1_2_EN)) + rtsx_pci_disable_oobs_polling(pcr); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_1) +- rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_2) +- rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) +- rtsx_set_dev_flag(pcr, PM_L1_1_EN); + else +- rtsx_clear_dev_flag(pcr, PM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) +- rtsx_set_dev_flag(pcr, PM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, PM_L1_2_EN); ++ rtsx_pci_enable_oobs_polling(pcr); + + rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0xFF, 0); +- if (option->ltr_en) { +- u16 val; + +- pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &val); +- if (val & PCI_EXP_DEVCTL2_LTR_EN) { +- option->ltr_enabled = true; +- option->ltr_active = true; ++ if (option->ltr_en) { ++ if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); +- } else { +- option->ltr_enabled = false; +- } + } + } + + static int rts5228_extra_init_hw(struct rtsx_pcr *pcr) + { ++ struct rtsx_cr_option *option = &pcr->option; + + rtsx_pci_write_register(pcr, RTS5228_AUTOLOAD_CFG1, + CD_RESUME_EN_MASK, CD_RESUME_EN_MASK); +@@ -469,6 +435,17 @@ static int rts5228_extra_init_hw(struct + else + rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + ++ /* ++ * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced ++ * to drive low, and we forcibly request clock. ++ */ ++ if (option->force_clkreq_0) ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); ++ else ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH); ++ + rtsx_pci_write_register(pcr, PWD_SUSPEND_EN, 0xFF, 0xFB); + + if (pcr->rtd3_en) { +--- a/drivers/misc/cardreader/rts5249.c ++++ b/drivers/misc/cardreader/rts5249.c +@@ -86,64 +86,22 @@ static void rtsx_base_fetch_vendor_setti + + static void rts5249_init_from_cfg(struct rtsx_pcr *pcr) + { +- struct pci_dev *pdev = pcr->pci; +- int l1ss; + struct rtsx_cr_option *option = &(pcr->option); +- u32 lval; +- +- l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +- if (!l1ss) +- return; +- +- pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval); + + if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) { +- if (0 == (lval & 0x0F)) +- rtsx_pci_enable_oobs_polling(pcr); +- else ++ if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN ++ | PM_L1_1_EN | PM_L1_2_EN)) + rtsx_pci_disable_oobs_polling(pcr); ++ else ++ rtsx_pci_enable_oobs_polling(pcr); + } + +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_1) +- rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_2) +- rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) +- rtsx_set_dev_flag(pcr, PM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) +- rtsx_set_dev_flag(pcr, PM_L1_2_EN); +- + if (option->ltr_en) { +- u16 val; +- +- pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val); +- if (val & PCI_EXP_DEVCTL2_LTR_EN) { +- option->ltr_enabled = true; +- option->ltr_active = true; ++ if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); +- } else { +- option->ltr_enabled = false; +- } + } + } + +-static int rts5249_init_from_hw(struct rtsx_pcr *pcr) +-{ +- struct rtsx_cr_option *option = &(pcr->option); +- +- if (rtsx_check_dev_flag(pcr, 
ASPM_L1_1_EN | ASPM_L1_2_EN +- | PM_L1_1_EN | PM_L1_2_EN)) +- option->force_clkreq_0 = false; +- else +- option->force_clkreq_0 = true; +- +- return 0; +-} +- + static void rts52xa_force_power_down(struct rtsx_pcr *pcr, u8 pm_state, bool runtime) + { + /* Set relink_time to 0 */ +@@ -276,7 +234,6 @@ static int rts5249_extra_init_hw(struct + struct rtsx_cr_option *option = &(pcr->option); + + rts5249_init_from_cfg(pcr); +- rts5249_init_from_hw(pcr); + + rtsx_pci_init_cmd(pcr); + +@@ -327,11 +284,12 @@ static int rts5249_extra_init_hw(struct + } + } + ++ + /* + * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced + * to drive low, and we forcibly request clock. + */ +- if (option->force_clkreq_0 && pcr->aspm_mode == ASPM_MODE_CFG) ++ if (option->force_clkreq_0) + rtsx_pci_write_register(pcr, PETXCFG, + FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); + else +--- a/drivers/misc/cardreader/rts5260.c ++++ b/drivers/misc/cardreader/rts5260.c +@@ -480,47 +480,19 @@ static void rts5260_pwr_saving_setting(s + + static void rts5260_init_from_cfg(struct rtsx_pcr *pcr) + { +- struct pci_dev *pdev = pcr->pci; +- int l1ss; + struct rtsx_cr_option *option = &pcr->option; +- u32 lval; +- +- l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +- if (!l1ss) +- return; +- +- pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_1) +- rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_2) +- rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) +- rtsx_set_dev_flag(pcr, PM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) +- rtsx_set_dev_flag(pcr, PM_L1_2_EN); + + rts5260_pwr_saving_setting(pcr); + + if (option->ltr_en) { +- u16 val; +- +- pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val); +- if (val & PCI_EXP_DEVCTL2_LTR_EN) { +- option->ltr_enabled = true; +- option->ltr_active = true; ++ if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); +- } else { +- option->ltr_enabled = false; +- } + } + } + + static int rts5260_extra_init_hw(struct rtsx_pcr *pcr) + { ++ struct rtsx_cr_option *option = &pcr->option; + + /* Set mcu_cnt to 7 to ensure data can be sampled properly */ + rtsx_pci_write_register(pcr, 0xFC03, 0x7F, 0x07); +@@ -539,6 +511,17 @@ static int rts5260_extra_init_hw(struct + + rts5260_init_hw(pcr); + ++ /* ++ * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced ++ * to drive low, and we forcibly request clock. 
++ */ ++ if (option->force_clkreq_0) ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); ++ else ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH); ++ + rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x10, 0x00); + + return 0; +--- a/drivers/misc/cardreader/rts5261.c ++++ b/drivers/misc/cardreader/rts5261.c +@@ -454,54 +454,17 @@ static void rts5261_init_from_hw(struct + + static void rts5261_init_from_cfg(struct rtsx_pcr *pcr) + { +- struct pci_dev *pdev = pcr->pci; +- int l1ss; +- u32 lval; + struct rtsx_cr_option *option = &pcr->option; + +- l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +- if (!l1ss) +- return; +- +- pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, &lval); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_1) +- rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_ASPM_L1_2) +- rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) +- rtsx_set_dev_flag(pcr, PM_L1_1_EN); +- else +- rtsx_clear_dev_flag(pcr, PM_L1_1_EN); +- +- if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) +- rtsx_set_dev_flag(pcr, PM_L1_2_EN); +- else +- rtsx_clear_dev_flag(pcr, PM_L1_2_EN); +- +- rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0xFF, 0); + if (option->ltr_en) { +- u16 val; +- +- pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &val); +- if (val & PCI_EXP_DEVCTL2_LTR_EN) { +- option->ltr_enabled = true; +- option->ltr_active = true; ++ if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); +- } else { +- option->ltr_enabled = false; +- } + } + } + + static int rts5261_extra_init_hw(struct rtsx_pcr *pcr) + { ++ struct rtsx_cr_option *option = &pcr->option; + u32 val; + + rtsx_pci_write_register(pcr, RTS5261_AUTOLOAD_CFG1, +@@ -547,6 +510,17 @@ static int rts5261_extra_init_hw(struct + else + rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + ++ /* ++ * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced ++ * to drive low, and we forcibly request clock. ++ */ ++ if (option->force_clkreq_0) ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); ++ else ++ rtsx_pci_write_register(pcr, PETXCFG, ++ FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH); ++ + rtsx_pci_write_register(pcr, PWD_SUSPEND_EN, 0xFF, 0xFB); + + if (pcr->rtd3_en) { +--- a/drivers/misc/cardreader/rtsx_pcr.c ++++ b/drivers/misc/cardreader/rtsx_pcr.c +@@ -1326,11 +1326,8 @@ static int rtsx_pci_init_hw(struct rtsx_ + return err; + } + +- if (pcr->aspm_mode == ASPM_MODE_REG) { ++ if (pcr->aspm_mode == ASPM_MODE_REG) + rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0x30, 0x30); +- rtsx_pci_write_register(pcr, PETXCFG, +- FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH); +- } + + /* No CD interrupt if probing driver with card inserted. + * So we need to initialize pcr->card_exist here. 
+@@ -1345,7 +1342,9 @@ static int rtsx_pci_init_hw(struct rtsx_ + + static int rtsx_pci_init_chip(struct rtsx_pcr *pcr) + { +- int err; ++ struct rtsx_cr_option *option = &(pcr->option); ++ int err, l1ss; ++ u32 lval; + u16 cfg_val; + u8 val; + +@@ -1430,6 +1429,48 @@ static int rtsx_pci_init_chip(struct rts + pcr->aspm_enabled = true; + } + ++ l1ss = pci_find_ext_capability(pcr->pci, PCI_EXT_CAP_ID_L1SS); ++ if (l1ss) { ++ pci_read_config_dword(pcr->pci, l1ss + PCI_L1SS_CTL1, &lval); ++ ++ if (lval & PCI_L1SS_CTL1_ASPM_L1_1) ++ rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); ++ else ++ rtsx_clear_dev_flag(pcr, ASPM_L1_1_EN); ++ ++ if (lval & PCI_L1SS_CTL1_ASPM_L1_2) ++ rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); ++ else ++ rtsx_clear_dev_flag(pcr, ASPM_L1_2_EN); ++ ++ if (lval & PCI_L1SS_CTL1_PCIPM_L1_1) ++ rtsx_set_dev_flag(pcr, PM_L1_1_EN); ++ else ++ rtsx_clear_dev_flag(pcr, PM_L1_1_EN); ++ ++ if (lval & PCI_L1SS_CTL1_PCIPM_L1_2) ++ rtsx_set_dev_flag(pcr, PM_L1_2_EN); ++ else ++ rtsx_clear_dev_flag(pcr, PM_L1_2_EN); ++ ++ pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &cfg_val); ++ if (cfg_val & PCI_EXP_DEVCTL2_LTR_EN) { ++ option->ltr_enabled = true; ++ option->ltr_active = true; ++ } else { ++ option->ltr_enabled = false; ++ } ++ ++ if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN ++ | PM_L1_1_EN | PM_L1_2_EN)) ++ option->force_clkreq_0 = false; ++ else ++ option->force_clkreq_0 = true; ++ } else { ++ option->ltr_enabled = false; ++ option->force_clkreq_0 = true; ++ } ++ + if (pcr->ops->fetch_vendor_settings) + pcr->ops->fetch_vendor_settings(pcr); + diff --git a/queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch b/queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch new file mode 100644 index 00000000000..55dced99975 --- /dev/null +++ b/queue-6.5/mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch @@ -0,0 +1,50 @@ +From 6bec041147a2a64a490d1f813e8a004443061b38 Mon Sep 17 00:00:00 2001 +From: Paolo Abeni +Date: Sat, 16 Sep 2023 12:52:45 +0200 +Subject: mptcp: fix bogus receive window shrinkage with multiple subflows + +From: Paolo Abeni + +commit 6bec041147a2a64a490d1f813e8a004443061b38 upstream. + +In case multiple subflows race to update the mptcp-level receive +window, the subflow losing the race should use the window value +provided by the "winning" subflow to update it's own tcp-level +rcv_wnd. + +To such goal, the current code bogusly uses the mptcp-level rcv_wnd +value as observed before the update attempt. On unlucky circumstances +that may lead to TCP-level window shrinkage, and stall the other end. + +Address the issue feeding to the rcv wnd update the correct value. + +Fixes: f3589be0c420 ("mptcp: never shrink offered window") +Cc: stable@vger.kernel.org +Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/427 +Signed-off-by: Paolo Abeni +Reviewed-by: Mat Martineau +Signed-off-by: Matthieu Baerts +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/options.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/net/mptcp/options.c ++++ b/net/mptcp/options.c +@@ -1269,12 +1269,13 @@ static void mptcp_set_rwin(struct tcp_so + + if (rcv_wnd == rcv_wnd_old) + break; +- if (before64(rcv_wnd_new, rcv_wnd)) { ++ ++ rcv_wnd_old = rcv_wnd; ++ if (before64(rcv_wnd_new, rcv_wnd_old)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); + goto raise_win; + } + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); +- rcv_wnd_old = rcv_wnd; + } + return; + } diff --git a/queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch b/queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch new file mode 100644 index 00000000000..5c01d7e9131 --- /dev/null +++ b/queue-6.5/mptcp-move-__mptcp_error_report-in-protocol.c.patch @@ -0,0 +1,115 @@ +From d5fbeff1ab812b6c473b6924bee8748469462e2c Mon Sep 17 00:00:00 2001 +From: Paolo Abeni +Date: Sat, 16 Sep 2023 12:52:46 +0200 +Subject: mptcp: move __mptcp_error_report in protocol.c + +From: Paolo Abeni + +commit d5fbeff1ab812b6c473b6924bee8748469462e2c upstream. + +This will simplify the next patch ("mptcp: process pending subflow error +on close"). + +No functional change intended. + +Cc: stable@vger.kernel.org # v5.12+ +Signed-off-by: Paolo Abeni +Reviewed-by: Mat Martineau +Signed-off-by: Matthieu Baerts +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/protocol.c | 36 ++++++++++++++++++++++++++++++++++++ + net/mptcp/subflow.c | 36 ------------------------------------ + 2 files changed, 36 insertions(+), 36 deletions(-) + +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -772,6 +772,42 @@ static bool __mptcp_ofo_queue(struct mpt + return moved; + } + ++void __mptcp_error_report(struct sock *sk) ++{ ++ struct mptcp_subflow_context *subflow; ++ struct mptcp_sock *msk = mptcp_sk(sk); ++ ++ mptcp_for_each_subflow(msk, subflow) { ++ struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ++ int err = sock_error(ssk); ++ int ssk_state; ++ ++ if (!err) ++ continue; ++ ++ /* only propagate errors on fallen-back sockets or ++ * on MPC connect ++ */ ++ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) ++ continue; ++ ++ /* We need to propagate only transition to CLOSE state. ++ * Orphaned socket will see such state change via ++ * subflow_sched_work_if_closed() and that path will properly ++ * destroy the msk as needed. ++ */ ++ ssk_state = inet_sk_state_load(ssk); ++ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) ++ inet_sk_state_store(sk, ssk_state); ++ WRITE_ONCE(sk->sk_err, -err); ++ ++ /* This barrier is coupled with smp_rmb() in mptcp_poll() */ ++ smp_wmb(); ++ sk_error_report(sk); ++ break; ++ } ++} ++ + /* In most cases we will be able to lock the mptcp socket. If its already + * owned, we need to defer to the work queue to avoid ABBA deadlock. 
+ */ +--- a/net/mptcp/subflow.c ++++ b/net/mptcp/subflow.c +@@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk, + *full_space = tcp_full_space(sk); + } + +-void __mptcp_error_report(struct sock *sk) +-{ +- struct mptcp_subflow_context *subflow; +- struct mptcp_sock *msk = mptcp_sk(sk); +- +- mptcp_for_each_subflow(msk, subflow) { +- struct sock *ssk = mptcp_subflow_tcp_sock(subflow); +- int err = sock_error(ssk); +- int ssk_state; +- +- if (!err) +- continue; +- +- /* only propagate errors on fallen-back sockets or +- * on MPC connect +- */ +- if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) +- continue; +- +- /* We need to propagate only transition to CLOSE state. +- * Orphaned socket will see such state change via +- * subflow_sched_work_if_closed() and that path will properly +- * destroy the msk as needed. +- */ +- ssk_state = inet_sk_state_load(ssk); +- if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) +- inet_sk_state_store(sk, ssk_state); +- WRITE_ONCE(sk->sk_err, -err); +- +- /* This barrier is coupled with smp_rmb() in mptcp_poll() */ +- smp_wmb(); +- sk_error_report(sk); +- break; +- } +-} +- + static void subflow_error_report(struct sock *ssk) + { + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; diff --git a/queue-6.5/mptcp-process-pending-subflow-error-on-close.patch b/queue-6.5/mptcp-process-pending-subflow-error-on-close.patch new file mode 100644 index 00000000000..ca961156d29 --- /dev/null +++ b/queue-6.5/mptcp-process-pending-subflow-error-on-close.patch @@ -0,0 +1,113 @@ +From 9f1a98813b4b686482e5ef3c9d998581cace0ba6 Mon Sep 17 00:00:00 2001 +From: Paolo Abeni +Date: Sat, 16 Sep 2023 12:52:47 +0200 +Subject: mptcp: process pending subflow error on close + +From: Paolo Abeni + +commit 9f1a98813b4b686482e5ef3c9d998581cace0ba6 upstream. + +On incoming TCP reset, subflow closing could happen before error +propagation. That in turn could cause the socket error being ignored, +and a missing socket state transition, as reported by Daire-Byrne. + +Address the issues explicitly checking for subflow socket error at +close time. To avoid code duplication, factor-out of __mptcp_error_report() +a new helper implementing the relevant bits. + +Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/429 +Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Abeni +Reviewed-by: Mat Martineau +Signed-off-by: Matthieu Baerts +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/protocol.c | 63 +++++++++++++++++++++++++++------------------------ + 1 file changed, 34 insertions(+), 29 deletions(-) + +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -772,40 +772,44 @@ static bool __mptcp_ofo_queue(struct mpt + return moved; + } + +-void __mptcp_error_report(struct sock *sk) ++static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) + { +- struct mptcp_subflow_context *subflow; +- struct mptcp_sock *msk = mptcp_sk(sk); ++ int err = sock_error(ssk); ++ int ssk_state; + +- mptcp_for_each_subflow(msk, subflow) { +- struct sock *ssk = mptcp_subflow_tcp_sock(subflow); +- int err = sock_error(ssk); +- int ssk_state; ++ if (!err) ++ return false; + +- if (!err) +- continue; ++ /* only propagate errors on fallen-back sockets or ++ * on MPC connect ++ */ ++ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) ++ return false; + +- /* only propagate errors on fallen-back sockets or +- * on MPC connect +- */ +- if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) +- continue; ++ /* We need to propagate only transition to CLOSE state. ++ * Orphaned socket will see such state change via ++ * subflow_sched_work_if_closed() and that path will properly ++ * destroy the msk as needed. ++ */ ++ ssk_state = inet_sk_state_load(ssk); ++ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) ++ inet_sk_state_store(sk, ssk_state); ++ WRITE_ONCE(sk->sk_err, -err); ++ ++ /* This barrier is coupled with smp_rmb() in mptcp_poll() */ ++ smp_wmb(); ++ sk_error_report(sk); ++ return true; ++} + +- /* We need to propagate only transition to CLOSE state. +- * Orphaned socket will see such state change via +- * subflow_sched_work_if_closed() and that path will properly +- * destroy the msk as needed. +- */ +- ssk_state = inet_sk_state_load(ssk); +- if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) +- inet_sk_state_store(sk, ssk_state); +- WRITE_ONCE(sk->sk_err, -err); +- +- /* This barrier is coupled with smp_rmb() in mptcp_poll() */ +- smp_wmb(); +- sk_error_report(sk); +- break; +- } ++void __mptcp_error_report(struct sock *sk) ++{ ++ struct mptcp_subflow_context *subflow; ++ struct mptcp_sock *msk = mptcp_sk(sk); ++ ++ mptcp_for_each_subflow(msk, subflow) ++ if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) ++ break; + } + + /* In most cases we will be able to lock the mptcp socket. If its already +@@ -2417,6 +2421,7 @@ static void __mptcp_close_ssk(struct soc + } + + out_release: ++ __mptcp_subflow_error_report(sk, ssk); + release_sock(ssk); + + sock_put(ssk); diff --git a/queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch b/queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch new file mode 100644 index 00000000000..095d33a0d7a --- /dev/null +++ b/queue-6.5/nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch @@ -0,0 +1,61 @@ +From 7ee29facd8a9c5a26079148e36bcf07141b3a6bc Mon Sep 17 00:00:00 2001 +From: Pan Bian +Date: Thu, 21 Sep 2023 23:17:31 +0900 +Subject: nilfs2: fix potential use after free in nilfs_gccache_submit_read_data() + +From: Pan Bian + +commit 7ee29facd8a9c5a26079148e36bcf07141b3a6bc upstream. + +In nilfs_gccache_submit_read_data(), brelse(bh) is called to drop the +reference count of bh when the call to nilfs_dat_translate() fails. If +the reference count hits 0 and its owner page gets unlocked, bh may be +freed. 
However, bh->b_page is dereferenced to put the page after that, +which may result in a use-after-free bug. This patch moves the release +operation after unlocking and putting the page. + +NOTE: The function in question is only called in GC, and in combination +with current userland tools, address translation using DAT does not occur +in that function, so the code path that causes this issue will not be +executed. However, it is possible to run that code path by intentionally +modifying the userland GC library or by calling the GC ioctl directly. + +[konishi.ryusuke@gmail.com: NOTE added to the commit log] +Link: https://lkml.kernel.org/r/1543201709-53191-1-git-send-email-bianpan2016@163.com +Link: https://lkml.kernel.org/r/20230921141731.10073-1-konishi.ryusuke@gmail.com +Fixes: a3d93f709e89 ("nilfs2: block cache for garbage collection") +Signed-off-by: Pan Bian +Reported-by: Ferry Meng +Closes: https://lkml.kernel.org/r/20230818092022.111054-1-mengferry@linux.alibaba.com +Signed-off-by: Ryusuke Konishi +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/gcinode.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/nilfs2/gcinode.c ++++ b/fs/nilfs2/gcinode.c +@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struc + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); +- if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ +- brelse(bh); ++ if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */ + goto failed; +- } + } + + lock_buffer(bh); +@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struc + failed: + unlock_page(bh->b_page); + put_page(bh->b_page); ++ if (unlikely(err)) ++ brelse(bh); + return err; + } + diff --git a/queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch b/queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch new file mode 100644 index 00000000000..1dde9061fab --- /dev/null +++ b/queue-6.5/revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch @@ -0,0 +1,68 @@ +From 29346e217b8ab8a52889b88f00b268278d6b7668 Mon Sep 17 00:00:00 2001 +From: Daniel Starke +Date: Thu, 14 Sep 2023 07:15:07 +0200 +Subject: Revert "tty: n_gsm: fix UAF in gsm_cleanup_mux" + +From: Daniel Starke + +commit 29346e217b8ab8a52889b88f00b268278d6b7668 upstream. + +This reverts commit 9b9c8195f3f0d74a826077fc1c01b9ee74907239. + +The commit above is reverted as it did not solve the original issue. + +gsm_cleanup_mux() tries to free up the virtual ttys by calling +gsm_dlci_release() for each available DLCI. There, dlci_put() is called to +decrease the reference counter for the DLCI via tty_port_put() which +finally calls gsm_dlci_free(). This already clears the pointer which is +being checked in gsm_cleanup_mux() before calling gsm_dlci_release(). +Therefore, it is not necessary to clear this pointer in gsm_cleanup_mux() +as done in the reverted commit. The commit introduces a null pointer +dereference: + + ? __die+0x1f/0x70 + ? page_fault_oops+0x156/0x420 + ? search_exception_tables+0x37/0x50 + ? fixup_exception+0x21/0x310 + ? exc_page_fault+0x69/0x150 + ? asm_exc_page_fault+0x26/0x30 + ? tty_port_put+0x19/0xa0 + gsmtty_cleanup+0x29/0x80 [n_gsm] + release_one_tty+0x37/0xe0 + process_one_work+0x1e6/0x3e0 + worker_thread+0x4c/0x3d0 + ? __pfx_worker_thread+0x10/0x10 + kthread+0xe1/0x110 + ? __pfx_kthread+0x10/0x10 + ret_from_fork+0x2f/0x50 + ? 
__pfx_kthread+0x10/0x10 + ret_from_fork_asm+0x1b/0x30 + + +The actual issue is that nothing guards dlci_put() from being called +multiple times while the tty driver was triggered but did not yet finished +calling gsm_dlci_free(). + +Fixes: 9b9c8195f3f0 ("tty: n_gsm: fix UAF in gsm_cleanup_mux") +Cc: stable +Signed-off-by: Daniel Starke +Link: https://lore.kernel.org/r/20230914051507.3240-1-daniel.starke@siemens.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/tty/n_gsm.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/drivers/tty/n_gsm.c ++++ b/drivers/tty/n_gsm.c +@@ -3071,10 +3071,8 @@ static void gsm_cleanup_mux(struct gsm_m + gsm->has_devices = false; + } + for (i = NUM_DLCI - 1; i >= 0; i--) +- if (gsm->dlci[i]) { ++ if (gsm->dlci[i]) + gsm_dlci_release(gsm->dlci[i]); +- gsm->dlci[i] = NULL; +- } + mutex_unlock(&gsm->mutex); + /* Now wipe the queues */ + tty_ldisc_flush(gsm->tty); diff --git a/queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch b/queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch new file mode 100644 index 00000000000..ef3bc5f5575 --- /dev/null +++ b/queue-6.5/scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch @@ -0,0 +1,101 @@ +From 2132df16f53b4f01ab25f5d404f36a22244ae342 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Fri, 15 Sep 2023 11:20:34 +0900 +Subject: scsi: core: ata: Do no try to probe for CDL on old drives + +From: Damien Le Moal + +commit 2132df16f53b4f01ab25f5d404f36a22244ae342 upstream. + +Some old drives (e.g. an Ultra320 SCSI disk as reported by John) do not +seem to execute MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES +commands correctly and hang when a non-zero service action is specified +(one command format with service action case in scsi_report_opcode()). + +Currently, CDL probing with scsi_cdl_check_cmd() is the only caller using a +non zero service action for scsi_report_opcode(). To avoid issues with +these old drives, do not attempt CDL probe if the device reports support +for an SPC version lower than 5 (CDL was introduced in SPC-5). To keep +things working with ATA devices which probe for the CDL T2A and T2B pages +introduced with SPC-6, modify ata_scsiop_inq_std() to claim SPC-6 version +compatibility for ATA drives supporting CDL. + +SPC-6 standard version number is defined as Dh (= 13) in SPC-6 r09. Fix +scsi_probe_lun() to correctly capture this value by changing the bit mask +for the second byte of the INQUIRY response from 0x7 to 0xf. +include/scsi/scsi.h is modified to add the definition SCSI_SPC_6 with the +value 14 (Dh + 1). The missing definitions for the SCSI_SPC_4 and +SCSI_SPC_5 versions are also added. + +Reported-by: John David Anglin +Fixes: 624885209f31 ("scsi: core: Detect support for command duration limits") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Link: https://lore.kernel.org/r/20230915022034.678121-1-dlemoal@kernel.org +Tested-by: David Gow +Reviewed-by: Bart Van Assche +Reviewed-by: Niklas Cassel +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/libata-scsi.c | 3 +++ + drivers/scsi/scsi.c | 11 +++++++++++ + drivers/scsi/scsi_scan.c | 2 +- + include/scsi/scsi.h | 3 +++ + 4 files changed, 18 insertions(+), 1 deletion(-) + +--- a/drivers/ata/libata-scsi.c ++++ b/drivers/ata/libata-scsi.c +@@ -1892,6 +1892,9 @@ static unsigned int ata_scsiop_inq_std(s + hdr[2] = 0x7; /* claim SPC-5 version compatibility */ + } + ++ if (args->dev->flags & ATA_DFLAG_CDL) ++ hdr[2] = 0xd; /* claim SPC-6 version compatibility */ ++ + memcpy(rbuf, hdr, sizeof(hdr)); + memcpy(&rbuf[8], "ATA ", 8); + ata_id_string(args->id, &rbuf[16], ATA_ID_PROD, 16); +--- a/drivers/scsi/scsi.c ++++ b/drivers/scsi/scsi.c +@@ -613,6 +613,17 @@ void scsi_cdl_check(struct scsi_device * + bool cdl_supported; + unsigned char *buf; + ++ /* ++ * Support for CDL was defined in SPC-5. Ignore devices reporting an ++ * lower SPC version. This also avoids problems with old drives choking ++ * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a ++ * service action specified, as done in scsi_cdl_check_cmd(). ++ */ ++ if (sdev->scsi_level < SCSI_SPC_5) { ++ sdev->cdl_supported = 0; ++ return; ++ } ++ + buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); + if (!buf) { + sdev->cdl_supported = 0; +--- a/drivers/scsi/scsi_scan.c ++++ b/drivers/scsi/scsi_scan.c +@@ -822,7 +822,7 @@ static int scsi_probe_lun(struct scsi_de + * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so + * non-zero LUNs can be scanned. + */ +- sdev->scsi_level = inq_result[2] & 0x07; ++ sdev->scsi_level = inq_result[2] & 0x0f; + if (sdev->scsi_level >= 2 || + (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1)) + sdev->scsi_level++; +--- a/include/scsi/scsi.h ++++ b/include/scsi/scsi.h +@@ -157,6 +157,9 @@ enum scsi_disposition { + #define SCSI_3 4 /* SPC */ + #define SCSI_SPC_2 5 + #define SCSI_SPC_3 6 ++#define SCSI_SPC_4 7 ++#define SCSI_SPC_5 8 ++#define SCSI_SPC_6 14 + + /* + * INQ PERIPHERAL QUALIFIERS diff --git a/queue-6.5/serial-8250_port-check-irq-data-before-use.patch b/queue-6.5/serial-8250_port-check-irq-data-before-use.patch new file mode 100644 index 00000000000..7f79fbf2eb4 --- /dev/null +++ b/queue-6.5/serial-8250_port-check-irq-data-before-use.patch @@ -0,0 +1,49 @@ +From cce7fc8b29961b64fadb1ce398dc5ff32a79643b Mon Sep 17 00:00:00 2001 +From: Andy Shevchenko +Date: Fri, 1 Sep 2023 01:25:55 +0300 +Subject: serial: 8250_port: Check IRQ data before use + +From: Andy Shevchenko + +commit cce7fc8b29961b64fadb1ce398dc5ff32a79643b upstream. + +In case the leaf driver wants to use IRQ polling (irq = 0) and +IIR register shows that an interrupt happened in the 8250 hardware +the IRQ data can be NULL. In such a case we need to skip the wake +event as we came to this path from the timer interrupt and quite +likely system is already awake. + +Without this fix we have got an Oops: + + serial8250: ttyS0 at I/O 0x3f8 (irq = 0, base_baud = 115200) is a 16550A + ... + BUG: kernel NULL pointer dereference, address: 0000000000000010 + RIP: 0010:serial8250_handle_irq+0x7c/0x240 + Call Trace: + ? serial8250_handle_irq+0x7c/0x240 + ? 
__pfx_serial8250_timeout+0x10/0x10 + +Fixes: 0ba9e3a13c6a ("serial: 8250: Add missing wakeup event reporting") +Cc: stable +Signed-off-by: Andy Shevchenko +Reviewed-by: Florian Fainelli +Link: https://lore.kernel.org/r/20230831222555.614426-1-andriy.shevchenko@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/tty/serial/8250/8250_port.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -1929,7 +1929,10 @@ int serial8250_handle_irq(struct uart_po + skip_rx = true; + + if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { +- if (irqd_is_wakeup_set(irq_get_irq_data(port->irq))) ++ struct irq_data *d; ++ ++ d = irq_get_irq_data(port->irq); ++ if (d && irqd_is_wakeup_set(d)) + pm_wakeup_event(tport->tty->dev, 0); + if (!up->dma || handle_rx_dma(up, iir)) + status = serial8250_rx_chars(up, status); diff --git a/queue-6.5/series b/queue-6.5/series index 35d11bb8091..4dd58acffa9 100644 --- a/queue-6.5/series +++ b/queue-6.5/series @@ -237,3 +237,19 @@ fbdev-sh7760fb-depend-on-fb-y.patch perf-build-define-yynomem-as-yynoabort-for-bison-3.8.patch asoc-cs35l56-call-pm_runtime_dont_use_autosuspend.patch iommu-arm-smmu-v3-fix-soft-lockup-triggered-by-arm_s.patch +spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch +x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch +x86-srso-add-srso-mitigation-for-hygon-processors.patch +kvm-svm-intercept_rdtscp-is-never-intercepted-anyway.patch +kvm-svm-fix-tsc_aux-virtualization-setup.patch +kvm-x86-mmu-open-code-leaf-invalidation-from-mmu_notifier.patch +kvm-x86-mmu-do-not-filter-address-spaces-in-for_each_tdp_mmu_root_yield_safe.patch +kvm-x86-mmu-stop-zapping-invalidated-tdp-mmu-roots-asynchronously.patch +mptcp-fix-bogus-receive-window-shrinkage-with-multiple-subflows.patch +mptcp-move-__mptcp_error_report-in-protocol.c.patch +mptcp-process-pending-subflow-error-on-close.patch +misc-rtsx-fix-some-platforms-can-not-boot-and-move-the-l1ss-judgment-to-probe.patch +revert-tty-n_gsm-fix-uaf-in-gsm_cleanup_mux.patch +scsi-core-ata-do-no-try-to-probe-for-cdl-on-old-drives.patch +serial-8250_port-check-irq-data-before-use.patch +nilfs2-fix-potential-use-after-free-in-nilfs_gccache_submit_read_data.patch diff --git a/queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch b/queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch new file mode 100644 index 00000000000..7a1ba819f2c --- /dev/null +++ b/queue-6.5/spi-zynqmp-gqspi-fix-clock-imbalance-on-probe-failure.patch @@ -0,0 +1,57 @@ +From 1527b076ae2cb6a9c590a02725ed39399fcad1cf Mon Sep 17 00:00:00 2001 +From: Johan Hovold +Date: Thu, 22 Jun 2023 10:24:35 +0200 +Subject: spi: zynqmp-gqspi: fix clock imbalance on probe failure + +From: Johan Hovold + +commit 1527b076ae2cb6a9c590a02725ed39399fcad1cf upstream. + +Make sure that the device is not runtime suspended before explicitly +disabling the clocks on probe failure and on driver unbind to avoid a +clock enable-count imbalance. 
+ +Fixes: 9e3a000362ae ("spi: zynqmp: Add pm runtime support") +Cc: stable@vger.kernel.org # 4.19 +Cc: Naga Sureshkumar Relli +Cc: Shubhrajyoti Datta +Signed-off-by: Johan Hovold +Link: https://lore.kernel.org/r/Message-Id: <20230622082435.7873-1-johan+linaro@kernel.org> +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + drivers/spi/spi-zynqmp-gqspi.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/spi/spi-zynqmp-gqspi.c ++++ b/drivers/spi/spi-zynqmp-gqspi.c +@@ -1342,9 +1342,9 @@ static int zynqmp_qspi_probe(struct plat + return 0; + + clk_dis_all: +- pm_runtime_put_sync(&pdev->dev); +- pm_runtime_set_suspended(&pdev->dev); + pm_runtime_disable(&pdev->dev); ++ pm_runtime_put_noidle(&pdev->dev); ++ pm_runtime_set_suspended(&pdev->dev); + clk_disable_unprepare(xqspi->refclk); + clk_dis_pclk: + clk_disable_unprepare(xqspi->pclk); +@@ -1368,11 +1368,15 @@ static void zynqmp_qspi_remove(struct pl + { + struct zynqmp_qspi *xqspi = platform_get_drvdata(pdev); + ++ pm_runtime_get_sync(&pdev->dev); ++ + zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0); ++ ++ pm_runtime_disable(&pdev->dev); ++ pm_runtime_put_noidle(&pdev->dev); ++ pm_runtime_set_suspended(&pdev->dev); + clk_disable_unprepare(xqspi->refclk); + clk_disable_unprepare(xqspi->pclk); +- pm_runtime_set_suspended(&pdev->dev); +- pm_runtime_disable(&pdev->dev); + } + + MODULE_DEVICE_TABLE(of, zynqmp_qspi_of_match); diff --git a/queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch b/queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch new file mode 100644 index 00000000000..bda57a0ea95 --- /dev/null +++ b/queue-6.5/x86-sgx-resolves-secs-reclaim-vs.-page-fault-for-eaug-race.patch @@ -0,0 +1,112 @@ +From c6c2adcba50c2622ed25ba5d5e7f05f584711358 Mon Sep 17 00:00:00 2001 +From: Haitao Huang +Date: Thu, 27 Jul 2023 22:10:24 -0700 +Subject: x86/sgx: Resolves SECS reclaim vs. page fault for EAUG race + +From: Haitao Huang + +commit c6c2adcba50c2622ed25ba5d5e7f05f584711358 upstream. + +The SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an +enclave and set secs.epc_page to NULL. The SECS page is used for EAUG +and ELDU in the SGX page fault handler. However, the NULL check for +secs.epc_page is only done for ELDU, not EAUG before being used. + +Fix this by doing the same NULL check and reloading of the SECS page as +needed for both EAUG and ELDU. + +The SECS page holds global enclave metadata. It can only be reclaimed +when there are no other enclave pages remaining. At that point, +virtually nothing can be done with the enclave until the SECS page is +paged back in. + +An enclave can not run nor generate page faults without a resident SECS +page. But it is still possible for a #PF for a non-SECS page to race +with paging out the SECS page: when the last resident non-SECS page A +triggers a #PF in a non-resident page B, and then page A and the SECS +both are paged out before the #PF on B is handled. + +Hitting this bug requires that race triggered with a #PF for EAUG. +Following is a trace when it happens. + +BUG: kernel NULL pointer dereference, address: 0000000000000000 +RIP: 0010:sgx_encl_eaug_page+0xc7/0x210 +Call Trace: + ? __kmem_cache_alloc_node+0x16a/0x440 + ? xa_load+0x6e/0xa0 + sgx_vma_fault+0x119/0x230 + __do_fault+0x36/0x140 + do_fault+0x12f/0x400 + __handle_mm_fault+0x728/0x1110 + handle_mm_fault+0x105/0x310 + do_user_addr_fault+0x1ee/0x750 + ? 
__this_cpu_preempt_check+0x13/0x20 + exc_page_fault+0x76/0x180 + asm_exc_page_fault+0x27/0x30 + +Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") +Signed-off-by: Haitao Huang +Signed-off-by: Dave Hansen +Reviewed-by: Jarkko Sakkinen +Reviewed-by: Kai Huang +Acked-by: Reinette Chatre +Cc:stable@vger.kernel.org +Link: https://lore.kernel.org/all/20230728051024.33063-1-haitao.huang%40linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++----- + 1 file changed, 25 insertions(+), 5 deletions(-) + +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eld + return epc_page; + } + ++/* ++ * Ensure the SECS page is not swapped out. Must be called with encl->lock ++ * to protect the enclave states including SECS and ensure the SECS page is ++ * not swapped out again while being used. ++ */ ++static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) ++{ ++ struct sgx_epc_page *epc_page = encl->secs.epc_page; ++ ++ if (!epc_page) ++ epc_page = sgx_encl_eldu(&encl->secs, NULL); ++ ++ return epc_page; ++} ++ + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) + { +@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_ + return entry; + } + +- if (!(encl->secs.epc_page)) { +- epc_page = sgx_encl_eldu(&encl->secs, NULL); +- if (IS_ERR(epc_page)) +- return ERR_CAST(epc_page); +- } ++ epc_page = sgx_encl_load_secs(encl); ++ if (IS_ERR(epc_page)) ++ return ERR_CAST(epc_page); + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) +@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(str + + mutex_lock(&encl->lock); + ++ epc_page = sgx_encl_load_secs(encl); ++ if (IS_ERR(epc_page)) { ++ if (PTR_ERR(epc_page) == -EBUSY) ++ vmret = VM_FAULT_NOPAGE; ++ goto err_out_unlock; ++ } ++ + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) diff --git a/queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch b/queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch new file mode 100644 index 00000000000..d8b8ac0f08f --- /dev/null +++ b/queue-6.5/x86-srso-add-srso-mitigation-for-hygon-processors.patch @@ -0,0 +1,33 @@ +From a5ef7d68cea1344cf524f04981c2b3f80bedbb0d Mon Sep 17 00:00:00 2001 +From: Pu Wen +Date: Thu, 28 Sep 2023 14:59:16 +0800 +Subject: x86/srso: Add SRSO mitigation for Hygon processors + +From: Pu Wen + +commit a5ef7d68cea1344cf524f04981c2b3f80bedbb0d upstream. + +Add mitigation for the speculative return stack overflow vulnerability +which exists on Hygon processors too. + +Signed-off-by: Pu Wen +Signed-off-by: Ingo Molnar +Acked-by: Borislav Petkov (AMD) +Cc: +Link: https://lore.kernel.org/r/tencent_4A14812842F104E93AA722EC939483CEFF05@qq.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/common.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1288,7 +1288,7 @@ static const struct x86_cpu_id cpu_vuln_ + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), +- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), ++ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), + VULNBL_AMD(0x19, SRSO), + {} + };
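
As context for the one-line Hygon change above, which only adds the SRSO bit to an existing table entry: the following standalone C sketch mimics the pattern that change relies on, a vendor/family blacklist whose entries carry OR-ed bug bits that decide whether a mitigation path is taken. The vendor names, bit names, and helper below are simplified stand-ins for illustration, not the kernel's actual cpu_vuln_blacklist code.

/*
 * Illustrative sketch only, not kernel code.  A table of vendor/family
 * entries carries OR-ed "bug" bits; a matching entry marks the CPU as
 * affected so the corresponding mitigation is considered.
 */
#include <stdio.h>
#include <stdint.h>

enum vendor { VENDOR_AMD, VENDOR_HYGON };

#define BUG_RETBLEED	(1u << 0)
#define BUG_SMT_RSB	(1u << 1)
#define BUG_SRSO	(1u << 2)

struct vuln_entry {
	enum vendor	vendor;
	uint8_t		family;
	uint32_t	bugs;	/* OR-ed bug bits for this vendor/family */
};

/* Simplified stand-in for the kernel's blacklist table. */
static const struct vuln_entry vuln_blacklist[] = {
	{ VENDOR_AMD,	0x17, BUG_RETBLEED | BUG_SMT_RSB | BUG_SRSO },
	{ VENDOR_HYGON,	0x18, BUG_RETBLEED | BUG_SMT_RSB | BUG_SRSO }, /* SRSO added */
	{ VENDOR_AMD,	0x19, BUG_SRSO },
};

static uint32_t lookup_bugs(enum vendor vendor, uint8_t family)
{
	for (size_t i = 0; i < sizeof(vuln_blacklist) / sizeof(vuln_blacklist[0]); i++) {
		const struct vuln_entry *e = &vuln_blacklist[i];

		if (e->vendor == vendor && e->family == family)
			return e->bugs;
	}
	return 0;
}

int main(void)
{
	uint32_t bugs = lookup_bugs(VENDOR_HYGON, 0x18);

	/* With SRSO present in the table entry, the mitigation path is taken. */
	if (bugs & BUG_SRSO)
		printf("SRSO mitigation selected\n");

	return 0;
}

Built and run as-is, the sketch reports the SRSO mitigation as selected for family 0x18 once the bit is present in its entry, which mirrors the effect of the one-line patch.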