From a3dc149bf8602d63d307560db8dfc8375e85ecab Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 11 Aug 2024 18:00:21 +0200 Subject: [PATCH] 5.15-stable patches added patches: profiling-remove-profile-sleep-support.patch sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch --- ...ofiling-remove-profile-sleep-support.patch | 129 ++++++++++++++++++ ...64_u64_div_u64-precision-for-cputime.patch | 58 ++++++++ ...id-iommu-page-faults-on-report-zones.patch | 96 +++++++++++++ queue-5.15/series | 3 + 4 files changed, 286 insertions(+) create mode 100644 queue-5.15/profiling-remove-profile-sleep-support.patch create mode 100644 queue-5.15/sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch create mode 100644 queue-5.15/scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch diff --git a/queue-5.15/profiling-remove-profile-sleep-support.patch b/queue-5.15/profiling-remove-profile-sleep-support.patch new file mode 100644 index 00000000000..bc84fd72f75 --- /dev/null +++ b/queue-5.15/profiling-remove-profile-sleep-support.patch @@ -0,0 +1,129 @@ +From b88f55389ad27f05ed84af9e1026aa64dbfabc9a Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Sun, 4 Aug 2024 18:48:10 +0900 +Subject: profiling: remove profile=sleep support + +From: Tetsuo Handa + +commit b88f55389ad27f05ed84af9e1026aa64dbfabc9a upstream. + +The kernel sleep profile is no longer working due to a recursive locking +bug introduced by commit 42a20f86dc19 ("sched: Add wrapper for get_wchan() +to keep task blocked") + +Booting with the 'profile=sleep' kernel command line option added or +executing + + # echo -n sleep > /sys/kernel/profiling + +after boot causes the system to lock up. 
+ +Lockdep reports + + kthreadd/3 is trying to acquire lock: + ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: get_wchan+0x32/0x70 + + but task is already holding lock: + ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: try_to_wake_up+0x53/0x370 + +with the call trace being + + lock_acquire+0xc8/0x2f0 + get_wchan+0x32/0x70 + __update_stats_enqueue_sleeper+0x151/0x430 + enqueue_entity+0x4b0/0x520 + enqueue_task_fair+0x92/0x6b0 + ttwu_do_activate+0x73/0x140 + try_to_wake_up+0x213/0x370 + swake_up_locked+0x20/0x50 + complete+0x2f/0x40 + kthread+0xfb/0x180 + +However, since nobody noticed this regression for more than two years, +let's remove 'profile=sleep' support based on the assumption that nobody +needs this functionality. + +Fixes: 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") +Cc: stable@vger.kernel.org # v5.16+ +Signed-off-by: Tetsuo Handa +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/admin-guide/kernel-parameters.txt | 4 +--- + include/linux/profile.h | 1 - + kernel/profile.c | 16 +--------------- + kernel/sched/fair.c | 10 ---------- + 4 files changed, 2 insertions(+), 29 deletions(-) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4355,11 +4355,9 @@ + + profile= [KNL] Enable kernel profiling via /proc/profile + Format: [<profiletype>,]<number> +- Param: <profiletype>: "schedule", "sleep", or "kvm" ++ Param: <profiletype>: "schedule" or "kvm" + [defaults to kernel profiling] + Param: "schedule" - profile schedule points. +- Param: "sleep" - profile D-state sleeping (millisecs). +- Requires CONFIG_SCHEDSTATS + Param: "kvm" - profile VM exits. + Param: <number> - step/bucket size as a power of 2 for + statistical time based profiling.
+--- a/include/linux/profile.h ++++ b/include/linux/profile.h +@@ -11,7 +11,6 @@ + + #define CPU_PROFILING 1 + #define SCHED_PROFILING 2 +-#define SLEEP_PROFILING 3 + #define KVM_PROFILING 4 + + struct proc_dir_entry; +--- a/kernel/profile.c ++++ b/kernel/profile.c +@@ -57,24 +57,10 @@ static DEFINE_MUTEX(profile_flip_mutex); + int profile_setup(char *str) + { + static const char schedstr[] = "schedule"; +- static const char sleepstr[] = "sleep"; + static const char kvmstr[] = "kvm"; + int par; + +- if (!strncmp(str, sleepstr, strlen(sleepstr))) { +-#ifdef CONFIG_SCHEDSTATS +- force_schedstat_enabled(); +- prof_on = SLEEP_PROFILING; +- if (str[strlen(sleepstr)] == ',') +- str += strlen(sleepstr) + 1; +- if (get_option(&str, &par)) +- prof_shift = clamp(par, 0, BITS_PER_LONG - 1); +- pr_info("kernel sleep profiling enabled (shift: %u)\n", +- prof_shift); +-#else +- pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); +-#endif /* CONFIG_SCHEDSTATS */ +- } else if (!strncmp(str, schedstr, strlen(schedstr))) { ++ if (!strncmp(str, schedstr, strlen(schedstr))) { + prof_on = SCHED_PROFILING; + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -988,16 +988,6 @@ update_stats_enqueue_sleeper(struct cfs_ + + trace_sched_stat_blocked(tsk, delta); + +- /* +- * Blocking time is in units of nanosecs, so shift by +- * 20 to get a milliseconds-range estimation of the +- * amount of time that the task spent sleeping: +- */ +- if (unlikely(prof_on == SLEEP_PROFILING)) { +- profile_hits(SLEEP_PROFILING, +- (void *)get_wchan(tsk), +- delta >> 20); +- } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } diff --git a/queue-5.15/sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch b/queue-5.15/sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch new file mode 100644 index 00000000000..ec5917b02c8 --- /dev/null +++ 
b/queue-5.15/sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch @@ -0,0 +1,58 @@ +From 77baa5bafcbe1b2a15ef9c37232c21279c95481c Mon Sep 17 00:00:00 2001 +From: Zheng Zucheng +Date: Fri, 26 Jul 2024 02:32:35 +0000 +Subject: sched/cputime: Fix mul_u64_u64_div_u64() precision for cputime + +From: Zheng Zucheng + +commit 77baa5bafcbe1b2a15ef9c37232c21279c95481c upstream. + +In extreme test scenarios: +the 14th field utime in /proc/xx/stat is greater than sum_exec_runtime, +utime = 18446744073709518790 ns, rtime = 135989749728000 ns + +In cputime_adjust() process, stime is greater than rtime due to +mul_u64_u64_div_u64() precision problem. +before call mul_u64_u64_div_u64(), +stime = 175136586720000, rtime = 135989749728000, utime = 1416780000. +after call mul_u64_u64_div_u64(), +stime = 135989949653530 + +unsigned reversion occurs because rtime is less than stime. +utime = rtime - stime = 135989749728000 - 135989949653530 + = -199925530 + = (u64)18446744073709518790 + +Trigger condition: + 1). User task run in kernel mode most of time + 2). ARM64 architecture + 3). TICK_CPU_ACCOUNTING=y + CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set + +Fix mul_u64_u64_div_u64() conversion precision by reset stime to rtime + +Fixes: 3dc167ba5729 ("sched/cputime: Improve cputime_adjust()") +Signed-off-by: Zheng Zucheng +Signed-off-by: Peter Zijlstra (Intel) +Cc: +Link: https://lkml.kernel.org/r/20240726023235.217771-1-zhengzucheng@huawei.com +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/cputime.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -577,6 +577,12 @@ void cputime_adjust(struct task_cputime + } + + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); ++ /* ++ * Because mul_u64_u64_div_u64() can approximate on some ++ * architectures; enforce the constraint that: a*b/(b+c) <= a.
++ */ ++ if (unlikely(stime > rtime)) ++ stime = rtime; + + update: + /* diff --git a/queue-5.15/scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch b/queue-5.15/scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch new file mode 100644 index 00000000000..5c27f4171a7 --- /dev/null +++ b/queue-5.15/scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch @@ -0,0 +1,96 @@ +From 82dbb57ac8d06dfe8227ba9ab11a49de2b475ae5 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Fri, 19 Jul 2024 16:39:12 +0900 +Subject: scsi: mpt3sas: Avoid IOMMU page faults on REPORT ZONES + +From: Damien Le Moal + +commit 82dbb57ac8d06dfe8227ba9ab11a49de2b475ae5 upstream. + +Some firmware versions of the 9600 series SAS HBA byte-swap the REPORT +ZONES command reply buffer from ATA-ZAC devices by directly accessing the +buffer in the host memory. This does not respect the default command DMA +direction and causes IOMMU page faults on architectures with an IOMMU +enforcing write-only mappings for DMA_FROM_DEVICE DMA direction (e.g. AMD +hosts). + +scsi 18:0:0:0: Direct-Access-ZBC ATA WDC WSH722020AL W870 PQ: 0 ANSI: 6 +scsi 18:0:0:0: SATA: handle(0x0027), sas_addr(0x300062b2083e7c40), phy(0), device_name(0x5000cca29dc35e11) +scsi 18:0:0:0: enclosure logical id (0x300062b208097c40), slot(0) +scsi 18:0:0:0: enclosure level(0x0000), connector name( C0.0) +scsi 18:0:0:0: atapi(n), ncq(y), asyn_notify(n), smart(y), fua(y), sw_preserve(y) +scsi 18:0:0:0: qdepth(32), tagged(1), scsi_level(7), cmd_que(1) +sd 18:0:0:0: Attached scsi generic sg2 type 20 +sd 18:0:0:0: [sdc] Host-managed zoned block device +mpt3sas 0000:41:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0021 address=0xfff9b200 flags=0x0050] +mpt3sas 0000:41:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0021 address=0xfff9b300 flags=0x0050] +mpt3sas_cm0: mpt3sas_ctl_pre_reset_handler: Releasing the trace buffer due to adapter reset.
+mpt3sas_cm0 fault info from func: mpt3sas_base_make_ioc_ready +mpt3sas_cm0: fault_state(0x2666)! +mpt3sas_cm0: sending diag reset !! +mpt3sas_cm0: diag reset: SUCCESS +sd 18:0:0:0: [sdc] REPORT ZONES start lba 0 failed +sd 18:0:0:0: [sdc] REPORT ZONES: Result: hostbyte=DID_RESET driverbyte=DRIVER_OK +sd 18:0:0:0: [sdc] 0 4096-byte logical blocks: (0 B/0 B) + +Avoid such issue by always mapping the buffer of REPORT ZONES commands +using DMA_BIDIRECTIONAL (read+write IOMMU mapping). This is done by +introducing the helper function _base_scsi_dma_map() and using this helper +in _base_build_sg_scmd() and _base_build_sg_scmd_ieee() instead of calling +directly scsi_dma_map(). + +Fixes: 471ef9d4e498 ("mpt3sas: Build MPI SGL LIST on GEN2 HBAs and IEEE SGL LIST on GEN3 HBAs") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Link: https://lore.kernel.org/r/20240719073913.179559-3-dlemoal@kernel.org +Reviewed-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/mpt3sas/mpt3sas_base.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +--- a/drivers/scsi/mpt3sas/mpt3sas_base.c ++++ b/drivers/scsi/mpt3sas/mpt3sas_base.c +@@ -2672,6 +2672,22 @@ _base_build_zero_len_sge_ieee(struct MPT + _base_add_sg_single_ieee(paddr, sgl_flags, 0, 0, -1); + } + ++static inline int _base_scsi_dma_map(struct scsi_cmnd *cmd) ++{ ++ /* ++ * Some firmware versions byte-swap the REPORT ZONES command reply from ++ * ATA-ZAC devices by directly accessing in the host buffer. This does ++ * not respect the default command DMA direction and causes IOMMU page ++ * faults on some architectures with an IOMMU enforcing write mappings ++ * (e.g. AMD hosts). Avoid such issue by making the report zones buffer ++ * mapping bi-directional. 
++ */ ++ if (cmd->cmnd[0] == ZBC_IN && cmd->cmnd[1] == ZI_REPORT_ZONES) ++ cmd->sc_data_direction = DMA_BIDIRECTIONAL; ++ ++ return scsi_dma_map(cmd); ++} ++ + /** + * _base_build_sg_scmd - main sg creation routine + * pcie_device is unused here! +@@ -2718,7 +2734,7 @@ _base_build_sg_scmd(struct MPT3SAS_ADAPT + sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; + + sg_scmd = scsi_sglist(scmd); +- sges_left = scsi_dma_map(scmd); ++ sges_left = _base_scsi_dma_map(scmd); + if (sges_left < 0) + return -ENOMEM; + +@@ -2862,7 +2878,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ + } + + sg_scmd = scsi_sglist(scmd); +- sges_left = scsi_dma_map(scmd); ++ sges_left = _base_scsi_dma_map(scmd); + if (sges_left < 0) + return -ENOMEM; + diff --git a/queue-5.15/series b/queue-5.15/series index 699d125a4ba..9736a6beb68 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -406,3 +406,6 @@ ext4-fix-uninitialized-variable-in-ext4_inlinedir_to.patch jbd2-avoid-memleak-in-jbd2_journal_write_metadata_bu.patch s390-sclp-prevent-release-of-buffer-in-i-o.patch sunrpc-fix-a-race-to-wake-a-sync-task.patch +profiling-remove-profile-sleep-support.patch +scsi-mpt3sas-avoid-iommu-page-faults-on-report-zones.patch +sched-cputime-fix-mul_u64_u64_div_u64-precision-for-cputime.patch -- 2.47.3