From: Yang Wang Date: Thu, 26 Mar 2026 01:41:46 +0000 (-0400) Subject: drm/amd/pm: correct mem_busy_percent display due to calculation errors X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=592713a8960ed661bd9fcb7c256921c53eadeb49;p=thirdparty%2Fkernel%2Flinux.git drm/amd/pm: correct mem_busy_percent display due to calculation errors PMFW may return invalid values due to internal calculation errors, so the KMD driver must validate and sanitize the returned values to prevent issues caused by firmware calculation errors. For example, values 0xfffe (-2) and 0xffff (-1) are treated as invalid and clamped to 0. This applies to devices with CAB (Cache As Buffer) functionality. Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905 Signed-off-by: Yang Wang Reviewed-by: Kenneth Feng Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index 609f5ab07d8a..126fc54cb511 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct smu_context *smu, int feature_num) smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED); } +/* + * smu_safe_u16_nn - Make u16 safe by filtering negative overflow errors + * @val: Input u16 value, may contain invalid negative overflows + * + * Convert u16 to non-negative value. Cast to s16 to detect negative values + * caused by calculation errors. Return 0 for negative errors, return + * original value if valid. + * + * Return: Valid u16 value or 0 + */ +static inline u16 smu_safe_u16_nn(u16 val) +{ + s16 tmp = (s16)val; + + return tmp < 0 ? 
0 : val; +} + #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index b414a74d29fd..0a7f5fa3c1d3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -773,13 +773,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxclkFrequencyPreDs; break; case METRICS_AVERAGE_FCLK: - if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD) *value = metrics->AverageFclkFrequencyPostDs; else *value = metrics->AverageFclkFrequencyPreDs; break; case METRICS_AVERAGE_UCLK: - if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD) *value = metrics->AverageMemclkFrequencyPostDs; else *value = metrics->AverageMemclkFrequencyPreDs; @@ -800,7 +800,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxActivity; break; case METRICS_AVERAGE_MEMACTIVITY: - *value = metrics->AverageUclkActivity; + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); break; case METRICS_AVERAGE_VCNACTIVITY: *value = max(metrics->Vcn0ActivityPercentage, @@ -2085,7 +2085,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu, metrics->AvgTemperature[TEMP_VR_MEM1]); gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; + gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity); gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage, metrics->Vcn1ActivityPercentage); @@ -2102,7 +2102,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu, else gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs; - if 
(metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD) gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs; else gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index fd0b6215364f..5abf2b0703c6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -783,13 +783,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxclkFrequencyPreDs; break; case METRICS_AVERAGE_FCLK: - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD) *value = metrics->AverageFclkFrequencyPostDs; else *value = metrics->AverageFclkFrequencyPreDs; break; case METRICS_AVERAGE_UCLK: - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD) *value = metrics->AverageMemclkFrequencyPostDs; else *value = metrics->AverageMemclkFrequencyPreDs; @@ -814,7 +814,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxActivity; break; case METRICS_AVERAGE_MEMACTIVITY: - *value = metrics->AverageUclkActivity; + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); break; case METRICS_AVERAGE_SOCKETPOWER: *value = metrics->AverageSocketPower << 8; @@ -2091,7 +2091,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu, metrics->AvgTemperature[TEMP_VR_MEM1]); gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; + gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity); 
gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage, metrics->Vcn1ActivityPercentage); @@ -2104,7 +2104,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu, else gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs; - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD) gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs; else gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 31f9566f7979..62514e3ac600 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -661,13 +661,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxclkFrequencyPreDs; break; case METRICS_AVERAGE_FCLK: - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD) *value = metrics->AverageFclkFrequencyPostDs; else *value = metrics->AverageFclkFrequencyPreDs; break; case METRICS_AVERAGE_UCLK: - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD) *value = metrics->AverageMemclkFrequencyPostDs; else *value = metrics->AverageMemclkFrequencyPreDs; @@ -688,7 +688,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu, *value = metrics->AverageGfxActivity; break; case METRICS_AVERAGE_MEMACTIVITY: - *value = metrics->AverageUclkActivity; + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); break; case METRICS_AVERAGE_VCNACTIVITY: *value = max(metrics->AverageVcn0ActivityPercentage, @@ -2147,7 +2147,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct 
smu_context *smu, metrics->AvgTemperature[TEMP_VR_MEM1]); gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; + gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity); gpu_metrics->average_mm_activity = max(metrics->AverageVcn0ActivityPercentage, metrics->Vcn1ActivityPercentage); @@ -2159,7 +2159,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu, else gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs; - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD) gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs; else gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;