]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amd/pm: correct mem_busy_percent display due to calculation errors
authorYang Wang <kevinyang.wang@amd.com>
Thu, 26 Mar 2026 01:41:46 +0000 (21:41 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 3 Apr 2026 17:52:44 +0000 (13:52 -0400)
PMFW may return invalid values due to internal calculation errors, so
the KMD driver must validate and sanitize the returned values to
prevent issues caused by these firmware errors.

For example, values 0xfffe (-2) and 0xffff (-1) are treated
as invalid and clamped to 0.

This applies to devices with CAB (Cache As Buffer) functionality.

Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Kenneth Feng <kenneth.feng@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c

index 609f5ab07d8a85664afcbe5baf26a4e1356e0cbd..126fc54cb5113221653e1e65d71e9490a1480045 100644 (file)
@@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct smu_context *smu, int feature_num)
        smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED);
 }
 
+/*
+ * smu_safe_u16_nn - Sanitize a u16 that may hold a wrapped negative value
+ * @val: Input u16 value, possibly a negative result stored as unsigned
+ *
+ * Reinterpret @val as s16 to detect values that are negative in signed
+ * form (e.g. 0xfffe is -2, 0xffff is -1). Such values are replaced with
+ * 0; any value that is non-negative as s16 is returned unchanged.
+ *
+ * Return: @val if it is non-negative as s16, otherwise 0
+ */
+static inline u16 smu_safe_u16_nn(u16 val)
+{
+       s16 tmp = (s16)val;
+
+       return tmp < 0 ? 0 : val;
+}
+
 #endif
index b414a74d29fd4a2d1cba35e95c971eecb81b9f3d..0a7f5fa3c1d319f9f106e941d424265576afff28 100644 (file)
@@ -773,13 +773,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
                        *value = metrics->AverageGfxclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_FCLK:
-               if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
                        *value = metrics->AverageFclkFrequencyPostDs;
                else
                        *value = metrics->AverageFclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_UCLK:
-               if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
                        *value = metrics->AverageMemclkFrequencyPostDs;
                else
                        *value = metrics->AverageMemclkFrequencyPreDs;
@@ -800,7 +800,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
                *value = metrics->AverageGfxActivity;
                break;
        case METRICS_AVERAGE_MEMACTIVITY:
-               *value = metrics->AverageUclkActivity;
+               *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
                break;
        case METRICS_AVERAGE_VCNACTIVITY:
                *value = max(metrics->Vcn0ActivityPercentage,
@@ -2085,7 +2085,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
                                             metrics->AvgTemperature[TEMP_VR_MEM1]);
 
        gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
-       gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
+       gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
        gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
                                               metrics->Vcn1ActivityPercentage);
 
@@ -2102,7 +2102,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
        else
                gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
 
-       if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
+       if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
        else
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;
index fd0b6215364fab0c6d4182f22575998ceb596b45..5abf2b0703c6210520d4ef4a16b55f85b9260cbc 100644 (file)
@@ -783,13 +783,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
                *value = metrics->AverageGfxclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_FCLK:
-               if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
                        *value = metrics->AverageFclkFrequencyPostDs;
                else
                        *value = metrics->AverageFclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_UCLK:
-               if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
                        *value = metrics->AverageMemclkFrequencyPostDs;
                else
                        *value = metrics->AverageMemclkFrequencyPreDs;
@@ -814,7 +814,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
                *value = metrics->AverageGfxActivity;
                break;
        case METRICS_AVERAGE_MEMACTIVITY:
-               *value = metrics->AverageUclkActivity;
+               *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
                break;
        case METRICS_AVERAGE_SOCKETPOWER:
                *value = metrics->AverageSocketPower << 8;
@@ -2091,7 +2091,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
                                             metrics->AvgTemperature[TEMP_VR_MEM1]);
 
        gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
-       gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
+       gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
        gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
                                               metrics->Vcn1ActivityPercentage);
 
@@ -2104,7 +2104,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
        else
                gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
 
-       if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
+       if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
        else
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;
index 31f9566f7979d99a29ee03d096cf4b88ba935c74..62514e3ac600c00baa28f337fe4ba98a7827aa55 100644 (file)
@@ -661,13 +661,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
                        *value = metrics->AverageGfxclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_FCLK:
-               if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
                        *value = metrics->AverageFclkFrequencyPostDs;
                else
                        *value = metrics->AverageFclkFrequencyPreDs;
                break;
        case METRICS_AVERAGE_UCLK:
-               if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
+               if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
                        *value = metrics->AverageMemclkFrequencyPostDs;
                else
                        *value = metrics->AverageMemclkFrequencyPreDs;
@@ -688,7 +688,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
                *value = metrics->AverageGfxActivity;
                break;
        case METRICS_AVERAGE_MEMACTIVITY:
-               *value = metrics->AverageUclkActivity;
+               *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
                break;
        case METRICS_AVERAGE_VCNACTIVITY:
                *value = max(metrics->AverageVcn0ActivityPercentage,
@@ -2147,7 +2147,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
                                             metrics->AvgTemperature[TEMP_VR_MEM1]);
 
        gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
-       gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
+       gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
        gpu_metrics->average_mm_activity = max(metrics->AverageVcn0ActivityPercentage,
                                               metrics->Vcn1ActivityPercentage);
 
@@ -2159,7 +2159,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
        else
                gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
 
-       if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
+       if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
        else
                gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;