git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: Add debug mask to disable CE logs
author Xiang Liu <xiang.liu@amd.com>
Fri, 6 Jun 2025 03:10:40 +0000 (11:10 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 18 Jun 2025 16:19:18 +0000 (12:19 -0400)
Add debug mask to disable kernel logs of RAS correctable errors,
including both ACA and CE error counter kernel messages.

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index a5ccd0ada16ab03241f9e61d55fb2a312fd21f31..09f45ff40057147894c3161f160b0c425127f4ea 100644 (file)
@@ -1282,6 +1282,7 @@ struct amdgpu_device {
        bool                            debug_exp_resets;
        bool                            debug_disable_gpu_ring_reset;
        bool                            debug_vm_userptr;
+       bool                            debug_disable_ce_logs;
 
        /* Protection for the following isolation structure */
        struct mutex                    enforce_isolation_mutex;
index 3835f25929142a03a38bbda15bba910be3782e0b..cbc40cad581b4455c34586fbbeff6f7c6162f699 100644 (file)
@@ -115,6 +115,11 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
        u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
        int i;
 
+       if (adev->debug_disable_ce_logs &&
+           bank->smu_err_type == ACA_SMU_TYPE_CE &&
+           !ACA_BANK_ERR_IS_DEFFERED(bank))
+               return;
+
        RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
        /* plus 1 for output format, e.g: ACA[08/08]: xxxx */
        for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
index 4db92e0a60da7b37ad0be30fbc4c6f0d30836153..efd6bff95e4e1756a69e1d2b1c85e4db6a3ea209 100644 (file)
@@ -144,6 +144,7 @@ enum AMDGPU_DEBUG_MASK {
        AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
        AMDGPU_DEBUG_SMU_POOL = BIT(7),
        AMDGPU_DEBUG_VM_USERPTR = BIT(8),
+       AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9)
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2278,6 +2279,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
                pr_info("debug: VM mode debug for userptr is enabled\n");
                adev->debug_vm_userptr = true;
        }
+
+       if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_RAS_CE_LOG) {
+               pr_info("debug: disable kernel logs of correctalbe errors\n");
+               adev->debug_disable_ce_logs = true;
+       }
 }
 
 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
index e91d4d37b5b363a54e40257382a155bf61a807c1..69b8733a2b2d15cbf22908af54e10cf579003c03 100644 (file)
@@ -1107,6 +1107,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
                                              err_info->de_count, blk_name);
                        }
                } else {
+                       if (adev->debug_disable_ce_logs)
+                               return;
+
                        for_each_ras_error(err_node, err_data) {
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;