]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: Support multiple error query modes
authorHawking Zhang <Hawking.Zhang@amd.com>
Wed, 8 Nov 2023 08:07:45 +0000 (16:07 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 9 Nov 2023 22:01:58 +0000 (17:01 -0500)
Direct error query mode and firmware error query mode
are supported for now.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index b7fe5951b166ebffd72b5a25c4057fb2e195e59e..a5c14dbb00b736911f374f9e7a2520905882606c 100644 (file)
@@ -1165,13 +1165,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
        }
 }
 
-/* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
-                                 struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
+                                               struct ras_query_if *info,
+                                               struct ras_err_data *err_data,
+                                               unsigned int error_query_mode)
 {
+       enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
        struct amdgpu_ras_block_object *block_obj = NULL;
+
+       if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
+               return -EINVAL;
+
+       if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+               if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
+                       amdgpu_ras_get_ecc_info(adev, err_data);
+               } else {
+                       block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+                       if (!block_obj || !block_obj->hw_ops) {
+                               dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+                                            get_ras_block_str(&info->head));
+                               return -EINVAL;
+                       }
+
+                       if (block_obj->hw_ops->query_ras_error_count)
+                               block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+                       if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+                           (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+                           (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+                               if (block_obj->hw_ops->query_ras_error_status)
+                                       block_obj->hw_ops->query_ras_error_status(adev);
+                       }
+               }
+       } else {
+               /* FIXME: add code to check return value later */
+               amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+               amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+       }
+
+       return 0;
+}
+
+/* query/inject/cure begin */
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_err_data err_data;
+       unsigned int error_query_mode;
        int ret;
 
        if (!obj)
@@ -1181,27 +1221,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
        if (ret)
                return ret;
 
-       if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
-               amdgpu_ras_get_ecc_info(adev, &err_data);
-       } else {
-               block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
-               if (!block_obj || !block_obj->hw_ops)   {
-                       dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-                                    get_ras_block_str(&info->head));
-                       ret = -EINVAL;
-                       goto out_fini_err_data;
-               }
-
-               if (block_obj->hw_ops->query_ras_error_count)
-                       block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+       if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
+               return -EINVAL;
 
-               if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
-                   (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
-                   (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
-                               if (block_obj->hw_ops->query_ras_error_status)
-                                       block_obj->hw_ops->query_ras_error_status(adev);
-                       }
-       }
+       ret = amdgpu_ras_query_error_status_helper(adev, info,
+                                                  &err_data,
+                                                  error_query_mode);
+       if (ret)
+               goto out_fini_err_data;
 
        amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
 
@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
                return true;
 }
 
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+                                    unsigned int *error_query_mode)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+       if (!con) {
+               *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
+               return false;
+       }
+
+       if (mca_funcs && mca_funcs->mca_set_debug_mode)
+               *error_query_mode =
+                       (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+       else
+               *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
+
+       return true;
+}
+
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
                struct amdgpu_ras_block_object *ras_block_obj)
index 665414c22ca95f48d260e8f25a3ae30c442cab44..19161916ac46b904ee604e5ad23cd2dc6ff77701 100644 (file)
@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
        AMDGPU_RAS_PT,
 };
 
+enum amdgpu_ras_error_query_mode {
+       AMDGPU_RAS_INVALID_ERROR_QUERY          = 0,
+       AMDGPU_RAS_DIRECT_ERROR_QUERY           = 1,
+       AMDGPU_RAS_FIRMWARE_ERROR_QUERY         = 2,
+};
+
 /* ras error status reisger fields */
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT    0x0
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK      0x00000001L
@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 
 void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+                                    unsigned int *mode);
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
                                struct amdgpu_ras_block_object *ras_block_obj);