git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: Updated RAS infrastructure
author: John Clements <john.clements@amd.com>
Wed, 22 Sep 2021 06:04:52 +0000 (14:04 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 23 Sep 2021 20:34:43 +0000 (16:34 -0400)
Update RAS infrastructure to support RAS query for MCA subblocks

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
drivers/gpu/drm/amd/amdgpu/ta_ras_if.h

index a2d3dbbf7d2574ce31dc21a835607ea0302370f3..ce538f4819f925c1b9c5afffc591c21c6c14e0bd 100644 (file)
@@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
                                              uint64_t mc_status_addr,
                                              unsigned long *error_count)
 {
-       uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
+       uint64_t mc_status = RREG64_PCIE(mc_status_addr);
 
        if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
@@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
                                                uint64_t mc_status_addr,
                                                unsigned long *error_count)
 {
-       uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
+       uint64_t mc_status = RREG64_PCIE(mc_status_addr);
 
        if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
@@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
 void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
                                  uint64_t mc_status_addr)
 {
-       WREG64_PCIE(mc_status_addr * 4, 0x0ULL);
+       WREG64_PCIE(mc_status_addr, 0x0ULL);
 }
 
 void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
@@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
                if (!mca_dev->ras_if)
                        return -ENOMEM;
                mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
+               mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block;
                mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-               mca_dev->ras_if->sub_block_index = 0;
        }
        ih_info.head = fs_info.head = *mca_dev->ras_if;
        r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
index f860f2f0e296e5466c83645fc7010c2edc3d515f..c74bc7177066ec534015703dd13d3c38b3303551 100644 (file)
@@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs {
        void (*query_ras_error_address)(struct amdgpu_device *adev,
                                        void *ras_error_status);
        uint32_t ras_block;
+       uint32_t ras_sub_block;
        const char* sysfs_name;
 };
 
index b5332db4d287307c4b00a56a2e561219611207cb..912ea1f9fd04c6a60c9253b81679d4875c267604 100644 (file)
@@ -61,9 +61,30 @@ const char *ras_block_string[] = {
        "mp0",
        "mp1",
        "fuse",
-       "mpio",
+       "mca",
 };
 
+const char *ras_mca_block_string[] = {
+       "mca_mp0",
+       "mca_mp1",
+       "mca_mpio",
+       "mca_iohc",
+};
+
+const char *get_ras_block_str(struct ras_common_if *ras_block)
+{
+       if (!ras_block)
+               return "NULL";
+
+       if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
+               return "OUT OF RANGE";
+
+       if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
+               return ras_mca_block_string[ras_block->sub_block_index];
+
+       return ras_block_string[ras_block->block];
+}
+
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
 
        for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
                *block_id = i;
-               if (strcmp(name, ras_block_str(i)) == 0)
+               if (strcmp(name, ras_block_string[i]) == 0)
                        return 0;
        }
        return -EINVAL;
@@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
        if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
-
        if (obj->adev->asic_type == CHIP_ALDEBARAN) {
                if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
                        DRM_WARN("Failed to reset error counter and error status");
@@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj)
        if (obj && (--obj->use == 0))
                list_del(&obj->node);
        if (obj && (obj->use < 0))
-               DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
+               DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
 }
 
 /* make one obj and return it. */
@@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
        if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                return NULL;
 
-       obj = &con->objs[head->block];
+       if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+               if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+                       return NULL;
+
+               obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+       } else
+               obj = &con->objs[head->block];
+
        /* already exist. return obj? */
        if (alive_obj(obj))
                return NULL;
@@ -574,19 +601,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
                if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                        return NULL;
 
-               obj = &con->objs[head->block];
+               if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+                       if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+                               return NULL;
+
+                       obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+               } else
+                       obj = &con->objs[head->block];
 
-               if (alive_obj(obj)) {
-                       WARN_ON(head->block != obj->head.block);
+               if (alive_obj(obj))
                        return obj;
-               }
        } else {
-               for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
+               for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
                        obj = &con->objs[i];
-                       if (alive_obj(obj)) {
-                               WARN_ON(i != obj->head.block);
+                       if (alive_obj(obj))
                                return obj;
-                       }
                }
        }
 
@@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
         */
        if (!amdgpu_ras_is_feature_allowed(adev, head))
                return 0;
-       if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
-               return 0;
 
        if (enable) {
                if (!obj) {
@@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 
        /* Do not enable if it is not allowed. */
        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
-       /* Are we alerady in that state we are going to set? */
-       if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
-               ret = 0;
-               goto out;
-       }
 
        if (!amdgpu_ras_intr_triggered()) {
                ret = psp_ras_enable_features(&adev->psp, info, enable);
                if (ret) {
                        dev_err(adev->dev, "ras %s %s failed %d\n",
                                enable ? "enable":"disable",
-                               ras_block_str(head->block),
+                               get_ras_block_str(head),
                                ret);
                        goto out;
                }
@@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
                                if (!ret)
                                        dev_info(adev->dev,
                                                "RAS INFO: %s setup object\n",
-                                               ras_block_str(head->block));
+                                               get_ras_block_str(head));
                        }
                } else {
                        /* setup the object then issue a ras TA disable cmd.*/
@@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
                bool bypass)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
        int i;
-       const enum amdgpu_ras_error_type default_ras_type =
-               AMDGPU_RAS_ERROR__NONE;
+       const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
 
-       for (i = 0; i < ras_block_count; i++) {
+       for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
                struct ras_common_if head = {
                        .block = i,
                        .type = default_ras_type,
                        .sub_block_index = 0,
                };
+
+               if (i == AMDGPU_RAS_BLOCK__MCA)
+                       continue;
+
+               if (bypass) {
+                       /*
+                        * bypass psp. vbios enable ras for us.
+                        * so just create the obj
+                        */
+                       if (__amdgpu_ras_feature_enable(adev, &head, 1))
+                               break;
+               } else {
+                       if (amdgpu_ras_feature_enable(adev, &head, 1))
+                               break;
+               }
+       }
+
+       for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
+               struct ras_common_if head = {
+                       .block = AMDGPU_RAS_BLOCK__MCA,
+                       .type = default_ras_type,
+                       .sub_block_index = i,
+               };
+
                if (bypass) {
                        /*
                         * bypass psp. vbios enable ras for us.
@@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 }
 /* feature ctl end */
 
+
+void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
+                                      struct ras_common_if *ras_block,
+                                      struct ras_err_data  *err_data)
+{
+       switch (ras_block->sub_block_index) {
+       case AMDGPU_RAS_MCA_BLOCK__MP0:
+               if (adev->mca.mp0.ras_funcs &&
+                   adev->mca.mp0.ras_funcs->query_ras_error_count)
+                       adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
+               break;
+       case AMDGPU_RAS_MCA_BLOCK__MP1:
+               if (adev->mca.mp1.ras_funcs &&
+                   adev->mca.mp1.ras_funcs->query_ras_error_count)
+                       adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
+               break;
+       case AMDGPU_RAS_MCA_BLOCK__MPIO:
+               if (adev->mca.mpio.ras_funcs &&
+                   adev->mca.mpio.ras_funcs->query_ras_error_count)
+                       adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
+               break;
+       default:
+               break;
+       }
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                                  struct ras_query_if *info)
@@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                    adev->hdp.ras_funcs->query_ras_error_count)
                        adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
                break;
+       case AMDGPU_RAS_BLOCK__MCA:
+               amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
+               break;
        default:
                break;
        }
@@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                                        adev->smuio.funcs->get_socket_id(adev),
                                        adev->smuio.funcs->get_die_id(adev),
                                        obj->err_data.ce_count,
-                                       ras_block_str(info->head.block));
+                                       get_ras_block_str(&info->head));
                } else {
                        dev_info(adev->dev, "%ld correctable hardware errors "
                                        "detected in %s block, no user "
                                        "action is needed.\n",
                                        obj->err_data.ce_count,
-                                       ras_block_str(info->head.block));
+                                       get_ras_block_str(&info->head));
                }
        }
        if (err_data.ue_count) {
@@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                                        adev->smuio.funcs->get_socket_id(adev),
                                        adev->smuio.funcs->get_die_id(adev),
                                        obj->err_data.ue_count,
-                                       ras_block_str(info->head.block));
+                                       get_ras_block_str(&info->head));
                } else {
                        dev_info(adev->dev, "%ld uncorrectable hardware errors "
                                        "detected in %s block\n",
                                        obj->err_data.ue_count,
-                                       ras_block_str(info->head.block));
+                                       get_ras_block_str(&info->head));
                }
        }
 
@@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__SDMA:
        case AMDGPU_RAS_BLOCK__MMHUB:
        case AMDGPU_RAS_BLOCK__PCIE_BIF:
-       case AMDGPU_RAS_BLOCK__MP0:
-       case AMDGPU_RAS_BLOCK__MP1:
-       case AMDGPU_RAS_BLOCK__MPIO:
+       case AMDGPU_RAS_BLOCK__MCA:
                ret = psp_ras_trigger_error(&adev->psp, &block_info);
                break;
        case AMDGPU_RAS_BLOCK__XGMI_WAFL:
@@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
                break;
        default:
                dev_info(adev->dev, "%s error injection is not supported yet\n",
-                        ras_block_str(info->head.block));
+                        get_ras_block_str(&info->head));
                ret = -EINVAL;
        }
 
        if (ret)
                dev_err(adev->dev, "ras inject %s failed %d\n",
-                       ras_block_str(info->head.block), ret);
+                       get_ras_block_str(&info->head), ret);
 
        return ret;
 }
@@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
                if (amdgpu_ras_is_supported(adev, obj->head.block) &&
                        (obj->attr_inuse == 1)) {
                        sprintf(fs_info.debugfs_name, "%s_err_inject",
-                                       ras_block_str(obj->head.block));
+                                       get_ras_block_str(&obj->head));
                        fs_info.head = obj->head;
                        amdgpu_ras_debugfs_create(adev, &fs_info, dir);
                }
@@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                return 0;
 
        con = kmalloc(sizeof(struct amdgpu_ras) +
-                       sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
+                       sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
+                       sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
                        GFP_KERNEL|__GFP_ZERO);
        if (!con)
                return -ENOMEM;
index 1670467c20546380c088274186dab35387636c29..ec42e9873aaa9eee4c247d876e3a8b15b9946a0d 100644 (file)
@@ -49,15 +49,22 @@ enum amdgpu_ras_block {
        AMDGPU_RAS_BLOCK__MP0,
        AMDGPU_RAS_BLOCK__MP1,
        AMDGPU_RAS_BLOCK__FUSE,
-       AMDGPU_RAS_BLOCK__MPIO,
+       AMDGPU_RAS_BLOCK__MCA,
 
        AMDGPU_RAS_BLOCK__LAST
 };
 
-extern const char *ras_block_string[];
+enum amdgpu_ras_mca_block {
+       AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
+       AMDGPU_RAS_MCA_BLOCK__MP1,
+       AMDGPU_RAS_MCA_BLOCK__MPIO,
+       AMDGPU_RAS_MCA_BLOCK__IOHC,
+
+       AMDGPU_RAS_MCA_BLOCK__LAST
+};
 
-#define ras_block_str(i) (ras_block_string[i])
 #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
+#define AMDGPU_RAS_MCA_BLOCK_COUNT     AMDGPU_RAS_MCA_BLOCK__LAST
 #define AMDGPU_RAS_BLOCK_MASK  ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
 
 enum amdgpu_ras_gfx_subblock {
@@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
                return TA_RAS_BLOCK__MP1;
        case AMDGPU_RAS_BLOCK__FUSE:
                return TA_RAS_BLOCK__FUSE;
-       case AMDGPU_RAS_BLOCK__MPIO:
-               return TA_RAS_BLOCK__MPIO;
+       case AMDGPU_RAS_BLOCK__MCA:
+               return TA_RAS_BLOCK__MCA;
        default:
                WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
                return TA_RAS_BLOCK__UMC;
@@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
 
 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
 
+const char *get_ras_block_str(struct ras_common_if *ras_block);
+
 #endif
index 058b65730a8461f08db7981462eaa17396016a64..8f7107d392afba3fa6e164ca796e1c068bcf8ed3 100644 (file)
@@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
        .ras_fini = mca_v3_0_mp0_ras_fini,
        .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
        .query_ras_error_address = NULL,
-       .ras_block = AMDGPU_RAS_BLOCK__MP0,
+       .ras_block = AMDGPU_RAS_BLOCK__MCA,
+       .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0,
        .sysfs_name = "mp0_err_count",
 };
 
@@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
        .ras_fini = mca_v3_0_mp1_ras_fini,
        .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
        .query_ras_error_address = NULL,
-       .ras_block = AMDGPU_RAS_BLOCK__MP1,
+       .ras_block = AMDGPU_RAS_BLOCK__MCA,
+       .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1,
        .sysfs_name = "mp1_err_count",
 };
 
@@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
        .ras_fini = mca_v3_0_mpio_ras_fini,
        .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
        .query_ras_error_address = NULL,
-       .ras_block = AMDGPU_RAS_BLOCK__MPIO,
+       .ras_block = AMDGPU_RAS_BLOCK__MCA,
+       .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO,
        .sysfs_name = "mpio_err_count",
 };
 
index 0a9fc19b1be02175433ec4c75bd479bb5cc74085..91b3afa946f5947e5900f004308d61fd55b2757b 100644 (file)
@@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
                                                "errors detected in %s block, "
                                                "no user action is needed.\n",
                                                obj->err_data.ce_count,
-                                               ras_block_str(adev->nbio.ras_if->block));
+                                               get_ras_block_str(adev->nbio.ras_if));
 
                        if (err_data.ue_count)
                                dev_info(adev->dev, "%ld uncorrectable hardware "
                                                "errors detected in %s block\n",
                                                obj->err_data.ue_count,
-                                               ras_block_str(adev->nbio.ras_if->block));
+                                               get_ras_block_str(adev->nbio.ras_if));
                }
 
                dev_info(adev->dev, "RAS controller interrupt triggered "
index 532260fd64db1407d3f8fc6099b92d198427adf1..82d956d15b54f48e623c94cf421ad5ee534869dc 100644 (file)
@@ -73,10 +73,19 @@ enum ta_ras_block {
        TA_RAS_BLOCK__MP0,
        TA_RAS_BLOCK__MP1,
        TA_RAS_BLOCK__FUSE,
-       TA_RAS_BLOCK__MPIO,
+       TA_RAS_BLOCK__MCA,
        TA_NUM_BLOCK_MAX
 };
 
+enum ta_ras_mca_block
+{
+       TA_RAS_MCA_BLOCK__MP0   = 0,
+       TA_RAS_MCA_BLOCK__MP1   = 1,
+       TA_RAS_MCA_BLOCK__MPIO  = 2,
+       TA_RAS_MCA_BLOCK__IOHC  = 3,
+       TA_MCA_NUM_BLOCK_MAX
+};
+
 enum ta_ras_error_type {
        TA_RAS_ERROR__NONE                      = 0,
        TA_RAS_ERROR__PARITY                    = 1,