]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amd/ras: Support high-frequency querying sriov ras block error count
authorYiPeng Chai <YiPeng.Chai@amd.com>
Thu, 30 Oct 2025 08:06:25 +0000 (16:06 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 8 Dec 2025 18:56:33 +0000 (13:56 -0500)
Support high-frequency querying sriov ras block error count:
1. Create shared memory and fills it with RAS_CMD__GET_LAL_LOC_STATUS
   ras command.
2. The RAS_CMD_GET_ALL_BLOCK_ECC_STATUS command and shared
   memory are registered to sriov host ras auto-update list
   via RAS_CMD_SET_CMD_AUTO_UPDATE command.
3. Once sriov host detects ras error, it will automatically execute
   RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and write the result to
   shared memory.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h
drivers/gpu/drm/amd/ras/rascore/ras_cmd.h

index 78b9a28f92dfa9f17ec24148d03e2fe0595ff3dc..b8d5482e704f629f7a8d3f498e36170e01a4b44f 100644 (file)
@@ -235,9 +235,90 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
        return RAS_CMD__SUCCESS;
 }
 
+static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
+                       struct vram_blocks_ecc *blks_ecc)
+{
+       struct ras_cmd_ctx *rcmd;
+
+       if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
+               return -EINVAL;
+
+       rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;
+
+       rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
+       rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
+       rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);
+
+       return 0;
+}
+
+static int __set_cmd_auto_update(struct amdgpu_device *adev,
+                       enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
+{
+       struct ras_cmd_auto_update_req req = {0};
+       struct ras_cmd_auto_update_rsp rsp = {0};
+       int ret;
+
+       req.mode = reg ? 1 : 0;
+       req.cmd_id = cmd_id;
+       req.addr = gpa_addr;
+       req.len = len;
+       ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
+               &req, sizeof(req), &rsp, sizeof(rsp));
+
+       return ret;
+}
+
+static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
+                               struct ras_cmd_ctx *cmd, void *data)
+{
+       struct amdgpu_device *adev = ras_core->dev;
+       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+       struct amdgpu_virt_ras_cmd *virt_ras =
+                       (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
+       struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
+       struct ras_cmd_ctx *blks_ecc_cmd_ctx;
+       struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
+       struct ras_cmd_block_ecc_info_req *input_data =
+                       (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
+       struct ras_cmd_block_ecc_info_rsp *output_data =
+                       (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
+       int ret = 0;
+
+       if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
+               return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+       if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
+               return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+       if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
+               return RAS_CMD__ERROR_GENERIC;
+
+       if (!virt_ras->blocks_ecc.auto_update_actived) {
+               ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
+                               blks_ecc->mc_addr - adev->gmc.vram_start,
+                               blks_ecc->size, true);
+               if (ret)
+                       return ret;
+
+               blks_ecc->auto_update_actived = true;
+       }
+
+       blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
+       blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
+
+       output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
+       output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
+       output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;
+
+       cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
+       return RAS_CMD__SUCCESS;
+}
+
 static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
        {RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
        {RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
+       {RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
 };
 
 int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
@@ -294,10 +375,41 @@ int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
 
 int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
 {
+       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+       struct amdgpu_virt_ras_cmd *virt_ras =
+                       (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
+       struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
+
+       memset(blks_ecc, 0, sizeof(*blks_ecc));
+       blks_ecc->size = PAGE_SIZE;
+       if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
+                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
+                       &blks_ecc->bo, &blks_ecc->mc_addr,
+                       (void **)&blks_ecc->cpu_addr))
+               return -ENOMEM;
+
        return 0;
 }
 
 int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
 {
+       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+       struct amdgpu_virt_ras_cmd *virt_ras =
+                       (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
+       struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
+
+       if (blks_ecc->bo) {
+               __set_cmd_auto_update(adev,
+                       RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
+                       blks_ecc->mc_addr - adev->gmc.vram_start,
+                       blks_ecc->size, false);
+
+               memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
+               amdgpu_bo_free_kernel(&blks_ecc->bo,
+                       &blks_ecc->mc_addr, &blks_ecc->cpu_addr);
+
+               memset(blks_ecc, 0, sizeof(*blks_ecc));
+       }
+
        return 0;
 }
index addc693c29260c734e1299d821144cb4dd7270db..ae7bf67b3a3b20f077ea36eb2e1da81c88a3190e 100644 (file)
@@ -30,8 +30,17 @@ struct remote_batch_trace_mgr {
        struct ras_cmd_batch_trace_record_rsp  batch_trace;
 };
 
+struct vram_blocks_ecc {
+       struct amdgpu_bo *bo;
+       uint64_t mc_addr;
+       void *cpu_addr;
+       uint32_t size;
+       bool auto_update_actived;
+};
+
 struct amdgpu_virt_ras_cmd {
        struct remote_batch_trace_mgr batch_mgr;
+       struct vram_blocks_ecc blocks_ecc;
 };
 
 int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev);
index 48a0715eb8218d6d0db9965f80b3b6c25025c762..b9833812c31fa1d3c13bee38a7cbdfabdddd2d77 100644 (file)
@@ -75,6 +75,8 @@ enum ras_cmd_id {
        RAS_CMD__GET_CPER_RECORD,
        RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
        RAS_CMD__GET_BATCH_TRACE_RECORD,
+       RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
+       RAS_CMD__SET_CMD_AUTO_UPDATE,
        RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
 };
 
@@ -411,6 +413,37 @@ struct ras_cmd_batch_trace_record_rsp {
        struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
 };
 
+struct ras_cmd_auto_update_req {
+       struct ras_cmd_dev_handle dev;
+       uint32_t mode;
+       uint32_t cmd_id;
+       uint64_t addr;
+       uint32_t len;
+       uint32_t reserved[5];
+};
+
+struct ras_cmd_auto_update_rsp {
+       uint32_t version;
+       uint32_t reserved[4];
+};
+
+struct ras_cmd_blocks_ecc_req {
+       struct ras_cmd_dev_handle dev;
+};
+
+struct ras_cmd_block_ecc {
+       uint32_t ce_count;
+       uint32_t ue_count;
+       uint32_t de_count;
+};
+
+#define MAX_RAS_BLOCK_NUM  20
+struct ras_cmd_blocks_ecc_rsp {
+       uint32_t version;
+       uint32_t reserved[5];
+       struct ras_cmd_block_ecc blocks[MAX_RAS_BLOCK_NUM];
+};
+
 #pragma pack(pop)
 
 int ras_cmd_init(struct ras_core_context *ras_core);