]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amd/ras: copy ras log data instead of referencing pointers
authorYiPeng Chai <YiPeng.Chai@amd.com>
Tue, 19 May 2026 05:47:34 +0000 (13:47 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 15:54:17 +0000 (11:54 -0400)
When generating ras cper file, the original data nodes in the ras
log ring buffer may be deleted, leading to invalid pointer
access. Copy the data from the ras log ring instead of directly
referencing the pointers to avoid this issue.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
drivers/gpu/drm/amd/ras/rascore/ras_cper.c
drivers/gpu/drm/amd/ras/rascore/ras_cper.h
drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h

index 6ef9ec7ed597770ebd3f7d9b66e1cb0afd17818b..c2761f3d06a0abcdb6f1cf3e728b8a893f36f33a 100644 (file)
@@ -202,7 +202,7 @@ static bool amdgpu_virt_ras_check_batch_cached(struct ras_cmd_batch_trace_record
 }
 
 static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
-                       struct ras_log_info **trace_arr, uint32_t arr_num,
+                       struct ras_log_info *trace_arr, uint32_t arr_num,
                        struct ras_cmd_batch_trace_record_rsp *rsp_cache)
 {
        struct ras_cmd_batch_trace_record_req req = {
@@ -238,7 +238,8 @@ static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core,
        }
 
        for (i = 0; i < batch->trace_num && i < arr_num; i++)
-               trace_arr[i] = &rsp->records[batch->offset + i];
+               memcpy(&trace_arr[i],
+                       &rsp->records[batch->offset + i], sizeof(*trace_arr));
 
        return i;
 }
@@ -255,7 +256,8 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
                (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
        struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
        struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
-       struct ras_log_info **trace;
+       struct ras_log_info *trace;
+       uint32_t trace_count = MAX_RECORD_PER_BATCH;
        uint32_t offset = 0, real_data_len = 0;
        uint64_t batch_id;
        uint8_t *out_buf;
@@ -269,7 +271,7 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
            req->buf_size > RAS_CMD_MAX_CPER_BUF_SZ)
                return RAS_CMD__ERROR_INVALID_INPUT_DATA;
 
-       trace = kzalloc_objs(*trace, MAX_RECORD_PER_BATCH);
+       trace = kzalloc_objs(*trace, trace_count);
        if (!trace)
                return RAS_CMD__ERROR_GENERIC;
 
@@ -286,7 +288,7 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
                if (batch_id >= overview->last_batch_id)
                        break;
                count = amdgpu_virt_ras_get_batch_records(ras_core, batch_id,
-                                                         trace, MAX_RECORD_PER_BATCH,
+                                                         trace, trace_count,
                                                          rsp_cache);
                if (count > 0) {
                        ret = ras_cper_generate_cper(ras_core, trace, count,
index e3b0c8b06c1c3f41f72f4af41da0c2af145c8e29..abd004eee95e73ccdaf4fc2952fb1cec275647d2 100644 (file)
@@ -202,11 +202,12 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
                        (struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
        struct ras_cmd_cper_record_rsp *rsp =
                        (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
-       struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
+       struct ras_log_info *trace = NULL;
+       uint32_t trace_count = MAX_RECORD_PER_BATCH;
        struct ras_log_batch_overview overview;
        uint32_t offset = 0, real_data_len = 0;
        uint64_t batch_id;
-       uint8_t *buffer;
+       uint8_t *buffer = NULL;
        int ret = 0, i, count;
 
        if ((cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) ||
@@ -221,6 +222,12 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
        if (!buffer)
                return RAS_CMD__ERROR_GENERIC;
 
+       trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL);
+       if (!trace) {
+               ret = RAS_CMD__ERROR_GENERIC;
+               goto out;
+       }
+
        ras_log_ring_get_batch_overview(ras_core, &overview);
        for (i = 0; i < req->cper_num; i++) {
                batch_id = req->cper_start_id + i;
@@ -228,7 +235,7 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
                        break;
 
                count = ras_log_ring_get_batch_records(ras_core, batch_id, trace,
-                                       ARRAY_SIZE(trace));
+                                       trace_count);
                if (count > 0) {
                        ret = ras_cper_generate_cper(ras_core, trace, count,
                                        &buffer[offset], req->buf_size - offset, &real_data_len);
@@ -241,8 +248,8 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
 
        if ((ret && (ret != -ENOMEM)) ||
                copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) {
-               kfree(buffer);
-               return RAS_CMD__ERROR_GENERIC;
+               ret = RAS_CMD__ERROR_GENERIC;
+               goto out;
        }
 
        rsp->real_data_size = offset;
@@ -251,10 +258,12 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
        rsp->version = 0;
 
        cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
+       ret = RAS_CMD__SUCCESS;
 
+out:
+       kfree(trace);
        kfree(buffer);
-
-       return RAS_CMD__SUCCESS;
+       return ret;
 }
 
 static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
@@ -288,7 +297,8 @@ static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
        struct ras_cmd_batch_trace_record_rsp *output_data =
                        (struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw;
        struct ras_log_batch_overview overview;
-       struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
+       struct ras_log_info *trace_arry = NULL;
+       uint32_t trace_count = MAX_RECORD_PER_BATCH;
        struct ras_log_info *record;
        int i, j, count = 0, offset = 0;
        uint64_t id;
@@ -306,6 +316,10 @@ static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
            (input_data->start_batch_id >= overview.last_batch_id))
                return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
 
+       trace_arry = kcalloc(trace_count, sizeof(*trace_arry), GFP_KERNEL);
+       if (!trace_arry)
+               return RAS_CMD__ERROR_GENERIC;
+
        for (i = 0; i < input_data->batch_num; i++) {
                id = input_data->start_batch_id + i;
                if (id >= overview.last_batch_id) {
@@ -314,17 +328,17 @@ static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
                }
 
                count = ras_log_ring_get_batch_records(ras_core,
-                                       id, trace_arry, ARRAY_SIZE(trace_arry));
+                                       id, trace_arry, trace_count);
                if (count > 0) {
                        if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
                                break;
                        for (j = 0; j < count; j++) {
                                record = &output_data->records[offset + j];
-                               record->seqno = trace_arry[j]->seqno;
-                               record->timestamp = trace_arry[j]->timestamp;
-                               record->event = trace_arry[j]->event;
+                               record->seqno = trace_arry[j].seqno;
+                               record->timestamp = trace_arry[j].timestamp;
+                               record->event = trace_arry[j].event;
                                memcpy(&record->aca_reg,
-                                       &trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg));
+                                       &trace_arry[j].aca_reg, sizeof(trace_arry[j].aca_reg));
                        }
                } else {
                        count = 0;
@@ -343,6 +357,8 @@ static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
 
        cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp);
 
+       kfree(trace_arry);
+
        return RAS_CMD__SUCCESS;
 }
 
index 0fc7522b7ab6b64867f887002a56301e21aa946b..6e93a13bbc4cefdcbbab99e8449a6e1d990bdfb9 100644 (file)
@@ -175,14 +175,14 @@ static int fill_section_runtime(struct ras_core_context *ras_core,
 }
 
 static int cper_generate_runtime_record(struct ras_core_context *ras_core,
-       struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num,
+       struct cper_section_hdr *hdr, struct ras_log_info *trace_arr, uint32_t arr_num,
                enum ras_cper_severity sev)
 {
        struct cper_section_descriptor *descriptor;
        struct cper_section_runtime *runtime;
        int i;
 
-       fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]);
+       fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, &trace_arr[0]);
        hdr->record_length =  RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num);
        hdr->sec_cnt = arr_num;
        for (i = 0; i < arr_num; i++) {
@@ -194,21 +194,21 @@ static int cper_generate_runtime_record(struct ras_core_context *ras_core,
                fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
                        RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
                        sizeof(struct cper_section_runtime));
-               fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
+               fill_section_runtime(ras_core, runtime, &trace_arr[i], sev);
        }
 
        return 0;
 }
 
 static int cper_generate_fatal_record(struct ras_core_context *ras_core,
-       uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
+       uint8_t *buffer, struct ras_log_info *trace_arr, uint32_t arr_num)
 {
        struct ras_cper_fatal_record record = {0};
        int i = 0;
 
        for (i = 0; i < arr_num; i++) {
                fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL,
-                                RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
+                                RAS_CPER_SEV_FATAL_UE, &trace_arr[i]);
                record.hdr.record_length =  RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
                record.hdr.sec_cnt = 1;
 
@@ -216,7 +216,7 @@ static int cper_generate_fatal_record(struct ras_core_context *ras_core,
                                        CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal),
                                        sizeof(struct cper_section_fatal));
 
-               fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
+               fill_section_fatal(ras_core, &record.fatal, &trace_arr[i]);
 
                memcpy(buffer + (i * record.hdr.record_length),
                                &record, record.hdr.record_length);
@@ -271,7 +271,7 @@ static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event eve
 }
 
 int ras_cper_generate_cper(struct ras_core_context *ras_core,
-               struct ras_log_info **trace_list, uint32_t count,
+               struct ras_log_info *trace_list, uint32_t count,
                uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)
 {
        uint8_t *buffer = buf;
@@ -281,14 +281,14 @@ int ras_cper_generate_cper(struct ras_core_context *ras_core,
 
        /* All the batch traces share the same event */
        record_size = cper_get_record_size(
-                       cper_ras_log_event_to_cper_type(trace_list[0]->event), count);
+                       cper_ras_log_event_to_cper_type(trace_list[0].event), count);
 
        if ((record_size + saved_size) > buf_size)
                return -ENOMEM;
 
        hdr = (struct cper_section_hdr *)(buffer + saved_size);
 
-       switch (trace_list[0]->event) {
+       switch (trace_list[0].event) {
        case RAS_LOG_EVENT_RMA:
                cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA);
                break;
@@ -304,7 +304,7 @@ int ras_cper_generate_cper(struct ras_core_context *ras_core,
                cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count);
                break;
        default:
-               RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event);
+               RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0].event);
                break;
        }
 
index 076c1883c1ce4443d7712a982e2ac595bdb64ca1..e4e3615ecc2e02425f069c67c4c25f26ce2daa1e 100644 (file)
@@ -299,6 +299,6 @@ struct ras_cper_fatal_record {
 struct ras_core_context;
 struct ras_log_info;
 int ras_cper_generate_cper(struct ras_core_context *ras_core,
-               struct ras_log_info **trace_list, uint32_t count,
+               struct ras_log_info *trace_list, uint32_t count,
                uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len);
 #endif
index f10e856f3c8cf32c8bc907ea51942879d73b1bd5..c37343db39dfd98aef597ad591d44be5c4e3631f 100644 (file)
@@ -265,8 +265,8 @@ void ras_log_ring_add_log_event(struct ras_core_context *ras_core,
        ras_log_ring_add_data(ras_core, log, batch_tag);
 }
 
-static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context *ras_core,
-                                       uint64_t idx)
+static int ras_log_ring_lookup_data(struct ras_core_context *ras_core,
+                                       uint64_t idx, struct ras_log_info *log)
 {
        struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
        unsigned long flags = 0;
@@ -274,30 +274,27 @@ static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context *ra
 
        spin_lock_irqsave(&log_ring->spin_lock, flags);
        data = radix_tree_lookup(&log_ring->ras_log_root, idx);
+       if (data)
+               memcpy(log, data, sizeof(*log));
        spin_unlock_irqrestore(&log_ring->spin_lock, flags);
 
-       return (struct ras_log_info *)data;
+       return data ? 0 : -ENODATA;
 }
 
 int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
-               struct ras_log_info **log_arr, uint32_t arr_num)
+               struct ras_log_info *log_arr, uint32_t arr_num)
 {
        struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
        uint32_t i, idx, count = 0;
-       void *data;
 
-       if ((batch_id >= log_ring->mono_upward_batch_id) ||
+       if (!log_arr || !arr_num || (batch_id >= log_ring->mono_upward_batch_id) ||
                (batch_id < log_ring->last_del_batch_id))
                return -EINVAL;
 
-       for (i = 0; i < MAX_RECORD_PER_BATCH; i++) {
+       for (i = 0; i < MAX_RECORD_PER_BATCH && i < arr_num; i++) {
                idx = BATCH_IDX_TO_TREE_IDX(batch_id, i);
-               data = ras_log_ring_lookup_data(ras_core, idx);
-               if (data) {
-                       log_arr[count++] = data;
-                       if (count >= arr_num)
-                               break;
-               }
+               if (!ras_log_ring_lookup_data(ras_core, idx, &log_arr[count]))
+                       count++;
        }
 
        return count;
index 0ff6cc35678dde4c83d7ed0132a14e5794321c77..cb66beaa9f432e9d9bf323189707fa9f66b6fa38 100644 (file)
@@ -86,7 +86,7 @@ void ras_log_ring_add_log_event(struct ras_core_context *ras_core,
                enum ras_log_event event, void *data, struct ras_log_batch_tag *tag);
 
 int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_idx,
-               struct ras_log_info **log_arr, uint32_t arr_num);
+               struct ras_log_info *log_arr, uint32_t arr_num);
 
 int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
                struct ras_log_batch_overview *overview);