git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
accel/amdxdna: Add IOCTL parameter for telemetry data
author Lizhi Hou <lizhi.hou@amd.com>
Tue, 4 Nov 2025 06:25:45 +0000 (22:25 -0800)
committer Lizhi Hou <lizhi.hou@amd.com>
Tue, 4 Nov 2025 17:04:21 +0000 (09:04 -0800)
Extend DRM_IOCTL_AMDXDNA_GET_INFO to include additional parameters
that allow collection of telemetry data.

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://patch.msgid.link/20251104062546.833771-3-lizhi.hou@amd.com
drivers/accel/amdxdna/aie2_message.c
drivers/accel/amdxdna/aie2_msg_priv.h
drivers/accel/amdxdna/aie2_pci.c
drivers/accel/amdxdna/aie2_pci.h
drivers/accel/amdxdna/amdxdna_mailbox_helper.h
drivers/accel/amdxdna/amdxdna_pci_drv.c
include/uapi/drm/amdxdna_accel.h

index 39214253d804317cf43bd7a01d155334c1020e5c..69cdce9ff2083ceba6b9e5f370284b32273fa17f 100644 (file)
@@ -47,7 +47,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
                ndev->mgmt_chann = NULL;
        }
 
-       if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) {
+       if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
                XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
                         msg->opcode, *hdl->data);
                ret = -EINVAL;
@@ -336,11 +336,6 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
                goto fail;
        }
 
-       if (resp.status != AIE2_STATUS_SUCCESS) {
-               XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status);
-               ret = -EINVAL;
-               goto fail;
-       }
        XDNA_DBG(xdna, "Query NPU status completed");
 
        if (size < resp.size) {
@@ -362,6 +357,55 @@ fail:
        return ret;
 }
 
+int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev, /* Fetch firmware telemetry into a user buffer */
+                        char __user *buf, u32 size, /* destination and its capacity in bytes */
+                        struct amdxdna_drm_query_telemetry_header *header) /* in: type; out: major, minor */
+{ /* Returns 0, -EINVAL, -ENOMEM, -EFAULT, or the error from aie2_send_mgmt_msg_wait() */
+       DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY); /* NOTE(review): presumably declares req, resp and msg used below - confirm */
+       struct amdxdna_dev *xdna = ndev->xdna;
+       dma_addr_t dma_addr;
+       u8 *addr;
+       int ret;
+
+       if (header->type >= MAX_TELEMETRY_TYPE) /* reject out-of-range types before allocating */
+               return -EINVAL;
+
+       addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr, /* kernel buffer the device writes into */
+                                    DMA_FROM_DEVICE, GFP_KERNEL);
+       if (!addr)
+               return -ENOMEM;
+
+       req.buf_addr = dma_addr; /* request carries the DMA address, capacity and type */
+       req.buf_size = size;
+       req.type = header->type;
+
+       drm_clflush_virt_range(addr, size); /* flush CPU cache lines so the device can access the buffer */
+       ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+       if (ret) {
+               XDNA_ERR(xdna, "Query telemetry failed, status %d", ret);
+               goto free_buf;
+       }
+
+       if (size < resp.size) { /* reported length exceeds the buffer: do not copy */
+               ret = -EINVAL;
+               XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", size, resp.size);
+               goto free_buf;
+       }
+
+       if (copy_to_user(buf, addr, resp.size)) { /* copy only the reported length, not the whole buffer */
+               ret = -EFAULT;
+               XDNA_ERR(xdna, "Failed to copy telemetry to user space");
+               goto free_buf;
+       }
+
+       header->major = resp.major; /* version fields are filled in only on full success */
+       header->minor = resp.minor;
+
+free_buf: /* shared exit: ret is still 0 when reached on the success path */
+       dma_free_noncoherent(xdna->ddev.dev, size, addr, dma_addr, DMA_FROM_DEVICE);
+       return ret;
+}
+
 int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
                                 void *handle, int (*cb)(void*, void __iomem *, size_t))
 {
index 945140011763684eb3e37eb8092a43d82a7269a6..947daa63f0643b48a23606fdda2c1a41929121c8 100644 (file)
@@ -9,7 +9,8 @@
 enum aie2_msg_opcode {
        MSG_OP_CREATE_CONTEXT              = 0x2,
        MSG_OP_DESTROY_CONTEXT             = 0x3,
-       MSG_OP_SYNC_BO                     = 0x7,
+       MSG_OP_GET_TELEMETRY               = 0x4,
+       MSG_OP_SYNC_BO                     = 0x7,
        MSG_OP_EXECUTE_BUFFER_CF           = 0xC,
        MSG_OP_QUERY_COL_STATUS            = 0xD,
        MSG_OP_QUERY_AIE_TILE_INFO         = 0xE,
@@ -137,6 +138,28 @@ struct destroy_ctx_resp {
        enum aie2_msg_status    status;
 } __packed;
 
+enum telemetry_type { /* Telemetry category carried in get_telemetry_req.type */
+       TELEMETRY_TYPE_DISABLED,
+       TELEMETRY_TYPE_HEALTH,
+       TELEMETRY_TYPE_ERROR_INFO,
+       TELEMETRY_TYPE_PROFILING,
+       TELEMETRY_TYPE_DEBUG,
+       MAX_TELEMETRY_TYPE /* exclusive upper bound: aie2_query_telemetry() rejects types >= this */
+};
+
+struct get_telemetry_req { /* Request payload sent with MSG_OP_GET_TELEMETRY */
+       enum telemetry_type     type; /* category requested; taken from the user-supplied header */
+       __u64   buf_addr; /* DMA address of the buffer the data is written to */
+       __u32   buf_size; /* capacity of that buffer in bytes */
+} __packed;
+
+struct get_telemetry_resp { /* Response payload for MSG_OP_GET_TELEMETRY */
+       __u32   major; /* copied to the user-visible header's major field */
+       __u32   minor; /* copied to the user-visible header's minor field */
+       __u32   size; /* byte count copied to user space; must not exceed the request's buf_size */
+       enum aie2_msg_status    status; /* NOTE(review): presumably the field hdl->status points at - verify in DECLARE_AIE2_MSG */
+} __packed;
+
 struct execute_buffer_req {
        __u32   cu_idx;
        __u32   payload[19];
index 396dc6e060077b92b22e8e2de4c085d1846354e3..d7ccbdaf47f5f30c16c6a97d20eddd4b88063069 100644 (file)
@@ -862,6 +862,76 @@ static int aie2_query_resource_info(struct amdxdna_client *client,
        return 0;
 }
 
+static int aie2_fill_hwctx_map(struct amdxdna_hwctx *hwctx, void *arg) /* per-context callback handed to amdxdna_hwctx_walk() */
+{ /* Returns 0 on success, -EINVAL when fw_ctx_id is not below hwctx_limit */
+       struct amdxdna_dev *xdna = hwctx->client->xdna;
+       u32 *map = arg; /* table supplied by the caller of the walk */
+
+       if (hwctx->fw_ctx_id >= xdna->dev_handle->priv->hwctx_limit) { /* hwctx_limit bounds the valid indices */
+               XDNA_ERR(xdna, "Invalid fw ctx id %d/%d ", hwctx->fw_ctx_id,
+                        xdna->dev_handle->priv->hwctx_limit);
+               return -EINVAL;
+       }
+
+       map[hwctx->fw_ctx_id] = hwctx->id; /* slot index is the firmware context ID, value is the hwctx ID */
+       return 0;
+}
+
+static int aie2_get_telemetry(struct amdxdna_client *client, /* handler for DRM_AMDXDNA_QUERY_TELEMETRY */
+                             struct amdxdna_drm_get_info *args) /* user buffer: header and map, then telemetry data */
+{ /* Returns 0, -EINVAL, -ENOMEM, -EFAULT, or the error from the hwctx walk or firmware query */
+       struct amdxdna_drm_query_telemetry_header *header __free(kfree) = NULL; /* freed automatically on every return */
+       u32 telemetry_data_sz, header_sz, elem_num;
+       struct amdxdna_dev *xdna = client->xdna;
+       struct amdxdna_client *tmp_client;
+       int ret;
+
+       elem_num = xdna->dev_handle->priv->hwctx_limit; /* one map slot per possible firmware context ID */
+       header_sz = struct_size(header, map, elem_num);
+       if (args->buffer_size <= header_sz) { /* need the header plus at least one byte of data */
+               XDNA_ERR(xdna, "Invalid buffer size");
+               return -EINVAL;
+       }
+
+       telemetry_data_sz = args->buffer_size - header_sz;
+       if (telemetry_data_sz > SZ_4M) { /* caps the size later handed to aie2_query_telemetry() */
+               XDNA_ERR(xdna, "Buffer size is too big, %u", telemetry_data_sz); /* %u: the value is u32 and may exceed INT_MAX */
+               return -EINVAL;
+       }
+
+       header = kzalloc(header_sz, GFP_KERNEL); /* zeroed, so unused map slots read back as 0 */
+       if (!header)
+               return -ENOMEM;
+
+       if (copy_from_user(header, u64_to_user_ptr(args->buffer), sizeof(*header))) { /* fixed fields only; the map is built below */
+               XDNA_ERR(xdna, "Failed to copy telemetry header from user");
+               return -EFAULT;
+       }
+
+       header->map_num_elements = elem_num; /* overrides whatever user space supplied */
+       list_for_each_entry(tmp_client, &xdna->client_list, node) { /* record the contexts of every client */
+               ret = amdxdna_hwctx_walk(tmp_client, &header->map,
+                                        aie2_fill_hwctx_map);
+               if (ret)
+                       return ret;
+       }
+
+       ret = aie2_query_telemetry(xdna->dev_handle,
+                                  u64_to_user_ptr(args->buffer + header_sz), /* data lands right after the header */
+                                  telemetry_data_sz, header);
+       if (ret) {
+               XDNA_ERR(xdna, "Query telemetry failed ret %d", ret);
+               return ret;
+       }
+
+       if (copy_to_user(u64_to_user_ptr(args->buffer), header, header_sz)) { /* header is written back only after the data query succeeded */
+               XDNA_ERR(xdna, "Copy header failed");
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
 static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args)
 {
        struct amdxdna_dev *xdna = client->xdna;
@@ -896,6 +966,9 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i
        case DRM_AMDXDNA_GET_POWER_MODE:
                ret = aie2_get_power_mode(client, args);
                break;
+       case DRM_AMDXDNA_QUERY_TELEMETRY:
+               ret = aie2_get_telemetry(client, args);
+               break;
        case DRM_AMDXDNA_QUERY_RESOURCE_INFO:
                ret = aie2_query_resource_info(client, args);
                break;
index a79f4f71ff6b644a5e399feffa2015c9b4f3c965..9793cd1e0c558c73c9239f207d8d56377fc06703 100644 (file)
@@ -305,6 +305,9 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
 int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled);
+int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
+                        char __user *buf, u32 size,
+                        struct amdxdna_drm_query_telemetry_header *header);
 int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
                                 void *handle, int (*cb)(void*, void __iomem *, size_t));
 int aie2_config_cu(struct amdxdna_hwctx *hwctx,
index 710ff8873d6172ec6592a6d4cead5396a046b9d1..556c712cad0a434c84afb143d7528fc35693f106 100644 (file)
@@ -16,16 +16,18 @@ struct xdna_notify {
        u32                     *data;
        size_t                  size;
        int                     error;
+       u32                     *status;
 };
 
-#define DECLARE_XDNA_MSG_COMMON(name, op, status)                      \
+#define DECLARE_XDNA_MSG_COMMON(name, op, s)                           \
        struct name##_req       req = { 0 };                            \
-       struct name##_resp      resp = { status };                      \
+       struct name##_resp      resp = { .status = s };                 \
        struct xdna_notify      hdl = {                                 \
                .error = 0,                                             \
                .data = (u32 *)&resp,                                   \
                .size = sizeof(resp),                                   \
                .comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp),       \
+               .status = (u32 *)&resp.status,                          \
        };                                                              \
        struct xdna_mailbox_msg msg = {                                 \
                .send_data = (u8 *)&req,                                \
index af943a603ad1589a79f236df32c0176cd3e9543a..7590265d4485eebb82eed92a2c876cfbd5453fcd 100644 (file)
@@ -30,9 +30,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
  * 0.2: Support getting last error hardware error
  * 0.3: Support firmware debug buffer
  * 0.4: Support getting resource information
+ * 0.5: Support getting telemetry data
  */
 #define AMDXDNA_DRIVER_MAJOR           0
-#define AMDXDNA_DRIVER_MINOR           4
+#define AMDXDNA_DRIVER_MINOR           5
 
 /*
  * Bind the driver base on (vendor_id, device_id) pair and later use the
index 8b679c38d30864eb753e07031fd30be7f29dd0fb..8ad254bc35a5b9fd3f625a65df5eac6dfd91f077 100644 (file)
@@ -442,6 +442,7 @@ enum amdxdna_drm_get_param {
        DRM_AMDXDNA_QUERY_HW_CONTEXTS,
        DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8,
        DRM_AMDXDNA_GET_POWER_MODE,
+       DRM_AMDXDNA_QUERY_TELEMETRY,
        DRM_AMDXDNA_QUERY_RESOURCE_INFO = 12,
 };
 
@@ -461,6 +462,22 @@ struct amdxdna_drm_get_resource_info {
        __u64 npu_task_curr;
 };
 
+/**
+ * struct amdxdna_drm_query_telemetry_header - Header at the start of the telemetry query buffer; telemetry data follows the map
+ */
+struct amdxdna_drm_query_telemetry_header {
+       /** @major: Firmware telemetry interface major version number, filled in by the driver on success */
+       __u32 major;
+       /** @minor: Firmware telemetry interface minor version number, filled in by the driver on success */
+       __u32 minor;
+       /** @type: Telemetry query type, supplied by user space; out-of-range values are rejected with -EINVAL */
+       __u32 type;
+       /** @map_num_elements: Total number of elements in the map table, set by the driver */
+       __u32 map_num_elements;
+       /** @map: Table indexed by firmware context ID; each used entry holds the matching hardware context ID */
+       __u32 map[];
+};
+
 /**
  * struct amdxdna_drm_get_info - Get some information from the AIE hardware.
  * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer.