]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
accel/qaic: Implement basic SSR handling
authorJeffrey Hugo <jhugo@codeaurora.org>
Fri, 31 Oct 2025 17:41:02 +0000 (10:41 -0700)
committerJeff Hugo <jeff.hugo@oss.qualcomm.com>
Fri, 7 Nov 2025 18:01:18 +0000 (11:01 -0700)
Subsystem restart (SSR) for a qaic device means that a NSP has crashed,
and will be restarted.  However the restart process will lose any state
associated with activation, so the user will need to do some recovery.

While SSR has the provision to collect a crash dump, this patch does not
implement support for it.

Co-developed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Co-developed-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy@quicinc.com>
Signed-off-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy@quicinc.com>
Co-developed-by: Troy Hanson <quic_thanson@quicinc.com>
Signed-off-by: Troy Hanson <quic_thanson@quicinc.com>
Co-developed-by: Aswin Venkatesan <aswivenk@qti.qualcomm.com>
Signed-off-by: Aswin Venkatesan <aswivenk@qti.qualcomm.com>
Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
Signed-off-by: Youssef Samir <youssef.abdulrahman@oss.qualcomm.com>
Signed-off-by: Zack McKevitt <zachary.mckevitt@oss.qualcomm.com>
Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
[jhugo: Fix minor checkpatch whitespace issues]
Signed-off-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Link: https://patch.msgid.link/20251031174059.2814445-3-zachary.mckevitt@oss.qualcomm.com
Documentation/accel/qaic/aic100.rst
drivers/accel/qaic/Makefile
drivers/accel/qaic/qaic.h
drivers/accel/qaic/qaic_data.c
drivers/accel/qaic/qaic_drv.c
drivers/accel/qaic/qaic_ssr.c [new file with mode: 0644]
drivers/accel/qaic/qaic_ssr.h [new file with mode: 0644]

index 273da6192fb34662045e4300cafb1bbac7692c78..3b287c3987d2693183a5c29122bc51b10305cedb 100644 (file)
@@ -487,8 +487,8 @@ one user crashes, the fallout of that should be limited to that workload and not
 impact other workloads. SSR accomplishes this.
 
 If a particular workload crashes, QSM notifies the host via the QAIC_SSR MHI
-channel. This notification identifies the workload by it's assigned DBC. A
-multi-stage recovery process is then used to cleanup both sides, and get the
+channel. This notification identifies the workload by its assigned DBC. A
+multi-stage recovery process is then used to clean up both sides and get the
 DBC/NSPs into a working state.
 
 When SSR occurs, any state in the workload is lost. Any inputs that were in
@@ -496,6 +496,26 @@ process, or queued by not yet serviced, are lost. The loaded artifacts will
 remain in on-card DDR, but the host will need to re-activate the workload if
 it desires to recover the workload.
 
+When SSR occurs for a specific NSP, the assigned DBC goes through the
+following state transitions in order:
+DBC_STATE_BEFORE_SHUTDOWN
+       Indicates that the affected NSP was found in an unrecoverable error
+       condition.
+DBC_STATE_AFTER_SHUTDOWN
+       Indicates that the NSP is under reset.
+DBC_STATE_BEFORE_POWER_UP
+       Indicates that the NSP's debug information has been collected, and is
+       ready to be retrieved by the host (if desired). At that stage the NSP
+       is restarted by QSM.
+DBC_STATE_AFTER_POWER_UP
+       Indicates that the NSP has been restarted, is fully operational, and is
+       in idle state.
+
+SSR also has an optional crashdump collection feature. If enabled, the host can
+collect the memory dump for the crashed NSP and dump it to the user space via
+the dev_coredump subsystem. The host can also decline the crashdump collection
+request from the device.
+
 Reliability, Accessibility, Serviceability (RAS)
 ================================================
 
index 8f6746e5f03aa3396ae47f460f1c8aecc86b6159..71f727b74da3bb4478324689f02a7cea24a05c2d 100644 (file)
@@ -11,6 +11,7 @@ qaic-y := \
        qaic_data.o \
        qaic_drv.o \
        qaic_ras.o \
+       qaic_ssr.o \
        qaic_sysfs.o \
        qaic_timesync.o \
        sahara.o
index cbaec577c457a2a1cdc4997edd917137fdf81210..b9ae1c256c7ac2f93efb9d7bf5ae5d0ae14c8c76 100644 (file)
@@ -21,6 +21,7 @@
 
 #define QAIC_DBC_BASE          SZ_128K
 #define QAIC_DBC_SIZE          SZ_4K
+#define QAIC_SSR_DBC_SENTINEL  U32_MAX /* No ongoing SSR sentinel */
 
 #define QAIC_NO_PARTITION      -1
 
@@ -197,6 +198,12 @@ struct qaic_device {
        unsigned int            ue_count;
        /* Un-correctable non-fatal error count */
        unsigned int            ue_nf_count;
+       /* MHI SSR channel device */
+       struct mhi_device       *ssr_ch;
+       /* Work queue for tasks related to MHI SSR device */
+       struct workqueue_struct *ssr_wq;
+       /* DBC which is under SSR. Sentinel U32_MAX means no SSR is in progress */
+       u32                     ssr_dbc;
 };
 
 struct qaic_drm_device {
@@ -340,6 +347,8 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv);
 int qaic_detach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv);
 void irq_polling_work(struct work_struct *work);
+void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id);
+void qaic_dbc_exit_ssr(struct qaic_device *qdev);
 
 /* qaic_sysfs.c */
 int qaic_sysfs_init(struct qaic_drm_device *qddev);
index fa723a2bdfa91689835652fc6fb87cd44223df78..50a2c3482fd122f079b92f45ec59ee8808128fd6 100644 (file)
@@ -1047,6 +1047,11 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi
                goto unlock_ch_srcu;
        }
 
+       if (dbc->id == qdev->ssr_dbc) {
+               ret = -EPIPE;
+               goto unlock_ch_srcu;
+       }
+
        ret = qaic_prepare_bo(qdev, bo, &args->hdr);
        if (ret)
                goto unlock_ch_srcu;
@@ -1370,6 +1375,11 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr
                goto release_ch_rcu;
        }
 
+       if (dbc->id == qdev->ssr_dbc) {
+               ret = -EPIPE;
+               goto release_ch_rcu;
+       }
+
        ret = mutex_lock_interruptible(&dbc->req_lock);
        if (ret)
                goto release_ch_rcu;
@@ -1722,6 +1732,11 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
                goto unlock_ch_srcu;
        }
 
+       if (dbc->id == qdev->ssr_dbc) {
+               ret = -EPIPE;
+               goto unlock_ch_srcu;
+       }
+
        obj = drm_gem_object_lookup(file_priv, args->handle);
        if (!obj) {
                ret = -ENOENT;
@@ -1742,6 +1757,9 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
        if (!dbc->usr)
                ret = -EPERM;
 
+       if (dbc->id == qdev->ssr_dbc)
+               ret = -EPIPE;
+
 put_obj:
        drm_gem_object_put(obj);
 unlock_ch_srcu:
@@ -1945,6 +1963,17 @@ static void empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *db
        spin_unlock_irqrestore(&dbc->xfer_lock, flags);
 }
 
+/*
+ * Drain a DBC's transfer list on both sides of an SRCU grace period. The
+ * second drain catches requests that threads still inside the channel's
+ * read-side critical section may have added after the first drain.
+ */
+static void sync_empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *dbc)
+{
+       empty_xfer_list(qdev, dbc);
+       synchronize_srcu(&dbc->ch_lock);
+       /*
+        * Threads holding the channel lock may have added more elements to the
+        * xfer_list. Flush those elements out as well.
+        */
+       empty_xfer_list(qdev, dbc);
+}
+
 int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr)
 {
        if (!qdev->dbc[dbc_id].usr || qdev->dbc[dbc_id].usr->handle != usr->handle)
@@ -1973,13 +2002,7 @@ void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id)
        struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id];
 
        dbc->usr = NULL;
-       empty_xfer_list(qdev, dbc);
-       synchronize_srcu(&dbc->ch_lock);
-       /*
-        * Threads holding channel lock, may add more elements in the xfer_list.
-        * Flush out these elements from xfer_list.
-        */
-       empty_xfer_list(qdev, dbc);
+       sync_empty_xfer_list(qdev, dbc);
 }
 
 void release_dbc(struct qaic_device *qdev, u32 dbc_id)
@@ -2020,3 +2043,30 @@ void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail)
        *head = readl(dbc->dbc_base + REQHP_OFF);
        *tail = readl(dbc->dbc_base + REQTP_OFF);
 }
+
+/**
+ * qaic_dbc_enter_ssr() - Prepare to enter subsystem reset (SSR) for a DBC.
+ * @qdev: qaic device handle
+ * @dbc_id: ID of the DBC which will enter SSR
+ *
+ * The device will automatically deactivate the workload as not all errors can
+ * be silently recovered. The user will be notified and will need to decide
+ * the required recovery action to take.
+ *
+ * Recording @dbc_id in qdev->ssr_dbc makes the attach/execute/wait ioctl
+ * paths fail with -EPIPE for this DBC, and release_dbc() tears down the
+ * in-flight state for it.
+ */
+void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id)
+{
+       qdev->ssr_dbc = dbc_id;
+       release_dbc(qdev, dbc_id);
+}
+
+/**
+ * qaic_dbc_exit_ssr() - Exit subsystem reset (SSR).
+ * @qdev: qaic device handle
+ *
+ * Resets qdev->ssr_dbc to the sentinel value so that no DBC is considered
+ * under SSR; the DBC returns to an operational state and begins accepting
+ * work after exiting SSR.
+ */
+void qaic_dbc_exit_ssr(struct qaic_device *qdev)
+{
+       qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL;
+}
index a9fcb9782d27a2cd37e36c44ae0e601dd1409c9b..4d9f4f149b3dd4f0d209e2cfdcaed1ea74f24c97 100644 (file)
@@ -30,6 +30,7 @@
 #include "qaic.h"
 #include "qaic_debugfs.h"
 #include "qaic_ras.h"
+#include "qaic_ssr.h"
 #include "qaic_timesync.h"
 #include "sahara.h"
 
@@ -390,6 +391,7 @@ void qaic_dev_reset_clean_local_state(struct qaic_device *qdev)
        qaic_notify_reset(qdev);
 
        /* start tearing things down */
+       qaic_clean_up_ssr(qdev);
        for (i = 0; i < qdev->num_dbc; ++i)
                release_dbc(qdev, i);
 }
@@ -439,11 +441,18 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev,
        qdev->qts_wq = qaicm_wq_init(drm, "qaic_ts");
        if (IS_ERR(qdev->qts_wq))
                return NULL;
+       qdev->ssr_wq = qaicm_wq_init(drm, "qaic_ssr");
+       if (IS_ERR(qdev->ssr_wq))
+               return NULL;
 
        ret = qaicm_srcu_init(drm, &qdev->dev_lock);
        if (ret)
                return NULL;
 
+       ret = qaic_ssr_init(qdev);
+       if (ret)
+               pci_info(pdev, "QAIC SSR crashdump collection not supported.\n");
+
        qdev->qddev = qddev;
        qdev->pdev = pdev;
        qddev->qdev = qdev;
@@ -799,9 +808,16 @@ static int __init qaic_init(void)
        ret = qaic_ras_register();
        if (ret)
                pr_debug("qaic: qaic_ras_register failed %d\n", ret);
+       ret = qaic_ssr_register();
+       if (ret) {
+               pr_debug("qaic: qaic_ssr_register failed %d\n", ret);
+               goto free_bootlog;
+       }
 
        return 0;
 
+free_bootlog:
+       qaic_bootlog_unregister();
 free_mhi:
        mhi_driver_unregister(&qaic_mhi_driver);
 free_pci:
@@ -827,6 +843,7 @@ static void __exit qaic_exit(void)
         * reinitializing the link_up state after the cleanup is done.
         */
        link_up = true;
+       qaic_ssr_unregister();
        qaic_ras_unregister();
        qaic_bootlog_unregister();
        qaic_timesync_deinit();
diff --git a/drivers/accel/qaic/qaic_ssr.c b/drivers/accel/qaic/qaic_ssr.c
new file mode 100644 (file)
index 0000000..e9c7fe0
--- /dev/null
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. */
+/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. */
+
+#include <asm/byteorder.h>
+#include <drm/drm_file.h>
+#include <drm/drm_managed.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mhi.h>
+#include <linux/workqueue.h>
+
+#include "qaic.h"
+#include "qaic_ssr.h"
+
+#define SSR_RESP_MSG_SZ 32
+
+#define DEBUG_TRANSFER_INFO            BIT(0)
+#define DEBUG_TRANSFER_INFO_RSP                BIT(1)
+#define MEMORY_READ                    BIT(2)
+#define MEMORY_READ_RSP                        BIT(3)
+#define DEBUG_TRANSFER_DONE            BIT(4)
+#define DEBUG_TRANSFER_DONE_RSP                BIT(5)
+#define SSR_EVENT                      BIT(8)
+#define SSR_EVENT_RSP                  BIT(9)
+
+#define SSR_EVENT_NACK         BIT(0)
+#define BEFORE_SHUTDOWN                BIT(1)
+#define AFTER_SHUTDOWN         BIT(2)
+#define BEFORE_POWER_UP                BIT(3)
+#define AFTER_POWER_UP         BIT(4)
+
+/* SSR message header as it appears on the wire (little-endian) */
+struct _ssr_hdr {
+       __le32 cmd;
+       __le32 len;
+       __le32 dbc_id;
+};
+
+/* SSR message header after in-place conversion to CPU byte order */
+struct ssr_hdr {
+       u32 cmd;
+       u32 len;
+       u32 dbc_id;
+};
+
+/* Response declining (or accepting) a crash dump transfer offer */
+struct ssr_debug_transfer_info_rsp {
+       struct _ssr_hdr hdr;
+       __le32 ret;
+} __packed;
+
+/* SSR_EVENT notification; decoded in place, hence the CPU-order header */
+struct ssr_event {
+       struct ssr_hdr hdr;
+       u32 event;
+} __packed;
+
+/* Acknowledgment sent back to the device for an SSR_EVENT */
+struct ssr_event_rsp {
+       struct _ssr_hdr hdr;
+       __le32 event;
+} __packed;
+
+/* Wrapper around the single DL receive buffer for the QAIC_SSR channel */
+struct ssr_resp {
+       /* Work struct to schedule work coming on QAIC_SSR channel */
+       struct work_struct work;
+       /* Root struct of device, used to access device resources */
+       struct qaic_device *qdev;
+       /* Buffer used by MHI for transfer requests */
+       u8 data[] __aligned(8);
+};
+
+/* Reset SSR bookkeeping during device-level reset/teardown */
+void qaic_clean_up_ssr(struct qaic_device *qdev)
+{
+       qaic_dbc_exit_ssr(qdev);
+}
+
+/*
+ * ssr_worker() - Process one message received on the QAIC_SSR MHI channel.
+ * @work: Work struct embedded in the ssr_resp whose data[] holds the message.
+ *
+ * Decodes the little-endian header in place, validates the length and DBC id,
+ * then dispatches on the command: crash dump offers (DEBUG_TRANSFER_INFO) are
+ * declined, and SSR_EVENT notifications advance the DBC state machine and are
+ * acked back to the device. The receive buffer is requeued to MHI at the end
+ * so the next SSR message can be received.
+ */
+static void ssr_worker(struct work_struct *work)
+{
+       struct ssr_resp *resp = container_of(work, struct ssr_resp, work);
+       struct ssr_hdr *hdr = (struct ssr_hdr *)resp->data;
+       struct ssr_debug_transfer_info_rsp *debug_rsp;
+       struct qaic_device *qdev = resp->qdev;
+       struct ssr_event_rsp *event_rsp;
+       struct ssr_event *event;
+       u32 ssr_event_ack;
+       int ret;
+
+       /* Convert the wire (little-endian) header to CPU order in place */
+       le32_to_cpus(&hdr->cmd);
+       le32_to_cpus(&hdr->len);
+       le32_to_cpus(&hdr->dbc_id);
+
+       /* Device input is untrusted; drop malformed messages */
+       if (hdr->len > SSR_RESP_MSG_SZ)
+               goto out;
+
+       if (hdr->dbc_id >= qdev->num_dbc)
+               goto out;
+
+       switch (hdr->cmd) {
+       case DEBUG_TRANSFER_INFO:
+               /* Decline crash dump request from the device */
+               debug_rsp = kmalloc(sizeof(*debug_rsp), GFP_KERNEL);
+               if (!debug_rsp)
+                       break;
+
+               debug_rsp->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_INFO_RSP);
+               debug_rsp->hdr.len = cpu_to_le32(sizeof(*debug_rsp));
+               /*
+                * Fix: use hdr->dbc_id here. The previous code read
+                * event->hdr.dbc_id, but 'event' is only assigned in the
+                * SSR_EVENT case below, so this dereferenced an uninitialized
+                * pointer on this path.
+                */
+               debug_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id);
+               debug_rsp->ret = cpu_to_le32(1);
+
+               ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE,
+                                   debug_rsp, sizeof(*debug_rsp), MHI_EOT);
+               if (ret) {
+                       pci_warn(qdev->pdev, "Could not send DEBUG_TRANSFER_INFO_RSP %d\n", ret);
+                       kfree(debug_rsp);
+               }
+               /*
+                * Fix: fall through to 'out' instead of returning. Returning
+                * here left the DL receive buffer unqueued, so no further SSR
+                * messages (e.g. the subsequent SSR_EVENTs) could be received.
+                */
+               break;
+       case SSR_EVENT:
+               event = (struct ssr_event *)hdr;
+               le32_to_cpus(&event->event);
+               ssr_event_ack = event->event;
+
+               switch (event->event) {
+               case BEFORE_SHUTDOWN:
+                       set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_SHUTDOWN);
+                       qaic_dbc_enter_ssr(qdev, hdr->dbc_id);
+                       break;
+               case AFTER_SHUTDOWN:
+                       set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_SHUTDOWN);
+                       break;
+               case BEFORE_POWER_UP:
+                       set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_POWER_UP);
+                       break;
+               case AFTER_POWER_UP:
+                       set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_POWER_UP);
+                       break;
+               default:
+                       /* Unknown event: still acked below with its raw value */
+                       break;
+               }
+
+               event_rsp = kmalloc(sizeof(*event_rsp), GFP_KERNEL);
+               if (!event_rsp)
+                       break;
+
+               event_rsp->hdr.cmd = cpu_to_le32(SSR_EVENT_RSP);
+               event_rsp->hdr.len = cpu_to_le32(sizeof(*event_rsp));
+               event_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id);
+               event_rsp->event = cpu_to_le32(ssr_event_ack);
+
+               ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, event_rsp, sizeof(*event_rsp),
+                                   MHI_EOT);
+               if (ret)
+                       kfree(event_rsp);
+
+               /* SSR is complete once the NSP is powered back up */
+               if (event->event == AFTER_POWER_UP) {
+                       qaic_dbc_exit_ssr(qdev);
+                       set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_IDLE);
+               }
+
+               break;
+       default:
+               break;
+       }
+
+out:
+       /* Recycle the receive buffer for the next SSR message */
+       ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT);
+       if (ret)
+               kfree(resp);
+}
+
+/*
+ * Bind to the QAIC_SSR MHI channel: prepare the channel, allocate the single
+ * receive buffer plus its work item, and queue it for the first downlink
+ * message. Once queued, the buffer is owned by MHI; it is recycled by
+ * ssr_worker() or freed in the DL callback on a transaction error.
+ */
+static int qaic_ssr_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
+{
+       struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev));
+       struct ssr_resp *resp;
+       int ret;
+
+       ret = mhi_prepare_for_transfer(mhi_dev);
+       if (ret)
+               return ret;
+
+       /* kzalloc so the flexible data[] receive area starts zeroed */
+       resp = kzalloc(sizeof(*resp) + SSR_RESP_MSG_SZ, GFP_KERNEL);
+       if (!resp) {
+               mhi_unprepare_from_transfer(mhi_dev);
+               return -ENOMEM;
+       }
+
+       resp->qdev = qdev;
+       INIT_WORK(&resp->work, ssr_worker);
+
+       ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT);
+       if (ret) {
+               kfree(resp);
+               mhi_unprepare_from_transfer(mhi_dev);
+               return ret;
+       }
+
+       dev_set_drvdata(&mhi_dev->dev, qdev);
+       qdev->ssr_ch = mhi_dev;
+
+       return 0;
+}
+
+/*
+ * Unbind from the QAIC_SSR channel and clear the cached channel pointer.
+ * NOTE(review): the in-flight receive buffer queued by probe is not freed
+ * here — presumably it is returned via the DL callback during unprepare;
+ * confirm against the MHI core's channel-reset behavior.
+ */
+static void qaic_ssr_mhi_remove(struct mhi_device *mhi_dev)
+{
+       struct qaic_device *qdev;
+
+       qdev = dev_get_drvdata(&mhi_dev->dev);
+       mhi_unprepare_from_transfer(qdev->ssr_ch);
+       qdev->ssr_ch = NULL;
+}
+
+/* Uplink (host->device) transfer done: free the response buffer we queued */
+static void qaic_ssr_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
+{
+       kfree(mhi_result->buf_addr);
+}
+
+/*
+ * Downlink (device->host) transfer done: recover the ssr_resp wrapper from
+ * the data[] pointer handed to MHI, free it on transaction error, otherwise
+ * defer processing to the SSR workqueue (this callback runs in MHI context).
+ */
+static void qaic_ssr_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
+{
+       struct ssr_resp *resp = container_of(mhi_result->buf_addr, struct ssr_resp, data);
+       struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev);
+
+       if (mhi_result->transaction_status) {
+               kfree(resp);
+               return;
+       }
+       queue_work(qdev->ssr_wq, &resp->work);
+}
+
+/* Match only the dedicated QAIC_SSR MHI channel */
+static const struct mhi_device_id qaic_ssr_mhi_match_table[] = {
+       { .chan = "QAIC_SSR", },
+       {},
+};
+
+static struct mhi_driver qaic_ssr_mhi_driver = {
+       .id_table = qaic_ssr_mhi_match_table,
+       .remove = qaic_ssr_mhi_remove,
+       .probe = qaic_ssr_mhi_probe,
+       .ul_xfer_cb = qaic_ssr_mhi_ul_xfer_cb,
+       .dl_xfer_cb = qaic_ssr_mhi_dl_xfer_cb,
+       .driver = {
+               .name = "qaic_ssr",
+       },
+};
+
+/* Per-device SSR init: start with the no-SSR-in-progress sentinel */
+int qaic_ssr_init(struct qaic_device *qdev)
+{
+       qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL;
+       return 0;
+}
+
+/* Register the QAIC_SSR MHI channel driver; called once at module init */
+int qaic_ssr_register(void)
+{
+       return mhi_driver_register(&qaic_ssr_mhi_driver);
+}
+
+/* Unregister the QAIC_SSR MHI channel driver; called once at module exit */
+void qaic_ssr_unregister(void)
+{
+       mhi_driver_unregister(&qaic_ssr_mhi_driver);
+}
diff --git a/drivers/accel/qaic/qaic_ssr.h b/drivers/accel/qaic/qaic_ssr.h
new file mode 100644 (file)
index 0000000..02cd9ee
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2021, 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef __QAIC_SSR_H__
+#define __QAIC_SSR_H__
+
+struct qaic_device;
+
+/* Register/unregister the QAIC_SSR MHI channel driver (module init/exit) */
+int qaic_ssr_register(void);
+void qaic_ssr_unregister(void);
+/* Reset SSR state during device-level reset/teardown */
+void qaic_clean_up_ssr(struct qaic_device *qdev);
+/* Per-device SSR init; currently only sets the no-SSR sentinel */
+int qaic_ssr_init(struct qaic_device *qdev);
+#endif /* __QAIC_SSR_H__ */