git.ipfire.org Git - thirdparty/linux.git/commitdiff
accel/habanalabs/gaudi2: get the correct QM CQ info upon an error
author Tomer Tayar <ttayar@habana.ai>
Mon, 6 Nov 2023 16:41:35 +0000 (18:41 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Tue, 19 Dec 2023 09:09:43 +0000 (11:09 +0200)
Upon a QM error, the address/size from both the CQ and the ARC_CQ are
printed, although the instruction that led to the error was received
from only one of them.

Moreover, in case of a QM undefined opcode, only one of these
address/size sets is captured, chosen based on the value of ARC_CQ_PTR.
However, this value can be non-zero even when the CQ is the one
currently in use, if the CQ and the ARC_CQ are used alternately.

Under the assumption of a stop-on-error configuration, modify the handler
to use the CP_STS.CUR_CQ field to get the relevant CQ for the QM error.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/gaudi2/gaudi2.c
drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h

index 5075f92d15ccf04f5e6d50a7459352c5797ce06f..77c480725a84b97e1011cfcc05836b3266d0a363 100644 (file)
@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 
 static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
 {
-       u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
-       u64 cq_ptr, arc_cq_ptr, cp_current_inst;
-
-       lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
-       hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
-       cq_ptr = ((u64) hi) << 32 | lo;
-       cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
-
-       lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
-       hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
-       arc_cq_ptr = ((u64) hi) << 32 | lo;
-       arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+       u32 lo, hi, cq_ptr_size, cp_sts;
+       u64 cq_ptr, cp_current_inst;
+       bool is_arc_cq;
+
+       cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
+       is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */
+
+       if (is_arc_cq) {
+               lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
+               hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
+               cq_ptr = ((u64) hi) << 32 | lo;
+               cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+       } else {
+               lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
+               hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
+               cq_ptr = ((u64) hi) << 32 | lo;
+               cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
+       }
 
        lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
        hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
        cp_current_inst = ((u64) hi) << 32 | lo;
 
        dev_info(hdev->dev,
-               "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
-               cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
+               "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
+               is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
 
        if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-               if (arc_cq_ptr) {
-                       hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr;
-                       hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
-               } else {
-                       hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
-                       hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
-               }
-
+               hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
+               hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
                hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
        }
 }
index a08378d0802b559f58f472c90c1b3e49901412a5..8018214a7b59b43bbc7c26e3236c57a58885222a 100644 (file)
 #define QM_ARC_CQ_PTR_HI_OFFSET                (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
 #define QM_ARC_CQ_TSIZE_OFFSET         (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
 
+#define QM_CP_STS_4_OFFSET             (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE)
 #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
 #define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)