It is hard to debug the reason for heartbeat check failures.
As an attempt to ease this task, this patch will provide more
information when this failure happens.
Heartbeat checks the communication with FW, so printing
the CPU queue pi/ci and the counter of how many times that event
was received would help in debugging the issue.
Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
Signed-off-by: Ofir Bitton <obitton@habana.ai>
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 cpu_q_id;
if (!prop->cpucp_info.eq_health_check_supported)
return true;
if (!hdev->eq_heartbeat_received) {
+ cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id;
+
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+
+ dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n",
+ hdev->heartbeat_debug_info.heartbeat_event_counter,
+ hdev->kernel_queues[cpu_q_id].pi,
+ atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
+ hdev->event_queue.ci,
+ hdev->event_queue.prev_eqe_index);
return false;
}
hdev->high_pll = hdev->asic_prop.high_pll;
if (hdev->heartbeat) {
+ hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+
/*
* Before scheduling the heartbeat driver will check if eq event has received.
* for the first schedule we need to set the indication as true then for the next
#define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */
-#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */
+#define HL_HEARTBEAT_PER_USEC 10000000 /* 10 s */
#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */
u8 watchdog_active;
};
+/**
+ * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
+ * @heartbeat_event_counter: number of heartbeat events received.
+ * @cpu_queue_id: used to read the queue pi/ci
+ */
+struct eq_heartbeat_debug_info {
+ u32 heartbeat_event_counter;
+ u32 cpu_queue_id;
+};
+
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
* @clk_throttling: holds information about current/previous clock throttling events
* @captured_err_info: holds information about errors.
* @reset_info: holds current device reset information.
+ * @heartbeat_debug_info: counters used to debug heartbeat failures.
* @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
* @stream_master_qid_arr: pointer to array with QIDs of master streams.
* @fw_inner_major_ver: the major of current loaded preboot inner version.
struct hl_reset_info reset_info;
+ struct eq_heartbeat_debug_info heartbeat_debug_info;
+
cpumask_t irq_affinity_mask;
u32 *stream_master_qid_arr;
if (rc)
goto special_blocks_free;
+ hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ;
+
return 0;
special_blocks_free:
static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
+ hdev->heartbeat_debug_info.heartbeat_event_counter++;
hdev->eq_heartbeat_received = true;
}