]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
accel/habanalabs/gaudi2: add support for logging register accesses from debugfs
author Sharley Calzolari <sharley.calzolari@intel.com>
Thu, 18 Jul 2024 15:01:52 +0000 (18:01 +0300)
committer Koby Elbaz <koby.elbaz@intel.com>
Thu, 25 Sep 2025 06:09:26 +0000 (09:09 +0300)
Add infrastructure for logging the last configuration register accesses
that occur via debugfs read/write operations. At interrupt time, these
log entries can be dumped to dmesg, which helps in diagnosing the cause
of RAZWI and ADDR_DEC interrupts.

The logging is implemented as a ring buffer of access entries, with each
entry recording a timestamp and the access details. To ensure correctness
under concurrent access, ring buffer operations are protected by a spinlock.
Entries are copied under lock and then printed after releasing it, which
minimizes time spent in the critical section.

Signed-off-by: Sharley Calzolari <sharley.calzolari@intel.com>
Reviewed-by: Koby Elbaz <koby.elbaz@intel.com>
Signed-off-by: Koby Elbaz <koby.elbaz@intel.com>
drivers/accel/habanalabs/common/debugfs.c
drivers/accel/habanalabs/common/habanalabs.h
drivers/accel/habanalabs/gaudi2/gaudi2.c

index 4b391807e5f2e2a2570a38b9dfdf6be4299dbfb6..4d975e8059ca3fa3d688cdb68f1eea802adc34a7 100644 (file)
@@ -788,6 +788,113 @@ static void hl_access_host_mem(struct hl_device *hdev, u64 addr, u64 *val,
        }
 }
 
+/*
+ * dump_cfg_access_entry() - print one logged debugfs config access.
+ * @hdev: device whose log receives the message.
+ * @entry: logged access to print; it is only read.
+ *
+ * The stored timestamp is broken down with time64_to_tm() using a zero offset
+ * and printed as a UTC date and time, followed by the access kind and address.
+ * An entry with an unknown access type is reported via dev_err() and skipped.
+ */
+static void dump_cfg_access_entry(struct hl_device *hdev,
+                                 const struct hl_debugfs_cfg_access_entry *entry)
+{
+       /* Only ever points at string literals, hence const */
+       const char *access_type = "";
+       struct tm tm;
+
+       switch (entry->debugfs_type) {
+       case DEBUGFS_READ32:
+               access_type = "READ32 from";
+               break;
+       case DEBUGFS_WRITE32:
+               access_type = "WRITE32 to";
+               break;
+       case DEBUGFS_READ64:
+               access_type = "READ64 from";
+               break;
+       case DEBUGFS_WRITE64:
+               access_type = "WRITE64 to";
+               break;
+       default:
+               dev_err(hdev->dev, "Invalid DEBUGFS access type (%u)\n", entry->debugfs_type);
+               return;
+       }
+
+       time64_to_tm(entry->seconds_since_epoch, 0, &tm);
+       dev_info(hdev->dev,
+               "%ld-%02d-%02d %02d:%02d:%02d (UTC): %s %#llx\n", tm.tm_year + 1900, tm.tm_mon + 1,
+               tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, access_type, entry->addr);
+}
+
+/*
+ * hl_debugfs_cfg_access_history_dump() - print recent debugfs config accesses.
+ * @hdev: device whose access history is dumped.
+ *
+ * Walks the history ring backwards, starting from the entry just before head,
+ * and prints every valid entry until it meets an invalid entry or one older
+ * than HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC. Each printed entry is marked
+ * invalid so it is not printed again by a later dump.
+ */
+void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
+{
+       struct hl_debugfs_cfg_access *dbgfs = &hdev->debugfs_cfg_accesses;
+       struct hl_debugfs_cfg_access_entry entry;
+       u32 i, head, count = 0;
+       time64_t entry_time, now;
+       unsigned long flags;
+
+       now = ktime_get_real_seconds();
+
+       spin_lock_irqsave(&dbgfs->lock, flags);
+       head = dbgfs->head;
+       if (head == 0)
+               i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
+       else
+               i = head - 1;
+
+       /* Walk back until timeout or invalid entry */
+       while (dbgfs->cfg_access_list[i].valid) {
+               entry_time = dbgfs->cfg_access_list[i].seconds_since_epoch;
+               /* Stop when entry is older than timeout */
+               if (now - entry_time > HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC)
+                       break;
+
+               /*
+                * Copy the entry out and mark the slot consumed while still
+                * holding the lock. The lock is dropped for printing, and a
+                * writer may refill this slot in the meantime; clearing the
+                * valid flag only after re-locking would discard that new
+                * entry.
+                */
+               entry = dbgfs->cfg_access_list[i];
+               dbgfs->cfg_access_list[i].valid = false;
+
+               /* Print outside the lock to minimize time under lock */
+               spin_unlock_irqrestore(&dbgfs->lock, flags);
+               dump_cfg_access_entry(hdev, &entry);
+               spin_lock_irqsave(&dbgfs->lock, flags);
+
+               if (i == 0)
+                       i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
+               else
+                       i--;
+
+               /* Never visit more slots than the ring holds */
+               count++;
+               if (count >= HL_DBGFS_CFG_ACCESS_HIST_LEN)
+                       break;
+       }
+       spin_unlock_irqrestore(&dbgfs->lock, flags);
+}
+
+/*
+ * check_if_cfg_access_and_log() - log a debugfs access that targets config memory.
+ * @hdev: device whose access history is updated.
+ * @addr: start address of the access.
+ * @access_size: size of the access in bytes.
+ * @access_type: debugfs operation being performed.
+ *
+ * The access is recorded only when [addr, addr + access_size) lies entirely
+ * inside the PCI_REGION_CFG memory region; any other access is ignored.
+ */
+static void check_if_cfg_access_and_log(struct hl_device *hdev, u64 addr, size_t access_size,
+                                       enum debugfs_access_type access_type)
+{
+       struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
+       struct hl_debugfs_cfg_access *hist = &hdev->debugfs_cfg_accesses;
+       struct hl_debugfs_cfg_access_entry *slot;
+       unsigned long flags;
+
+       /* Nothing to log unless the access fits entirely in config memory */
+       if (addr < cfg_region->region_base)
+               return;
+       if (cfg_region->region_size < access_size)
+               return;
+       if (addr > cfg_region->region_base + cfg_region->region_size - access_size)
+               return;
+
+       spin_lock_irqsave(&hist->lock, flags);
+
+       /* Fill the slot at head, then advance head with wrap-around */
+       slot = &hist->cfg_access_list[hist->head];
+       slot->seconds_since_epoch = ktime_get_real_seconds();
+       slot->addr = addr;
+       slot->debugfs_type = access_type;
+       slot->valid = true;
+       hist->head = (hist->head + 1) % HL_DBGFS_CFG_ACCESS_HIST_LEN;
+
+       spin_unlock_irqrestore(&hist->lock, flags);
+}
+
 static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
                                enum debugfs_access_type acc_type)
 {
@@ -805,6 +912,7 @@ static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
                        return rc;
        }
 
+       check_if_cfg_access_and_log(hdev, addr, acc_size, acc_type);
        rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found);
        if (rc) {
                dev_err(hdev->dev,
@@ -1762,6 +1870,9 @@ int hl_debugfs_device_init(struct hl_device *hdev)
        spin_lock_init(&dev_entry->userptr_spinlock);
        mutex_init(&dev_entry->ctx_mem_hash_mutex);
 
+       spin_lock_init(&hdev->debugfs_cfg_accesses.lock);
+       hdev->debugfs_cfg_accesses.head = 0; /* already zero by alloc but explicit init is fine */
+
        return 0;
 }
 
index 6f27ce4fa01bfc25342bf6618fe1bc600909d6dd..122ade172bb498f2967adea7b56170fcff59337f 100644 (file)
@@ -90,7 +90,9 @@ struct hl_fpriv;
 #define HL_COMMON_USER_CQ_INTERRUPT_ID 0xFFF
 #define HL_COMMON_DEC_INTERRUPT_ID     0xFFE
 
-#define HL_STATE_DUMP_HIST_LEN         5
+#define HL_STATE_DUMP_HIST_LEN                 5
+/* Number of entries in the debugfs config access history ring */
+#define HL_DBGFS_CFG_ACCESS_HIST_LEN           20
+/* History entries older than this many seconds are not dumped */
+#define HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC   2 /* 2s */
 
 /* Default value for device reset trigger , an invalid value */
 #define HL_RESET_TRIGGER_DEFAULT       0xFF
@@ -2436,6 +2438,32 @@ struct hl_dbg_device_entry {
        u8                              i2c_len;
 };
 
+/**
+ * struct hl_debugfs_cfg_access_entry - single debugfs config access object, member of
+ * hl_debugfs_cfg_access.
+ * @seconds_since_epoch: wall-clock time of the access in seconds since January 1, 1970,
+ *                       as returned by ktime_get_real_seconds(); used for time comparisons.
+ * @debugfs_type: the debugfs operation requested, can be READ32, WRITE32, READ64 or WRITE64.
+ * @addr: the requested address to access.
+ * @valid: if set, this entry has valid data for dumping at interrupt time.
+ */
+struct hl_debugfs_cfg_access_entry {
+       /* time64_t, not ktime_t: the value is whole seconds, not a ktime */
+       time64_t                        seconds_since_epoch;
+       enum debugfs_access_type        debugfs_type;
+       u64                             addr;
+       bool                            valid;
+};
+
+/**
+ * struct hl_debugfs_cfg_access - saves debugfs config region access requests history.
+ * @cfg_access_list: ring of objects describing config region access requests.
+ * @head: index in cfg_access_list where the next new entry is written; wraps
+ *        around at HL_DBGFS_CFG_ACCESS_HIST_LEN.
+ * @lock: spinlock protecting @head and the entries of @cfg_access_list.
+ */
+struct hl_debugfs_cfg_access {
+       struct hl_debugfs_cfg_access_entry      cfg_access_list[HL_DBGFS_CFG_ACCESS_HIST_LEN];
+       u32                                     head;
+       spinlock_t                      lock; /* protects head and entries */
+};
+
 /**
  * struct hl_hw_obj_name_entry - single hw object name, member of
  * hl_state_dump_specs
@@ -3281,6 +3309,7 @@ struct eq_heartbeat_debug_info {
  * @hl_chip_info: ASIC's sensors information.
  * @device_status_description: device status description.
  * @hl_debugfs: device's debugfs manager.
+ * @debugfs_cfg_accesses: list of last debugfs config region accesses.
  * @cb_pool: list of pre allocated CBs.
  * @cb_pool_lock: protects the CB pool.
  * @internal_cb_pool_virt_addr: internal command buffer pool virtual address.
@@ -3461,6 +3490,7 @@ struct hl_device {
        struct hwmon_chip_info          *hl_chip_info;
 
        struct hl_dbg_device_entry      hl_debugfs;
+       struct hl_debugfs_cfg_access    debugfs_cfg_accesses;
 
        struct list_head                cb_pool;
        spinlock_t                      cb_pool_lock;
@@ -4110,6 +4140,7 @@ void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
 void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
                                        unsigned long length);
+void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev);
 
 #else
 
@@ -4185,6 +4216,10 @@ static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
 {
 }
 
+/* No-op stub for the #else branch, where the real dump function is not declared */
+static inline void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
+{
+}
+
 #endif
 
 /* Security */
index 39c4cc12ffdf9a6476084e34a53b26a445d752c6..d8f40f2c967c67e1860087cd639c803125d1cb86 100644 (file)
@@ -10610,6 +10610,7 @@ reset_device:
        if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
                hl_handle_critical_hw_err(hdev, event_type, &event_mask);
 
+       hl_debugfs_cfg_access_history_dump(hdev);
        event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
        hl_device_cond_reset(hdev, reset_flags, event_mask);
 }