pf->consumer.engine_instance);
}
+/*
+ * xe_pagefault_save_to_vm() - Record a failed pagefault on its VM
+ * @xe: The xe device.
+ * @pf: The pagefault to record.
+ *
+ * Looks up the VM by ASID and, if found, appends the fault data to the
+ * VM's fault list. Exits silently when no VM matches the ASID.
+ */
+static void xe_pagefault_save_to_vm(struct xe_device *xe, struct xe_pagefault *pf)
+{
+ struct xe_vm *vm;
+
+ /*
+ * Pagefault may be associated to a VM that is not in fault mode.
+ * Perform asid_to_vm behavior, except if VM is not in fault
+ * mode, return VM anyways.
+ */
+ down_read(&xe->usm.lock);
+ vm = xa_load(&xe->usm.asid_to_vm, pf->consumer.asid);
+ if (vm)
+ xe_vm_get(vm);
+ up_read(&xe->usm.lock);
+
+ /* No VM registered for this ASID: nothing to record */
+ if (!vm)
+ return;
+
+ xe_vm_add_fault_entry_pf(vm, pf);
+
+ xe_vm_put(vm);
+}
+
static void xe_pagefault_queue_work(struct work_struct *w)
{
struct xe_pagefault_queue *pf_queue =
err = xe_pagefault_service(&pf);
if (err) {
+ xe_pagefault_save_to_vm(gt_to_xe(pf.gt), &pf);
if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
xe_pagefault_print(&pf);
xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue.h"
+#include "xe_gt.h"
#include "xe_migrate.h"
#include "xe_pat.h"
#include "xe_pm.h"
trace_xe_vm_rebind_worker_exit(vm);
}
+/**
+ * xe_vm_add_fault_entry_pf() - Add pagefault to vm fault list
+ * @vm: The VM.
+ * @pf: The pagefault.
+ *
+ * This function takes the data from the pagefault @pf and saves it to
+ * @vm->faults.list.
+ *
+ * The function exits silently if the list is full, and reports a warning if
+ * memory for the list entry could not be allocated.
+ */
+void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf)
+{
+ struct xe_vm_fault_entry *e;
+ struct xe_hw_engine *hwe;
+
+ /* Do not report faults on reserved engines */
+ hwe = xe_gt_hw_engine(pf->gt, pf->consumer.engine_class,
+ pf->consumer.engine_instance, false);
+ if (!hwe || xe_hw_engine_is_reserved(hwe))
+ return;
+
+ e = kzalloc_obj(*e);
+ if (!e) {
+ drm_warn(&vm->xe->drm,
+ "Could not allocate memory for fault!\n");
+ return;
+ }
+
+ guard(spinlock)(&vm->faults.lock);
+
+ /*
+ * Limit the number of faults in the fault list to prevent
+ * memory overuse.
+ */
+ if (vm->faults.len >= MAX_FAULTS_SAVED_PER_VM) {
+ kfree(e);
+ return;
+ }
+
+ e->address = pf->consumer.page_addr;
+ /*
+ * TODO:
+ * Address precision is currently always SZ_4K, but this may change
+ * in the future.
+ */
+ e->address_precision = SZ_4K;
+ e->access_type = pf->consumer.access_type;
+ e->fault_type = FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
+ pf->consumer.fault_type_level);
+ e->fault_level = FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
+ pf->consumer.fault_type_level);
+
+ list_add_tail(&e->list, &vm->faults.list);
+ vm->faults.len++;
+}
+
+/* Free every saved fault entry on @vm and reset the list length. */
+static void xe_vm_clear_fault_entries(struct xe_vm *vm)
+{
+ struct xe_vm_fault_entry *e;
+
+ guard(spinlock)(&vm->faults.lock);
+ while ((e = list_first_entry_or_null(&vm->faults.list,
+        struct xe_vm_fault_entry, list))) {
+ list_del(&e->list);
+ kfree(e);
+ }
+ vm->faults.len = 0;
+}
+
static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
{
int i;
INIT_LIST_HEAD(&vm->userptr.invalidated);
spin_lock_init(&vm->userptr.invalidated_lock);
+ INIT_LIST_HEAD(&vm->faults.list);
+ spin_lock_init(&vm->faults.lock);
+
ttm_lru_bulk_move_init(&vm->lru_bulk_move);
INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
}
up_write(&xe->usm.lock);
+ xe_vm_clear_fault_entries(vm);
+
for_each_tile(tile, xe, id)
xe_range_fence_tree_fini(&vm->rftree[id]);
#include "xe_map.h"
#include "xe_vm_types.h"
+/**
+ * MAX_FAULTS_SAVED_PER_VM - Maximum number of faults each vm can store before future
+ * faults are discarded to prevent memory overuse
+ */
+#define MAX_FAULTS_SAVED_PER_VM 50
+
struct drm_device;
struct drm_printer;
struct drm_file;
struct xe_exec_queue;
struct xe_file;
+struct xe_pagefault;
struct xe_sync_entry;
struct xe_svm_range;
struct drm_exec;
void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p);
void xe_vm_snapshot_free(struct xe_vm_snapshot *snap);
+void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf);
+
/**
* xe_vm_set_validating() - Register this task as currently making bos resident
* @allow_res_evict: Allow eviction of buffer objects bound to @vm when
struct drm_pagemap;
struct xe_bo;
+struct xe_pagefault;
struct xe_svm_range;
struct xe_sync_entry;
struct xe_user_fence;
struct xe_device;
+/**
+ * struct xe_vm_fault_entry - Elements of vm->faults.list
+ * @list: link into @xe_vm.faults.list
+ * @address: address of the fault
+ * @address_precision: precision of faulted address (currently always SZ_4K)
+ * @access_type: type of address access that resulted in fault
+ * @fault_type: type of fault reported
+ * @fault_level: fault level of the fault
+ */
+struct xe_vm_fault_entry {
+ struct list_head list;
+ u64 address;
+ u32 address_precision;
+ u8 access_type;
+ u8 fault_type;
+ u8 fault_level;
+};
+
struct xe_vm {
/** @gpuvm: base GPUVM used to track VMAs */
struct drm_gpuvm gpuvm;
bool capture_once;
} error_capture;
+ /** @faults: List of all faults associated with this VM */
+ struct {
+ /** @faults.lock: lock protecting @faults.list */
+ spinlock_t lock;
+ /** @faults.list: list of xe_vm_fault_entry entries */
+ struct list_head list;
+ /** @faults.len: length of @faults.list */
+ unsigned int len;
+ } faults;
+
/**
* @validation: Validation data only valid with the vm resv held.
* Note: This is really task state of the task holding the vm resv,