]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/vf: Avoid indefinite blocking in preempt rebind worker for VFs supporting...
authorMatthew Brost <matthew.brost@intel.com>
Wed, 8 Oct 2025 21:45:16 +0000 (14:45 -0700)
committerMatthew Brost <matthew.brost@intel.com>
Thu, 9 Oct 2025 10:22:41 +0000 (03:22 -0700)
Blocking in work queues on a hardware action that may never occur —
especially when it depends on a software fixup also scheduled on
a work queue — is a recipe for deadlock. This situation arises with
the preempt rebind worker and VF post-migration recovery. To prevent
potential deadlocks, avoid indefinite blocking in the preempt rebind
worker for VFs that support migration.

v4:
 - Use dma_fence_wait_timeout (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-19-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_vm.c

index 4e914928e0a976237931ef11f159c00e305b69b9..faca626702b86adbd92d896bd81a16c2aa978c3e 100644 (file)
@@ -35,6 +35,7 @@
 #include "xe_pt.h"
 #include "xe_pxp.h"
 #include "xe_res_cursor.h"
+#include "xe_sriov_vf.h"
 #include "xe_svm.h"
 #include "xe_sync.h"
 #include "xe_tile.h"
@@ -111,12 +112,22 @@ static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
 {
        struct xe_exec_queue *q;
+       bool vf_migration = IS_SRIOV_VF(vm->xe) &&
+               xe_sriov_vf_migration_supported(vm->xe);
+       signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;
 
        xe_vm_assert_held(vm);
 
        list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
                if (q->lr.pfence) {
-                       long timeout = dma_fence_wait(q->lr.pfence, false);
+                       long timeout;
+
+                       timeout = dma_fence_wait_timeout(q->lr.pfence, false,
+                                                        wait_time);
+                       if (!timeout) {
+                               xe_assert(vm->xe, vf_migration);
+                               return -EAGAIN;
+                       }
 
                        /* Only -ETIME on fence indicates VM needs to be killed */
                        if (timeout < 0 || q->lr.pfence->error == -ETIME)
@@ -541,6 +552,19 @@ out_unlock:
 out_unlock_outer:
        if (err == -EAGAIN) {
                trace_xe_vm_rebind_worker_retry(vm);
+
+               /*
+                * We can't block in workers on a VF which supports migration
+                * given this can block the VF post-migration workers from
+                * getting scheduled.
+                */
+               if (IS_SRIOV_VF(vm->xe) &&
+                   xe_sriov_vf_migration_supported(vm->xe)) {
+                       up_write(&vm->lock);
+                       xe_vm_queue_rebind_worker(vm);
+                       return;
+               }
+
                goto retry;
        }