git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/xe: Convert existing drm_exec transactions for exhaustive eviction
author Thomas Hellström <thomas.hellstrom@linux.intel.com>
Mon, 8 Sep 2025 10:12:38 +0000 (12:12 +0200)
committer Thomas Hellström <thomas.hellstrom@linux.intel.com>
Wed, 10 Sep 2025 07:16:00 +0000 (09:16 +0200)
Convert existing drm_exec transactions, like GT pagefault validation,
non-LR exec() IOCTL and the rebind worker to support
exhaustive eviction using the xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check error return from xe_validation_guard()

v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v1
Link: https://lore.kernel.org/r/20250908101246.65025-6-thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_exec.c
drivers/gpu/drm/xe/xe_gt_pagefault.c
drivers/gpu/drm/xe/xe_vm.c
drivers/gpu/drm/xe/xe_vm.h

index 56edee596352b9ee53c7e71c75f9379fd9813b72..7715e74bb94544198d2d24b5be3123d66329e980 100644 (file)
@@ -120,10 +120,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
        struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
        struct drm_exec *exec = &vm_exec.exec;
        u32 i, num_syncs, num_ufence = 0;
+       struct xe_validation_ctx ctx;
        struct xe_sched_job *job;
        struct xe_vm *vm;
        bool write_locked, skip_retry = false;
-       ktime_t end = 0;
        int err = 0;
        struct xe_hw_engine_group *group;
        enum xe_hw_engine_group_execution_mode mode, previous_mode;
@@ -251,17 +251,12 @@ retry:
        if (err)
                goto err_unlock_list;
 
-       vm_exec.vm = &vm->gpuvm;
-       vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
-       if (xe_vm_in_lr_mode(vm)) {
-               drm_exec_init(exec, vm_exec.flags, 0);
-       } else {
-               err = drm_gpuvm_exec_lock(&vm_exec);
-               if (err) {
-                       if (xe_vm_validate_should_retry(exec, err, &end))
-                               err = -EAGAIN;
+       if (!xe_vm_in_lr_mode(vm)) {
+               vm_exec.vm = &vm->gpuvm;
+               vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
+               err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
+               if (err)
                        goto err_unlock_list;
-               }
        }
 
        if (xe_vm_is_closed_or_banned(q->vm)) {
@@ -355,7 +350,8 @@ err_put_job:
        if (err)
                xe_sched_job_put(job);
 err_exec:
-       drm_exec_fini(exec);
+       if (!xe_vm_in_lr_mode(vm))
+               xe_validation_ctx_fini(&ctx);
 err_unlock_list:
        up_read(&vm->lock);
        if (err == -EAGAIN && !skip_retry)
index b9653ecbc7d4a97b22cb8591984f22d0479639ad..ec6f6d520a9c56e602e21d293ae0fda5b6fc4b5d 100644 (file)
@@ -96,9 +96,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
 {
        struct xe_vm *vm = xe_vma_vm(vma);
        struct xe_tile *tile = gt_to_tile(gt);
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
        struct dma_fence *fence;
-       ktime_t end = 0;
        int err, needs_vram;
 
        lockdep_assert_held_write(&vm->lock);
@@ -127,12 +127,11 @@ retry_userptr:
        }
 
        /* Lock VM and BOs dma-resv */
-       drm_exec_init(&exec, 0, 0);
+       xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
        drm_exec_until_all_locked(&exec) {
                err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram);
                drm_exec_retry_on_contention(&exec);
-               if (xe_vm_validate_should_retry(&exec, err, &end))
-                       err = -EAGAIN;
+               xe_validation_retry_on_oom(&ctx, &err);
                if (err)
                        goto unlock_dma_resv;
 
@@ -143,8 +142,7 @@ retry_userptr:
                xe_vm_set_validation_exec(vm, NULL);
                if (IS_ERR(fence)) {
                        err = PTR_ERR(fence);
-                       if (xe_vm_validate_should_retry(&exec, err, &end))
-                               err = -EAGAIN;
+                       xe_validation_retry_on_oom(&ctx, &err);
                        goto unlock_dma_resv;
                }
        }
@@ -153,7 +151,7 @@ retry_userptr:
        dma_fence_put(fence);
 
 unlock_dma_resv:
-       drm_exec_fini(&exec);
+       xe_validation_ctx_fini(&ctx);
        if (err == -EAGAIN)
                goto retry_userptr;
 
@@ -535,6 +533,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
 {
        struct xe_device *xe = gt_to_xe(gt);
        struct xe_tile *tile = gt_to_tile(gt);
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
        struct xe_vm *vm;
        struct xe_vma *vma;
@@ -564,15 +563,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
                goto unlock_vm;
 
        /* Lock VM and BOs dma-resv */
-       drm_exec_init(&exec, 0, 0);
+       xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
        drm_exec_until_all_locked(&exec) {
                ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram);
                drm_exec_retry_on_contention(&exec);
-               if (ret)
-                       break;
+               xe_validation_retry_on_oom(&ctx, &ret);
        }
 
-       drm_exec_fini(&exec);
+       xe_validation_ctx_fini(&ctx);
 unlock_vm:
        up_read(&vm->lock);
        xe_vm_put(vm);
index 5656da870d6d47d2e44d2bd19ec9b4f8342fb617..dfe88924fda0dd1f5a7b088bdacfeb94a9481b0b 100644 (file)
@@ -210,6 +210,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
                .num_fences = 1,
        };
        struct drm_exec *exec = &vm_exec.exec;
+       struct xe_validation_ctx ctx;
        struct dma_fence *pfence;
        int err;
        bool wait;
@@ -217,7 +218,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
        xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
 
        down_write(&vm->lock);
-       err = drm_gpuvm_exec_lock(&vm_exec);
+       err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
        if (err)
                goto out_up_write;
 
@@ -249,7 +250,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
        xe_svm_notifier_unlock(vm);
 
 out_fini:
-       drm_exec_fini(exec);
+       xe_validation_ctx_fini(&ctx);
 out_up_write:
        up_write(&vm->lock);
 
@@ -313,39 +314,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
        /* TODO: Inform user the VM is banned */
 }
 
-/**
- * xe_vm_validate_should_retry() - Whether to retry after a validate error.
- * @exec: The drm_exec object used for locking before validation.
- * @err: The error returned from ttm_bo_validate().
- * @end: A ktime_t cookie that should be set to 0 before first use and
- * that should be reused on subsequent calls.
- *
- * With multiple active VMs, under memory pressure, it is possible that
- * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
- * Until ttm properly handles locking in such scenarios, best thing the
- * driver can do is retry with a timeout. Check if that is necessary, and
- * if so unlock the drm_exec's objects while keeping the ticket to prepare
- * for a rerun.
- *
- * Return: true if a retry after drm_exec_init() is recommended;
- * false otherwise.
- */
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
-{
-       ktime_t cur;
-
-       if (err != -ENOMEM)
-               return false;
-
-       cur = ktime_get();
-       *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
-       if (!ktime_before(cur, *end))
-               return false;
-
-       msleep(20);
-       return true;
-}
-
 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
 {
        struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
@@ -476,10 +444,10 @@ void xe_vm_resume_rebind_worker(struct xe_vm *vm)
 static void preempt_rebind_work_func(struct work_struct *w)
 {
        struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
        unsigned int fence_count = 0;
        LIST_HEAD(preempt_fences);
-       ktime_t end = 0;
        int err = 0;
        long wait;
        int __maybe_unused tries = 0;
@@ -507,18 +475,19 @@ retry:
                        goto out_unlock_outer;
        }
 
-       drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+       err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
+                                    (struct xe_val_flags) {.interruptible = true});
+       if (err)
+               goto out_unlock_outer;
 
        drm_exec_until_all_locked(&exec) {
                bool done = false;
 
                err = xe_preempt_work_begin(&exec, vm, &done);
                drm_exec_retry_on_contention(&exec);
+               xe_validation_retry_on_oom(&ctx, &err);
                if (err || done) {
-                       drm_exec_fini(&exec);
-                       if (err && xe_vm_validate_should_retry(&exec, err, &end))
-                               err = -EAGAIN;
-
+                       xe_validation_ctx_fini(&ctx);
                        goto out_unlock_outer;
                }
        }
@@ -566,7 +535,7 @@ retry:
        xe_svm_notifier_unlock(vm);
 
 out_unlock:
-       drm_exec_fini(&exec);
+       xe_validation_ctx_fini(&ctx);
 out_unlock_outer:
        if (err == -EAGAIN) {
                trace_xe_vm_rebind_worker_retry(vm);
@@ -1164,20 +1133,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
 
 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
 {
+       struct xe_device *xe = xe_vma_vm(vma)->xe;
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
-       int err;
+       int err = 0;
 
-       drm_exec_init(&exec, 0, 0);
-       drm_exec_until_all_locked(&exec) {
+       xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
                err = xe_vm_lock_vma(&exec, vma);
                drm_exec_retry_on_contention(&exec);
                if (XE_WARN_ON(err))
                        break;
+               xe_vma_destroy(vma, NULL);
        }
-
-       xe_vma_destroy(vma, NULL);
-
-       drm_exec_fini(&exec);
+       xe_assert(xe, !err);
 }
 
 struct xe_vma *
@@ -2383,6 +2351,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
                              struct xe_vma_mem_attr *attr, unsigned int flags)
 {
        struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
        struct xe_vma *vma;
        int err = 0;
@@ -2390,9 +2359,9 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
        lockdep_assert_held_write(&vm->lock);
 
        if (bo) {
-               drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
-               drm_exec_until_all_locked(&exec) {
-                       err = 0;
+               err = 0;
+               xe_validation_guard(&ctx, &vm->xe->val, &exec,
+                                   (struct xe_val_flags) {.interruptible = true}, err) {
                        if (!bo->vm) {
                                err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
                                drm_exec_retry_on_contention(&exec);
@@ -2401,27 +2370,35 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
                                err = drm_exec_lock_obj(&exec, &bo->ttm.base);
                                drm_exec_retry_on_contention(&exec);
                        }
-                       if (err) {
-                               drm_exec_fini(&exec);
+                       if (err)
                                return ERR_PTR(err);
-                       }
-               }
-       }
-       vma = xe_vma_create(vm, bo, op->gem.offset,
-                           op->va.addr, op->va.addr +
-                           op->va.range - 1, attr, flags);
-       if (IS_ERR(vma))
-               goto err_unlock;
 
-       if (xe_vma_is_userptr(vma))
-               err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
-       else if (!xe_vma_has_no_bo(vma) && !bo->vm)
-               err = add_preempt_fences(vm, bo);
+                       vma = xe_vma_create(vm, bo, op->gem.offset,
+                                           op->va.addr, op->va.addr +
+                                           op->va.range - 1, attr, flags);
+                       if (IS_ERR(vma))
+                               return vma;
 
-err_unlock:
-       if (bo)
-               drm_exec_fini(&exec);
+                       if (!bo->vm) {
+                               err = add_preempt_fences(vm, bo);
+                               if (err) {
+                                       prep_vma_destroy(vm, vma, false);
+                                       xe_vma_destroy(vma, NULL);
+                               }
+                       }
+               }
+               if (err)
+                       return ERR_PTR(err);
+       } else {
+               vma = xe_vma_create(vm, NULL, op->gem.offset,
+                                   op->va.addr, op->va.addr +
+                                   op->va.range - 1, attr, flags);
+               if (IS_ERR(vma))
+                       return vma;
 
+               if (xe_vma_is_userptr(vma))
+                       err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
+       }
        if (err) {
                prep_vma_destroy(vm, vma, false);
                xe_vma_destroy_unlocked(vma);
@@ -3220,21 +3197,23 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
                                                   struct xe_vma_ops *vops)
 {
+       struct xe_validation_ctx ctx;
        struct drm_exec exec;
        struct dma_fence *fence;
-       int err;
+       int err = 0;
 
        lockdep_assert_held_write(&vm->lock);
 
-       drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
-                     DRM_EXEC_IGNORE_DUPLICATES, 0);
-       drm_exec_until_all_locked(&exec) {
+       xe_validation_guard(&ctx, &vm->xe->val, &exec,
+                           ((struct xe_val_flags) {
+                                   .interruptible = true,
+                                   .exec_ignore_duplicates = true,
+                           }), err) {
                err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
                drm_exec_retry_on_contention(&exec);
-               if (err) {
-                       fence = ERR_PTR(err);
-                       goto unlock;
-               }
+               xe_validation_retry_on_oom(&ctx, &err);
+               if (err)
+                       return ERR_PTR(err);
 
                xe_vm_set_validation_exec(vm, &exec);
                fence = ops_execute(vm, vops);
@@ -3242,15 +3221,13 @@ static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
                if (IS_ERR(fence)) {
                        if (PTR_ERR(fence) == -ENODATA)
                                vm_bind_ioctl_ops_fini(vm, vops, NULL);
-                       goto unlock;
+                       return fence;
                }
 
                vm_bind_ioctl_ops_fini(vm, vops, fence);
        }
 
-unlock:
-       drm_exec_fini(&exec);
-       return fence;
+       return err ? ERR_PTR(err) : fence;
 }
 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
 
index ba6636c1bd58d78d00e07b1e1ba2373635664b2a..ef8a5019574e6fb6e2a1ac9e03f54c37a1e651fa 100644 (file)
@@ -260,8 +260,6 @@ static inline void xe_vm_reactivate_rebind(struct xe_vm *vm)
        }
 }
 
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
-
 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
 
 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,