From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Date: Mon, 8 Sep 2025 10:12:38 +0000 (+0200)
Subject: drm/xe: Convert existing drm_exec transactions for exhaustive eviction
X-Git-Tag: v6.18-rc1~134^2~4^2~61
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8f25e5abcbfb9cbcd923021774f2e785db3f7f55;p=thirdparty%2Flinux.git

drm/xe: Convert existing drm_exec transactions for exhaustive eviction

Convert existing drm_exec transactions, like GT pagefault validation,
the non-LR exec() IOCTL and the rebind worker, to support exhaustive
eviction using the xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check error return from xe_validation_guard()
v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v1
Link: https://lore.kernel.org/r/20250908101246.65025-6-thomas.hellstrom@linux.intel.com
---

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 56edee596352b..7715e74bb9454 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -120,10 +120,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
 	struct drm_exec *exec = &vm_exec.exec;
 	u32 i, num_syncs, num_ufence = 0;
+	struct xe_validation_ctx ctx;
 	struct xe_sched_job *job;
 	struct xe_vm *vm;
 	bool write_locked, skip_retry = false;
-	ktime_t end = 0;
 	int err = 0;
 	struct xe_hw_engine_group *group;
 	enum xe_hw_engine_group_execution_mode mode, previous_mode;
@@ -251,17 +251,12 @@ retry:
 	if (err)
 		goto err_unlock_list;
 
-	vm_exec.vm = &vm->gpuvm;
-	vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
-	if (xe_vm_in_lr_mode(vm)) {
-		drm_exec_init(exec, vm_exec.flags, 0);
-	} else {
-		err = drm_gpuvm_exec_lock(&vm_exec);
-		if (err) {
-			if (xe_vm_validate_should_retry(exec, err, &end))
-				err = -EAGAIN;
+	if (!xe_vm_in_lr_mode(vm)) {
+		vm_exec.vm = &vm->gpuvm;
+		vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
+		err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
+		if (err)
 			goto err_unlock_list;
-		}
 	}
 
 	if (xe_vm_is_closed_or_banned(q->vm)) {
@@ -355,7 +350,8 @@ err_put_job:
 	if (err)
 		xe_sched_job_put(job);
 err_exec:
-	drm_exec_fini(exec);
+	if (!xe_vm_in_lr_mode(vm))
+		xe_validation_ctx_fini(&ctx);
 err_unlock_list:
 	up_read(&vm->lock);
 	if (err == -EAGAIN && !skip_retry)
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index b9653ecbc7d4a..ec6f6d520a9c5 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -96,9 +96,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
 {
 	struct xe_vm *vm = xe_vma_vm(vma);
 	struct xe_tile *tile = gt_to_tile(gt);
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
 	struct dma_fence *fence;
-	ktime_t end = 0;
 	int err, needs_vram;
 
 	lockdep_assert_held_write(&vm->lock);
@@ -127,12 +127,11 @@ retry_userptr:
 	}
 
 	/* Lock VM and BOs dma-resv */
-	drm_exec_init(&exec, 0, 0);
+	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
 	drm_exec_until_all_locked(&exec) {
 		err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram);
 		drm_exec_retry_on_contention(&exec);
-		if (xe_vm_validate_should_retry(&exec, err, &end))
-			err = -EAGAIN;
+		xe_validation_retry_on_oom(&ctx, &err);
 		if (err)
 			goto unlock_dma_resv;
 
@@ -143,8 +142,7 @@ retry_userptr:
 		xe_vm_set_validation_exec(vm, NULL);
 		if (IS_ERR(fence)) {
 			err = PTR_ERR(fence);
-			if (xe_vm_validate_should_retry(&exec, err, &end))
-				err = -EAGAIN;
+			xe_validation_retry_on_oom(&ctx, &err);
 			goto unlock_dma_resv;
 		}
 	}
@@ -153,7 +151,7 @@ retry_userptr:
 	dma_fence_put(fence);
 
 unlock_dma_resv:
-	drm_exec_fini(&exec);
+	xe_validation_ctx_fini(&ctx);
 	if (err == -EAGAIN)
 		goto retry_userptr;
 
@@ -535,6 +533,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
 {
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_tile *tile = gt_to_tile(gt);
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
 	struct xe_vm *vm;
 	struct xe_vma *vma;
@@ -564,15 +563,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
 		goto unlock_vm;
 
 	/* Lock VM and BOs dma-resv */
-	drm_exec_init(&exec, 0, 0);
+	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
 	drm_exec_until_all_locked(&exec) {
 		ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram);
 		drm_exec_retry_on_contention(&exec);
-		if (ret)
-			break;
+		xe_validation_retry_on_oom(&ctx, &ret);
 	}
 
-	drm_exec_fini(&exec);
+	xe_validation_ctx_fini(&ctx);
 unlock_vm:
 	up_read(&vm->lock);
 	xe_vm_put(vm);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 5656da870d6d4..dfe88924fda0d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -210,6 +210,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
 		.num_fences = 1,
 	};
 	struct drm_exec *exec = &vm_exec.exec;
+	struct xe_validation_ctx ctx;
 	struct dma_fence *pfence;
 	int err;
 	bool wait;
@@ -217,7 +218,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
 
 	down_write(&vm->lock);
-	err = drm_gpuvm_exec_lock(&vm_exec);
+	err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
 	if (err)
 		goto out_up_write;
 
@@ -249,7 +250,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
 	xe_svm_notifier_unlock(vm);
 
 out_fini:
-	drm_exec_fini(exec);
+	xe_validation_ctx_fini(&ctx);
 out_up_write:
 	up_write(&vm->lock);
 
@@ -313,39 +314,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
 	/* TODO: Inform user the VM is banned */
 }
 
-/**
- * xe_vm_validate_should_retry() - Whether to retry after a validate error.
- * @exec: The drm_exec object used for locking before validation.
- * @err: The error returned from ttm_bo_validate().
- * @end: A ktime_t cookie that should be set to 0 before first use and
- * that should be reused on subsequent calls.
- *
- * With multiple active VMs, under memory pressure, it is possible that
- * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
- * Until ttm properly handles locking in such scenarios, best thing the
- * driver can do is retry with a timeout. Check if that is necessary, and
- * if so unlock the drm_exec's objects while keeping the ticket to prepare
- * for a rerun.
- *
- * Return: true if a retry after drm_exec_init() is recommended;
- * false otherwise.
- */
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
-{
-	ktime_t cur;
-
-	if (err != -ENOMEM)
-		return false;
-
-	cur = ktime_get();
-	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
-	if (!ktime_before(cur, *end))
-		return false;
-
-	msleep(20);
-	return true;
-}
-
 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
 {
 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
@@ -476,10 +444,10 @@ void xe_vm_resume_rebind_worker(struct xe_vm *vm)
 static void preempt_rebind_work_func(struct work_struct *w)
 {
 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
 	unsigned int fence_count = 0;
 	LIST_HEAD(preempt_fences);
-	ktime_t end = 0;
 	int err = 0;
 	long wait;
 	int __maybe_unused tries = 0;
@@ -507,18 +475,19 @@ retry:
 		goto out_unlock_outer;
 	}
 
-	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+	err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
+				     (struct xe_val_flags) {.interruptible = true});
+	if (err)
+		goto out_unlock_outer;
 
 	drm_exec_until_all_locked(&exec) {
 		bool done = false;
 
 		err = xe_preempt_work_begin(&exec, vm, &done);
 		drm_exec_retry_on_contention(&exec);
+		xe_validation_retry_on_oom(&ctx, &err);
 		if (err || done) {
-			drm_exec_fini(&exec);
-			if (err && xe_vm_validate_should_retry(&exec, err, &end))
-				err = -EAGAIN;
-
+			xe_validation_ctx_fini(&ctx);
 			goto out_unlock_outer;
 		}
 	}
@@ -566,7 +535,7 @@ retry:
 	xe_svm_notifier_unlock(vm);
 
 out_unlock:
-	drm_exec_fini(&exec);
+	xe_validation_ctx_fini(&ctx);
 out_unlock_outer:
 	if (err == -EAGAIN) {
 		trace_xe_vm_rebind_worker_retry(vm);
@@ -1164,20 +1133,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
 
 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
 {
+	struct xe_device *xe = xe_vma_vm(vma)->xe;
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
-	int err;
+	int err = 0;
 
-	drm_exec_init(&exec, 0, 0);
-	drm_exec_until_all_locked(&exec) {
+	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
 		err = xe_vm_lock_vma(&exec, vma);
 		drm_exec_retry_on_contention(&exec);
 		if (XE_WARN_ON(err))
 			break;
+		xe_vma_destroy(vma, NULL);
 	}
-
-	xe_vma_destroy(vma, NULL);
-
-	drm_exec_fini(&exec);
+	xe_assert(xe, !err);
 }
 
 struct xe_vma *
@@ -2383,6 +2351,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
 		    struct xe_vma_mem_attr *attr, unsigned int flags)
 {
 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
 	struct xe_vma *vma;
 	int err = 0;
@@ -2390,9 +2359,9 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
 	lockdep_assert_held_write(&vm->lock);
 
 	if (bo) {
-		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
-		drm_exec_until_all_locked(&exec) {
-			err = 0;
+		err = 0;
+		xe_validation_guard(&ctx, &vm->xe->val, &exec,
+				    (struct xe_val_flags) {.interruptible = true}, err) {
 			if (!bo->vm) {
 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
 				drm_exec_retry_on_contention(&exec);
@@ -2401,27 +2370,35 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
 				drm_exec_retry_on_contention(&exec);
 			}
-			if (err) {
-				drm_exec_fini(&exec);
+			if (err)
 				return ERR_PTR(err);
-			}
-		}
-	}
-
-	vma = xe_vma_create(vm, bo, op->gem.offset,
-			    op->va.addr, op->va.addr +
-			    op->va.range - 1, attr, flags);
-	if (IS_ERR(vma))
-		goto err_unlock;
-
-	if (xe_vma_is_userptr(vma))
-		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
-	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
-		err = add_preempt_fences(vm, bo);
+			vma = xe_vma_create(vm, bo, op->gem.offset,
+					    op->va.addr, op->va.addr +
+					    op->va.range - 1, attr, flags);
+			if (IS_ERR(vma))
+				return vma;
+
-err_unlock:
-	if (bo)
-		drm_exec_fini(&exec);
+			if (!bo->vm) {
+				err = add_preempt_fences(vm, bo);
+				if (err) {
+					prep_vma_destroy(vm, vma, false);
+					xe_vma_destroy(vma, NULL);
+				}
+			}
+		}
+		if (err)
+			return ERR_PTR(err);
+	} else {
+		vma = xe_vma_create(vm, NULL, op->gem.offset,
+				    op->va.addr, op->va.addr +
+				    op->va.range - 1, attr, flags);
+		if (IS_ERR(vma))
+			return vma;
+
+		if (xe_vma_is_userptr(vma))
+			err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
+	}
 
 	if (err) {
 		prep_vma_destroy(vm, vma, false);
 		xe_vma_destroy_unlocked(vma);
@@ -3220,21 +3197,23 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
 						   struct xe_vma_ops *vops)
 {
+	struct xe_validation_ctx ctx;
 	struct drm_exec exec;
 	struct dma_fence *fence;
-	int err;
+	int err = 0;
 
 	lockdep_assert_held_write(&vm->lock);
 
-	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
-		      DRM_EXEC_IGNORE_DUPLICATES, 0);
-	drm_exec_until_all_locked(&exec) {
+	xe_validation_guard(&ctx, &vm->xe->val, &exec,
+			    ((struct xe_val_flags) {
+				    .interruptible = true,
+				    .exec_ignore_duplicates = true,
+			    }), err) {
 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
 		drm_exec_retry_on_contention(&exec);
-		if (err) {
-			fence = ERR_PTR(err);
-			goto unlock;
-		}
+		xe_validation_retry_on_oom(&ctx, &err);
+		if (err)
+			return ERR_PTR(err);
 
 		xe_vm_set_validation_exec(vm, &exec);
 		fence = ops_execute(vm, vops);
@@ -3242,15 +3221,13 @@ static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
 		if (IS_ERR(fence)) {
 			if (PTR_ERR(fence) == -ENODATA)
 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
-			goto unlock;
+			return fence;
 		}
 
 		vm_bind_ioctl_ops_fini(vm, vops, fence);
 	}
 
-unlock:
-	drm_exec_fini(&exec);
-	return fence;
+	return err ? ERR_PTR(err) : fence;
 }
 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
 
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index ba6636c1bd58d..ef8a5019574e6 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -260,8 +260,6 @@ static inline void xe_vm_reactivate_rebind(struct xe_vm *vm)
 	}
 }
 
-bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
-
 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
 
 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
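
For reference, the shape of the conversion applied at each call site above,
as a minimal sketch. The xe_validation_*() helpers, struct xe_val_flags and
the drm_exec macros appear exactly as in the hunks above (the helpers
themselves are introduced by an earlier patch in this series); the callee
my_lock_and_validate() is a made-up placeholder for the per-caller
locking/validation step:

	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	int err = 0;

	/*
	 * Replaces drm_exec_init()/drm_exec_until_all_locked() plus the
	 * open-coded xe_vm_validate_should_retry() -ENOMEM timeout loop.
	 */
	xe_validation_guard(&ctx, &vm->xe->val, &exec,
			    (struct xe_val_flags) {.interruptible = true}, err) {
		err = my_lock_and_validate(&exec, vm);	/* placeholder */
		/* On ww ticket contention: unlock and rerun the block. */
		drm_exec_retry_on_contention(&exec);
		/* On validation -ENOMEM: rerun with exhaustive eviction. */
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			break;
		/* ... work requiring the locks held goes here ... */
	}
	/* No drm_exec_fini(); the guard cleans up when the scope exits. */

As the hunks above show, a plain return from inside the guard is also
allowed (see new_vma() and vm_bind_ioctl_ops_execute()), which is what
lets the v2 revision drop the gotos.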