git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/svm: Serialize migration to device if racing
Author: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Author date: Fri, 19 Dec 2025 11:33:20 +0000 (12:33 +0100)
Committer: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Commit date: Tue, 23 Dec 2025 09:00:49 +0000 (10:00 +0100)
Introduce an rw-semaphore to serialize migration to device if
it's likely that migration races with another device migration
of the same CPU address space range.
This is a temporary fix to attempt to mitigate a livelock that
might happen if many devices try to migrate a range at the same
time, and it affects only devices using the xe driver.
A longer term fix is probably improvements in the core mm
migration layer.

Suggested-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20251219113320.183860-25-thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_svm.c

index 84ff99aa3e499dc7103e47c97b807f3eb37005a8..fa2ee2c08f318e0b2c5e820a0c52e9d0770fb47b 100644 (file)
@@ -1593,10 +1593,12 @@ struct drm_pagemap *xe_vma_resolve_pagemap(struct xe_vma *vma, struct xe_tile *t
 int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *ctx,
                      struct drm_pagemap *dpagemap)
 {
+       static DECLARE_RWSEM(driver_migrate_lock);
        struct xe_vm *vm = range_to_vm(&range->base);
        enum drm_gpusvm_scan_result migration_state;
        struct xe_device *xe = vm->xe;
        int err, retries = 1;
+       bool write_locked = false;
 
        xe_assert(range_to_vm(&range->base)->xe, range->base.pages.flags.migrate_devmem);
        range_debug(range, "ALLOCATE VRAM");
@@ -1615,16 +1617,32 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
                drm_dbg(&xe->drm, "Request migration to device memory on \"%s\".\n",
                        dpagemap->drm->unique);
 
+       err = down_read_interruptible(&driver_migrate_lock);
+       if (err)
+               return err;
        do {
                err = drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
                                              xe_svm_range_end(range),
                                              range->base.gpusvm->mm,
                                              ctx->timeslice_ms);
 
-               if (err == -EBUSY && retries)
-                       drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
+               if (err == -EBUSY && retries) {
+                       if (!write_locked) {
+                               int lock_err;
 
+                               up_read(&driver_migrate_lock);
+                               lock_err = down_write_killable(&driver_migrate_lock);
+                               if (lock_err)
+                                       return lock_err;
+                               write_locked = true;
+                       }
+                       drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
+               }
        } while (err == -EBUSY && retries--);
+       if (write_locked)
+               up_write(&driver_migrate_lock);
+       else
+               up_read(&driver_migrate_lock);
 
        return err;
 }