git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: unmap all user mappings of framebuffer and doorbell before mode1 reset
author Yifan Zhang <yifan1.zhang@amd.com>
Mon, 11 May 2026 14:14:23 +0000 (22:14 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 16:14:55 +0000 (12:14 -0400)
During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers during
this window can result in uncompleted PCIe transactions, leading to NMI panics or
system hangs.

To prevent this, unmap all of the applications' mappings of the framebuffer
and doorbell BARs before mode1 reset. Also prevent new mappings from coming in
during the reset process.

v2: remove inode in kfd_dev (Christian)
v3: correct unmap offset (Felix), remove prevent new mappings part
to avoid deadlock (Christian)

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 70cadefcc6160c575b04f763ada34c20e868d577)

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index d9e283f3b57d5b267baed3889d736152ff3f46e2..9783a3cefb04b0c2dc75c419a9cb716a894440d3 100644 (file)
@@ -36,6 +36,9 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_umc.h"
 #include "amdgpu_reset.h"
+#if IS_ENABLED(CONFIG_HSA_AMD)
+#include "kfd_priv.h"
+#endif
 
 /* Total memory size in system memory and all GPU VRAM. Used to
  * estimate worst case amount of memory to reserve for page tables
@@ -320,6 +323,28 @@ void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
                (void)amdgpu_reset_domain_schedule(adev->reset_domain, &adev->kfd.reset_work);
 }
 
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
+{
+#if IS_ENABLED(CONFIG_HSA_AMD)
+       struct kfd_dev *kfd = adev->kfd.dev;
+       unsigned int i;
+
+       if (!kfd) /* no KFD device behind this adev, so nothing to unmap */
+               return;
+
+       for (i = 0; i < kfd->num_nodes; i++) { /* mmap offsets are keyed by node id */
+               struct kfd_node *node = kfd->nodes[i];
+
+               kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_DOORBELL | /* doorbell window of this node */
+                                           KFD_MMAP_GPU_ID(node->id),
+                                           kfd_doorbell_process_slice(kfd)); /* presumably one process's doorbell slice -- confirm */
+               kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_MMIO | /* MMIO window of this node */
+                                           KFD_MMAP_GPU_ID(node->id),
+                                           PAGE_SIZE); /* length: a single page */
+       }
+#endif /* IS_ENABLED(CONFIG_HSA_AMD) */
+}
+
 int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
                                u32 domain, void **mem_obj, uint64_t *gpu_addr,
                                void **cpu_ptr, bool cp_mqd_gfx9)
index cdbab7f8cee8c53dbbe1ddd0ce8731ae05c65c34..2b4108f83f48272436911697dbe4fc193fea6384 100644 (file)
@@ -358,6 +358,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
 
 u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
 
index 1424c98d2006a83415fea2b0762aee07ffee628f..feab90e3efd1e2a3fdc188ef16063fc5e210cc70 100644 (file)
@@ -5835,6 +5835,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
 
+       /* unmap all the mappings of doorbell and framebuffer to prevent user space from
+        * accessing them
+        */
+       unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+       amdgpu_amdkfd_clear_kfd_mapping(adev);
+
        amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
                                      hive, need_emergency_restart);
        if (need_emergency_restart)
index f95bf6d9553463737754e95030aa041c7961ceb1..03b266b26738612d012f0a036bc318c5ee8d5f03 100644 (file)
@@ -67,6 +67,21 @@ static const struct class kfd_class = {
        .name = kfd_dev_name,
 };
 
+/*
+ * Cache the address space of the chardev on first open so that the reset
+ * path can drop all userspace mappings of doorbell and MMIO ranges via
+ * unmap_mapping_range().
+ */
+static struct address_space *kfd_dev_mapping;
+
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
+{
+       struct address_space *mapping = READ_ONCE(kfd_dev_mapping); /* NULL until kfd_open() has cached it */
+
+       if (mapping) /* skip when /dev/kfd has never been opened */
+               unmap_mapping_range(mapping, holebegin, holelen, 1); /* 1 = even_cows: drop private COW pages too */
+}
+
 static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)
 {
        struct kfd_process_device *pdd;
@@ -133,6 +148,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
        if (iminor(inode) != 0)
                return -ENODEV;
 
+       /*
+        * /dev/kfd is a single chardev so all opens share one inode. Cache
+        * its address_space on the first open for use by the reset path.
+        */
+       if (!READ_ONCE(kfd_dev_mapping))
+               cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
+
        is_32bit_user_mode = in_compat_syscall();
 
        if (is_32bit_user_mode) {
index 7b5b12206919874747c86775d80d20fc7a39f98d..d5b07789eda438833bc7898916298d740ae33df9 100644 (file)
@@ -395,6 +395,7 @@ enum kfd_mempool {
 /* Character device interface */
 int kfd_chardev_init(void);
 void kfd_chardev_exit(void);
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
 
 /**
  * enum kfd_unmap_queues_filter - Enum for queue filters.