]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: unmap all user mappings of framebuffer and doorbell before mode1 reset
authorYifan Zhang <yifan1.zhang@amd.com>
Mon, 11 May 2026 14:14:23 +0000 (22:14 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 15:45:41 +0000 (11:45 -0400)
During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers during
this window can result in uncompleted PCIe transactions, leading to NMI panics or
system hangs.

To prevent this, unmap all of the applications' mappings of the framebuffer
and doorbell BARs before mode1 reset. Also prevent new mappings from coming in
during the reset process.

v2: remove inode in kfd_dev (Christian)
v3: correct unmap offset (Felix), remove prevent new mappings part
to avoid deadlock (Christian)

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index d9e283f3b57d5b267baed3889d736152ff3f46e2..9783a3cefb04b0c2dc75c419a9cb716a894440d3 100644 (file)
@@ -36,6 +36,9 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_umc.h"
 #include "amdgpu_reset.h"
+#if IS_ENABLED(CONFIG_HSA_AMD)
+#include "kfd_priv.h"
+#endif
 
 /* Total memory size in system memory and all GPU VRAM. Used to
  * estimate worst case amount of memory to reserve for page tables
@@ -320,6 +323,28 @@ void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
                (void)amdgpu_reset_domain_schedule(adev->reset_domain, &adev->kfd.reset_work);
 }
 
+/*
+ * Drop the userspace mappings of the doorbell and MMIO ranges for every KFD
+ * node of @adev. This is a no-op when CONFIG_HSA_AMD is disabled or when
+ * @adev has no KFD device.
+ */
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
+{
+#if IS_ENABLED(CONFIG_HSA_AMD)
+       struct kfd_dev *kfd_dev = adev->kfd.dev;
+       struct kfd_node *cur;
+       unsigned int idx;
+
+       if (!kfd_dev)
+               return;
+
+       for (idx = 0; idx < kfd_dev->num_nodes; idx++) {
+               cur = kfd_dev->nodes[idx];
+
+               /* Doorbell range: one process slice at this node's mmap offset. */
+               kfd_dev_unmap_mapping_range(KFD_MMAP_GPU_ID(cur->id) |
+                                           KFD_MMAP_TYPE_DOORBELL,
+                                           kfd_doorbell_process_slice(kfd_dev));
+
+               /* MMIO range: a single page at this node's mmap offset. */
+               kfd_dev_unmap_mapping_range(KFD_MMAP_GPU_ID(cur->id) |
+                                           KFD_MMAP_TYPE_MMIO,
+                                           PAGE_SIZE);
+       }
+#endif
+}
+
 int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
                                u32 domain, void **mem_obj, uint64_t *gpu_addr,
                                void **cpu_ptr, bool cp_mqd_gfx9)
index 2bf6a31c194da25cd02124c12a8f207106674791..5333e052d56db6a988a07c077d24f59d1f0efe32 100644 (file)
@@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
 
 u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
 
index 179056d384ac40eb4d5962701067fe81e73f0fd4..21a3fb574d53e2412f719b7463e60e7b3c4cd9c0 100644 (file)
@@ -5842,6 +5842,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
 
+       /* unmap all the mappings of doorbell and framebuffer to prevent user space from
+        * accessing them
+        */
+       unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+       amdgpu_amdkfd_clear_kfd_mapping(adev);
+
        amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
                                      hive, need_emergency_restart);
        if (need_emergency_restart)
index 04ae3cb3a65ca172d08e1471298b93c0ec048a5e..81bcb16eb6ddcd7e67c6389e841c3a53b6530c86 100644 (file)
@@ -69,6 +69,21 @@ static const struct class kfd_class = {
        .name = kfd_dev_name,
 };
 
+/*
+ * Cache the address space of the chardev on first open so that the reset
+ * path can drop all userspace mappings of doorbell and MMIO ranges via
+ * unmap_mapping_range().
+ *
+ * The pointer stays NULL until kfd_open() installs the inode's i_mapping
+ * with cmpxchg(); kfd_dev_unmap_mapping_range() reads it with READ_ONCE()
+ * and does nothing while it is still NULL.
+ */
+static struct address_space *kfd_dev_mapping;
+
+/*
+ * Unmap the userspace mappings of the cached /dev/kfd address space that
+ * fall within [@holebegin, @holebegin + @holelen). Does nothing if no
+ * address space has been cached yet.
+ */
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
+{
+       struct address_space *cached = READ_ONCE(kfd_dev_mapping);
+
+       /* Nothing cached yet, so there is nothing to unmap. */
+       if (!cached)
+               return;
+
+       unmap_mapping_range(cached, holebegin, holelen, 1);
+}
+
 static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)
 {
        struct kfd_process_device *pdd;
@@ -135,6 +150,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
        if (iminor(inode) != 0)
                return -ENODEV;
 
+       /*
+        * /dev/kfd is a single chardev so all opens share one inode. Cache
+        * its address_space on the first open for use by the reset path.
+        */
+       if (!READ_ONCE(kfd_dev_mapping))
+               cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
+
        is_32bit_user_mode = in_compat_syscall();
 
        if (is_32bit_user_mode) {
index 482bcfa10f82f76489c61e931bb6a04af7c07b1f..acd0e41e744c910975ab482ee5e112bccde75ab5 100644 (file)
@@ -399,6 +399,7 @@ enum kfd_mempool {
 /* Character device interface */
 int kfd_chardev_init(void);
 void kfd_chardev_exit(void);
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
 
 /**
  * enum kfd_unmap_queues_filter - Enum for queue filters.