From: Sasha Levin Date: Fri, 11 Nov 2022 18:17:30 +0000 (-0500) Subject: Fixes for 5.15 X-Git-Tag: v5.10.155~54 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b3bb02532dee8ffd4e1c686cab0c1fe407d05b9e;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.15 Signed-off-by: Sasha Levin --- diff --git a/queue-5.15/drm-amdkfd-avoid-recursive-lock-in-migrations-back-t.patch b/queue-5.15/drm-amdkfd-avoid-recursive-lock-in-migrations-back-t.patch new file mode 100644 index 00000000000..1c7316a417d --- /dev/null +++ b/queue-5.15/drm-amdkfd-avoid-recursive-lock-in-migrations-back-t.patch @@ -0,0 +1,87 @@ +From 258c2743e53efda174f8ee4133ee4ff0e7b6ac2f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 29 Oct 2021 13:30:40 -0500 +Subject: drm/amdkfd: avoid recursive lock in migrations back to RAM + +From: Alex Sierra + +[ Upstream commit a6283010e2907a5576f96b839e1a1c82659f137c ] + +[Why]: +When we call hmm_range_fault to map memory after a migration, we don't +expect memory to be migrated again as a result of hmm_range_fault. The +driver ensures that all memory is in GPU-accessible locations so that +no migration should be needed. However, there is one corner case where +hmm_range_fault can unexpectedly cause a migration from DEVICE_PRIVATE +back to system memory due to a write-fault when a system memory page in +the same range was mapped read-only (e.g. COW). Ranges with individual +pages in different locations are usually the result of failed page +migrations (e.g. page lock contention). The unexpected migration back +to system memory causes a deadlock from recursive locking in our +driver. + +[How]: +Creating a task reference new member under svm_range_list struct. +Setting this with "current" reference, right before the hmm_range_fault +is called. This member is checked against "current" reference at +svm_migrate_to_ram callback function. If equal, the migration will be +ignored. 
+ +Signed-off-by: Alex Sierra +Reviewed-by: Felix Kuehling +Signed-off-by: Alex Deucher +Stable-dep-of: 5b994354af3c ("drm/amdkfd: Fix NULL pointer dereference in svm_migrate_to_ram()") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 5 +++++ + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + + drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 ++ + 3 files changed, 8 insertions(+) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +index 4a16e3c257b9..a458c19b371a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +@@ -796,6 +796,11 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + pr_debug("failed find process at fault address 0x%lx\n", addr); + return VM_FAULT_SIGBUS; + } ++ if (READ_ONCE(p->svms.faulting_task) == current) { ++ pr_debug("skipping ram migration\n"); ++ kfd_unref_process(p); ++ return 0; ++ } + addr >>= PAGE_SHIFT; + pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr); + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 6d8f9bb2d905..47ec820cae72 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -755,6 +755,7 @@ struct svm_range_list { + atomic_t evicted_ranges; + struct delayed_work restore_work; + DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); ++ struct task_struct *faulting_task; + }; + + /* Process data */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +index 74e6f613be02..22a70aaccf13 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +@@ -1489,9 +1489,11 @@ static int svm_range_validate_and_map(struct mm_struct *mm, + + next = min(vma->vm_end, end); + npages = (next - addr) >> PAGE_SHIFT; ++ WRITE_ONCE(p->svms.faulting_task, current); + r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, + addr, 
npages, &hmm_range, + readonly, true, owner); ++ WRITE_ONCE(p->svms.faulting_task, NULL); + if (r) { + pr_debug("failed %d to get svm range pages\n", r); + goto unreserve_out; +-- +2.35.1 + diff --git a/queue-5.15/drm-amdkfd-fix-null-pointer-dereference-in-svm_migra.patch b/queue-5.15/drm-amdkfd-fix-null-pointer-dereference-in-svm_migra.patch new file mode 100644 index 00000000000..9a4e81bf4c0 --- /dev/null +++ b/queue-5.15/drm-amdkfd-fix-null-pointer-dereference-in-svm_migra.patch @@ -0,0 +1,44 @@ +From b8569accdf54077041c7a773405ba4e2ab493477 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 26 Oct 2022 10:00:54 +0800 +Subject: drm/amdkfd: Fix NULL pointer dereference in svm_migrate_to_ram() + +From: Yang Li + +[ Upstream commit 5b994354af3cab770bf13386469c5725713679af ] + +./drivers/gpu/drm/amd/amdkfd/kfd_migrate.c:985:58-62: ERROR: p is NULL but dereferenced. + +Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2549 +Reported-by: Abaci Robot +Signed-off-by: Yang Li +Reviewed-by: Felix Kuehling +Signed-off-by: Felix Kuehling +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +index 0cc425f198b4..93307be8f7a9 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +@@ -865,12 +865,10 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + out_unlock_svms: + mutex_unlock(&p->svms.lock); + out_unref_process: ++ pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr); + kfd_unref_process(p); + out_mmput: + mmput(mm); +- +- pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr); +- + return r ? 
VM_FAULT_SIGBUS : 0; + } + +-- +2.35.1 + diff --git a/queue-5.15/drm-amdkfd-handle-cpu-fault-on-cow-mapping.patch b/queue-5.15/drm-amdkfd-handle-cpu-fault-on-cow-mapping.patch new file mode 100644 index 00000000000..266f68e1ad7 --- /dev/null +++ b/queue-5.15/drm-amdkfd-handle-cpu-fault-on-cow-mapping.patch @@ -0,0 +1,116 @@ +From 665d0e8e33f3346c79a23cf4d8d54d311696cda5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 7 Sep 2022 12:30:12 -0400 +Subject: drm/amdkfd: handle CPU fault on COW mapping + +From: Philip Yang + +[ Upstream commit e1f84eef313f4820cca068a238c645d0a38c6a9b ] + +If CPU page fault in a page with zone_device_data svm_bo from another +process, that means it is COW mapping in the child process and the +range is migrated to VRAM by parent process. Migrate the parent +process range back to system memory to recover the CPU page fault. + +Signed-off-by: Philip Yang +Reviewed-by: Felix Kuehling +Signed-off-by: Alex Deucher +Stable-dep-of: 5b994354af3c ("drm/amdkfd: Fix NULL pointer dereference in svm_migrate_to_ram()") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 42 ++++++++++++++++-------- + 1 file changed, 29 insertions(+), 13 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +index a458c19b371a..0cc425f198b4 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +@@ -780,7 +780,7 @@ svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, + static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + { + unsigned long addr = vmf->address; +- struct vm_area_struct *vma; ++ struct svm_range_bo *svm_bo; + enum svm_work_list_ops op; + struct svm_range *parent; + struct svm_range *prange; +@@ -788,29 +788,42 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + struct mm_struct *mm; + int r = 0; + +- vma = vmf->vma; +- mm = vma->vm_mm; ++ svm_bo = vmf->page->zone_device_data; ++ if 
(!svm_bo) { ++ pr_debug("failed get device page at addr 0x%lx\n", addr); ++ return VM_FAULT_SIGBUS; ++ } ++ if (!mmget_not_zero(svm_bo->eviction_fence->mm)) { ++ pr_debug("addr 0x%lx of process mm is detroyed\n", addr); ++ return VM_FAULT_SIGBUS; ++ } ++ ++ mm = svm_bo->eviction_fence->mm; ++ if (mm != vmf->vma->vm_mm) ++ pr_debug("addr 0x%lx is COW mapping in child process\n", addr); + +- p = kfd_lookup_process_by_mm(vma->vm_mm); ++ p = kfd_lookup_process_by_mm(mm); + if (!p) { + pr_debug("failed find process at fault address 0x%lx\n", addr); +- return VM_FAULT_SIGBUS; ++ r = VM_FAULT_SIGBUS; ++ goto out_mmput; + } + if (READ_ONCE(p->svms.faulting_task) == current) { + pr_debug("skipping ram migration\n"); +- kfd_unref_process(p); +- return 0; ++ r = 0; ++ goto out_unref_process; + } +- addr >>= PAGE_SHIFT; ++ + pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr); ++ addr >>= PAGE_SHIFT; + + mutex_lock(&p->svms.lock); + + prange = svm_range_from_addr(&p->svms, addr, &parent); + if (!prange) { +- pr_debug("cannot find svm range at 0x%lx\n", addr); ++ pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr); + r = -EFAULT; +- goto out; ++ goto out_unlock_svms; + } + + mutex_lock(&parent->migrate_mutex); +@@ -834,8 +847,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + + r = svm_migrate_vram_to_ram(prange, mm); + if (r) +- pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, +- prange, prange->start, prange->last); ++ pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n", ++ r, prange->svms, prange, prange->start, prange->last); + + /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ + if (p->xnack_enabled && parent == prange) +@@ -849,9 +862,12 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) + if (prange != parent) + mutex_unlock(&prange->migrate_mutex); + mutex_unlock(&parent->migrate_mutex); +-out: ++out_unlock_svms: + mutex_unlock(&p->svms.lock); ++out_unref_process: + 
kfd_unref_process(p); ++out_mmput: ++ mmput(mm); + + pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr); + +-- +2.35.1 + diff --git a/queue-5.15/fuse-fix-readdir-cache-race.patch b/queue-5.15/fuse-fix-readdir-cache-race.patch new file mode 100644 index 00000000000..6f4d9a90fe2 --- /dev/null +++ b/queue-5.15/fuse-fix-readdir-cache-race.patch @@ -0,0 +1,65 @@ +From 919dd2e7282a175d0c81cbbaf651a5e13a171820 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Oct 2022 17:18:58 +0200 +Subject: fuse: fix readdir cache race + +From: Miklos Szeredi + +[ Upstream commit 9fa248c65bdbf5af0a2f74dd38575acfc8dfd2bf ] + +There's a race in fuse's readdir cache that can result in an uninitialized +page being read. The page lock is supposed to prevent this from happening +but in the following case it doesn't: + +Two fuse_add_dirent_to_cache() start out and get the same parameters +(size=0,offset=0). One of them wins the race to create and lock the page, +after which it fills in data, sets rdc.size and unlocks the page. + +In the meantime the page gets evicted from the cache before the other +instance gets to run. That one also creates the page, but finds the +size to be mismatched, bails out and leaves the uninitialized page in the +cache. + +Fix by marking a filled page uptodate and ignoring non-uptodate pages. 
+ +Reported-by: Frank Sorenson +Fixes: 5d7bc7e8680c ("fuse: allow using readdir cache") +Cc: # v4.20 +Signed-off-by: Miklos Szeredi +Signed-off-by: Sasha Levin +--- + fs/fuse/readdir.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c +index bc267832310c..d5294e663df5 100644 +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file, + goto unlock; + + addr = kmap_atomic(page); +- if (!offset) ++ if (!offset) { + clear_page(addr); ++ SetPageUptodate(page); ++ } + memcpy(addr + offset, dirent, reclen); + kunmap_atomic(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; +@@ -516,6 +518,12 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); ++ /* Page gone missing, then re-added to cache, but not initialized? */ ++ if (page && !PageUptodate(page)) { ++ unlock_page(page); ++ put_page(page); ++ page = NULL; ++ } + spin_lock(&fi->rdc.lock); + if (!page) { + /* +-- +2.35.1 + diff --git a/queue-5.15/series b/queue-5.15/series index a1bba147fbd..68e87055770 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -1,2 +1,6 @@ thunderbolt-tear-down-existing-tunnels-when-resuming-from-hibernate.patch thunderbolt-add-dp-out-resource-when-dp-tunnel-is-discovered.patch +fuse-fix-readdir-cache-race.patch +drm-amdkfd-avoid-recursive-lock-in-migrations-back-t.patch +drm-amdkfd-handle-cpu-fault-on-cow-mapping.patch +drm-amdkfd-fix-null-pointer-dereference-in-svm_migra.patch