]>
Commit | Line | Data |
---|---|---|
5c17fab8 GKH |
1 | From fff1386cc889d8fb4089d285f883f8cba62d82ce Mon Sep 17 00:00:00 2001 |
2 | From: Dave Airlie <airlied@redhat.com> | |
3 | Date: Thu, 11 Apr 2024 11:15:09 +1000 | |
4 | Subject: nouveau: fix instmem race condition around ptr stores | |
5 | ||
6 | From: Dave Airlie <airlied@redhat.com> | |
7 | ||
8 | commit fff1386cc889d8fb4089d285f883f8cba62d82ce upstream. | |
9 | ||
10 | Running a lot of VK CTS in parallel against nouveau, once every | |
11 | few hours you might see something like this crash. | |
12 | ||
13 | BUG: kernel NULL pointer dereference, address: 0000000000000008 | |
14 | PGD 8000000114e6e067 P4D 8000000114e6e067 PUD 109046067 PMD 0 | |
15 | Oops: 0000 [#1] PREEMPT SMP PTI | |
16 | CPU: 7 PID: 53891 Comm: deqp-vk Not tainted 6.8.0-rc6+ #27 | |
17 | Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 | |
18 | RIP: 0010:gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] | |
19 | Code: c7 48 01 c8 49 89 45 58 85 d2 0f 84 95 00 00 00 41 0f b7 46 12 49 8b 7e 08 89 da 42 8d 2c f8 48 8b 47 08 41 83 c7 01 48 89 ee <48> 8b 40 08 ff d0 0f 1f 00 49 8b 7e 08 48 89 d9 48 8d 75 04 48 c1 | |
20 | RSP: 0000:ffffac20c5857838 EFLAGS: 00010202 | |
21 | RAX: 0000000000000000 RBX: 00000000004d8001 RCX: 0000000000000001 | |
22 | RDX: 00000000004d8001 RSI: 00000000000006d8 RDI: ffffa07afe332180 | |
23 | RBP: 00000000000006d8 R08: ffffac20c5857ad0 R09: 0000000000ffff10 | |
24 | R10: 0000000000000001 R11: ffffa07af27e2de0 R12: 000000000000001c | |
25 | R13: ffffac20c5857ad0 R14: ffffa07a96fe9040 R15: 000000000000001c | |
26 | FS: 00007fe395eed7c0(0000) GS:ffffa07e2c980000(0000) knlGS:0000000000000000 | |
27 | CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 | |
28 | CR2: 0000000000000008 CR3: 000000011febe001 CR4: 00000000003706f0 | |
29 | DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 | |
30 | DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 | |
31 | Call Trace: | |
32 | ||
33 | ... | |
34 | ||
35 | ? gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] | |
36 | ? gp100_vmm_pgt_mem+0x37/0x180 [nouveau] | |
37 | nvkm_vmm_iter+0x351/0xa20 [nouveau] | |
38 | ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] | |
39 | ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] | |
40 | ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] | |
41 | ? __lock_acquire+0x3ed/0x2170 | |
42 | ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] | |
43 | nvkm_vmm_ptes_get_map+0xc2/0x100 [nouveau] | |
44 | ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] | |
45 | ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] | |
46 | nvkm_vmm_map_locked+0x224/0x3a0 [nouveau] | |
47 | ||
48 | Adding any sort of useful debug usually makes it go away, so I hand | |
49 | wrote the function in a line, and debugged the asm. | |
50 | ||
51 | Every so often pt->memory->ptrs is NULL. This ptrs ptr is set in | |
52 | the nv50_instobj_acquire called from nvkm_kmap. | |
53 | ||
54 | If Thread A and Thread B both get to nv50_instobj_acquire around | |
55 | the same time, and Thread A hits the refcount_set line, and in | |
56 | lockstep thread B succeeds at refcount_inc_not_zero, there is a | |
57 | chance the ptrs value won't have been stored since refcount_set | |
58 | is unordered. Force a memory barrier here, I picked smp_mb, since | |
59 | we want it on all CPUs and it's write followed by a read. | |
60 | ||
61 | v2: use paired smp_rmb/smp_wmb. | |
62 | ||
63 | Cc: <stable@vger.kernel.org> | |
64 | Fixes: be55287aa5ba ("drm/nouveau/imem/nv50: embed nvkm_instobj directly into nv04_instobj") | |
65 | Signed-off-by: Dave Airlie <airlied@redhat.com> | |
66 | Signed-off-by: Danilo Krummrich <dakr@redhat.com> | |
67 | Link: https://patchwork.freedesktop.org/patch/msgid/20240411011510.2546857-1-airlied@gmail.com | |
68 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
69 | --- | |
70 | drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | 7 ++++++- | |
71 | 1 file changed, 6 insertions(+), 1 deletion(-) | |
72 | ||
73 | --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | |
74 | +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | |
75 | @@ -222,8 +222,11 @@ nv50_instobj_acquire(struct nvkm_memory | |
76 | void __iomem *map = NULL; | |
77 | ||
78 | /* Already mapped? */ | |
79 | - if (refcount_inc_not_zero(&iobj->maps)) | |
80 | + if (refcount_inc_not_zero(&iobj->maps)) { | |
81 | + /* read barrier match the wmb on refcount set */ | |
82 | + smp_rmb(); | |
83 | return iobj->map; | |
84 | + } | |
85 | ||
86 | /* Take the lock, and re-check that another thread hasn't | |
87 | * already mapped the object in the meantime. | |
88 | @@ -250,6 +253,8 @@ nv50_instobj_acquire(struct nvkm_memory | |
89 | iobj->base.memory.ptrs = &nv50_instobj_fast; | |
90 | else | |
91 | iobj->base.memory.ptrs = &nv50_instobj_slow; | |
92 | + /* barrier to ensure the ptrs are written before refcount is set */ | |
93 | + smp_wmb(); | |
94 | refcount_set(&iobj->maps, 1); | |
95 | } | |
96 |