]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - releases/4.19.29/bpf-fix-lockdep-false-positive-in-percpu_freelist.patch
fixes for 4.19
[thirdparty/kernel/stable-queue.git] / releases / 4.19.29 / bpf-fix-lockdep-false-positive-in-percpu_freelist.patch
1 From 04d658ad2d2a5665fda178982e35a357c67c1a3e Mon Sep 17 00:00:00 2001
2 From: Alexei Starovoitov <ast@kernel.org>
3 Date: Wed, 30 Jan 2019 18:12:43 -0800
4 Subject: bpf: fix lockdep false positive in percpu_freelist
5
6 [ Upstream commit a89fac57b5d080771efd4d71feaae19877cf68f0 ]
7
8 Lockdep warns about false positive:
9 [ 12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40
10 [ 12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past:
11 [ 12.493275] (&rq->lock){-.-.}
12 [ 12.493276]
13 [ 12.493276]
14 [ 12.493276] and interrupts could create inverse lock ordering between them.
15 [ 12.493276]
16 [ 12.494435]
17 [ 12.494435] other info that might help us debug this:
18 [ 12.494979] Possible interrupt unsafe locking scenario:
19 [ 12.494979]
20 [ 12.495518] CPU0 CPU1
21 [ 12.495879] ---- ----
22 [ 12.496243] lock(&head->lock);
23 [ 12.496502] local_irq_disable();
24 [ 12.496969] lock(&rq->lock);
25 [ 12.497431] lock(&head->lock);
26 [ 12.497890] <Interrupt>
27 [ 12.498104] lock(&rq->lock);
28 [ 12.498368]
29 [ 12.498368] *** DEADLOCK ***
30 [ 12.498368]
31 [ 12.498837] 1 lock held by dd/276:
32 [ 12.499110] #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240
33 [ 12.499747]
34 [ 12.499747] the shortest dependencies between 2nd lock and 1st lock:
35 [ 12.500389] -> (&rq->lock){-.-.} {
36 [ 12.500669] IN-HARDIRQ-W at:
37 [ 12.500934] _raw_spin_lock+0x2f/0x40
38 [ 12.501373] scheduler_tick+0x4c/0xf0
39 [ 12.501812] update_process_times+0x40/0x50
40 [ 12.502294] tick_periodic+0x27/0xb0
41 [ 12.502723] tick_handle_periodic+0x1f/0x60
42 [ 12.503203] timer_interrupt+0x11/0x20
43 [ 12.503651] __handle_irq_event_percpu+0x43/0x2c0
44 [ 12.504167] handle_irq_event_percpu+0x20/0x50
45 [ 12.504674] handle_irq_event+0x37/0x60
46 [ 12.505139] handle_level_irq+0xa7/0x120
47 [ 12.505601] handle_irq+0xa1/0x150
48 [ 12.506018] do_IRQ+0x77/0x140
49 [ 12.506411] ret_from_intr+0x0/0x1d
50 [ 12.506834] _raw_spin_unlock_irqrestore+0x53/0x60
51 [ 12.507362] __setup_irq+0x481/0x730
52 [ 12.507789] setup_irq+0x49/0x80
53 [ 12.508195] hpet_time_init+0x21/0x32
54 [ 12.508644] x86_late_time_init+0xb/0x16
55 [ 12.509106] start_kernel+0x390/0x42a
56 [ 12.509554] secondary_startup_64+0xa4/0xb0
57 [ 12.510034] IN-SOFTIRQ-W at:
58 [ 12.510305] _raw_spin_lock+0x2f/0x40
59 [ 12.510772] try_to_wake_up+0x1c7/0x4e0
60 [ 12.511220] swake_up_locked+0x20/0x40
61 [ 12.511657] swake_up_one+0x1a/0x30
62 [ 12.512070] rcu_process_callbacks+0xc5/0x650
63 [ 12.512553] __do_softirq+0xe6/0x47b
64 [ 12.512978] irq_exit+0xc3/0xd0
65 [ 12.513372] smp_apic_timer_interrupt+0xa9/0x250
66 [ 12.513876] apic_timer_interrupt+0xf/0x20
67 [ 12.514343] default_idle+0x1c/0x170
68 [ 12.514765] do_idle+0x199/0x240
69 [ 12.515159] cpu_startup_entry+0x19/0x20
70 [ 12.515614] start_kernel+0x422/0x42a
71 [ 12.516045] secondary_startup_64+0xa4/0xb0
72 [ 12.516521] INITIAL USE at:
73 [ 12.516774] _raw_spin_lock_irqsave+0x38/0x50
74 [ 12.517258] rq_attach_root+0x16/0xd0
75 [ 12.517685] sched_init+0x2f2/0x3eb
76 [ 12.518096] start_kernel+0x1fb/0x42a
77 [ 12.518525] secondary_startup_64+0xa4/0xb0
78 [ 12.518986] }
79 [ 12.519132] ... key at: [<ffffffff82b7bc28>] __key.71384+0x0/0x8
80 [ 12.519649] ... acquired at:
81 [ 12.519892] pcpu_freelist_pop+0x7b/0xd0
82 [ 12.520221] bpf_get_stackid+0x1d2/0x4d0
83 [ 12.520563] ___bpf_prog_run+0x8b4/0x11a0
84 [ 12.520887]
85 [ 12.521008] -> (&head->lock){+...} {
86 [ 12.521292] HARDIRQ-ON-W at:
87 [ 12.521539] _raw_spin_lock+0x2f/0x40
88 [ 12.521950] pcpu_freelist_push+0x2a/0x40
89 [ 12.522396] bpf_get_stackid+0x494/0x4d0
90 [ 12.522828] ___bpf_prog_run+0x8b4/0x11a0
91 [ 12.523296] INITIAL USE at:
92 [ 12.523537] _raw_spin_lock+0x2f/0x40
93 [ 12.523944] pcpu_freelist_populate+0xc0/0x120
94 [ 12.524417] htab_map_alloc+0x405/0x500
95 [ 12.524835] __do_sys_bpf+0x1a3/0x1a90
96 [ 12.525253] do_syscall_64+0x4a/0x180
97 [ 12.525659] entry_SYSCALL_64_after_hwframe+0x49/0xbe
98 [ 12.526167] }
99 [ 12.526311] ... key at: [<ffffffff838f7668>] __key.13130+0x0/0x8
100 [ 12.526812] ... acquired at:
101 [ 12.527047] __lock_acquire+0x521/0x1350
102 [ 12.527371] lock_acquire+0x98/0x190
103 [ 12.527680] _raw_spin_lock+0x2f/0x40
104 [ 12.527994] pcpu_freelist_push+0x2a/0x40
105 [ 12.528325] bpf_get_stackid+0x494/0x4d0
106 [ 12.528645] ___bpf_prog_run+0x8b4/0x11a0
107 [ 12.528970]
108 [ 12.529092]
109 [ 12.529092] stack backtrace:
110 [ 12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475
111 [ 12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
112 [ 12.530750] Call Trace:
113 [ 12.530948] dump_stack+0x5f/0x8b
114 [ 12.531248] check_usage_backwards+0x10c/0x120
115 [ 12.531598] ? ___bpf_prog_run+0x8b4/0x11a0
116 [ 12.531935] ? mark_lock+0x382/0x560
117 [ 12.532229] mark_lock+0x382/0x560
118 [ 12.532496] ? print_shortest_lock_dependencies+0x180/0x180
119 [ 12.532928] __lock_acquire+0x521/0x1350
120 [ 12.533271] ? find_get_entry+0x17f/0x2e0
121 [ 12.533586] ? find_get_entry+0x19c/0x2e0
122 [ 12.533902] ? lock_acquire+0x98/0x190
123 [ 12.534196] lock_acquire+0x98/0x190
124 [ 12.534482] ? pcpu_freelist_push+0x2a/0x40
125 [ 12.534810] _raw_spin_lock+0x2f/0x40
126 [ 12.535099] ? pcpu_freelist_push+0x2a/0x40
127 [ 12.535432] pcpu_freelist_push+0x2a/0x40
128 [ 12.535750] bpf_get_stackid+0x494/0x4d0
129 [ 12.536062] ___bpf_prog_run+0x8b4/0x11a0
130
131 It has been explained that it is a false positive here:
132 https://lkml.org/lkml/2018/7/25/756
133 Recap:
134 - stackmap uses pcpu_freelist
135 - The lock in pcpu_freelist is a percpu lock
136 - stackmap is only used by tracing bpf_prog
137 - A tracing bpf_prog cannot be run if another bpf_prog
138 has already been running (ensured by the percpu bpf_prog_active counter).
139
140 Eric pointed out that this lockdep splat stops other
141 legit lockdep splats in selftests/bpf/test_progs.c.
142
143 Fix this by calling local_irq_save/restore for stackmap.
144
145 Another false positive had also been worked around by calling
146 local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat").
147 That commit added unnecessary irq_save/restore to fast path of
148 bpf hash map. irqs are already disabled at that point, since htab
149 is holding per bucket spin_lock with irqsave.
150
151 Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop
152 function w/o irqsave and convert pcpu_freelist_push/pop to irqsave
153 to be used elsewhere (right now only in stackmap).
154 It stops lockdep false positive in stackmap with a bit of acceptable overhead.
155
156 Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
157 Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
158 Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
159 Acked-by: Martin KaFai Lau <kafai@fb.com>
160 Signed-off-by: Alexei Starovoitov <ast@kernel.org>
161 Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
162 Signed-off-by: Sasha Levin <sashal@kernel.org>
163 ---
164 kernel/bpf/hashtab.c | 4 ++--
165 kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++-----------
166 kernel/bpf/percpu_freelist.h | 4 ++++
167 3 files changed, 35 insertions(+), 14 deletions(-)
168
169 diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
170 index 03cc59ee9c95..cebadd6af4d9 100644
171 --- a/kernel/bpf/hashtab.c
172 +++ b/kernel/bpf/hashtab.c
173 @@ -677,7 +677,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
174 }
175
176 if (htab_is_prealloc(htab)) {
177 - pcpu_freelist_push(&htab->freelist, &l->fnode);
178 + __pcpu_freelist_push(&htab->freelist, &l->fnode);
179 } else {
180 atomic_dec(&htab->count);
181 l->htab = htab;
182 @@ -739,7 +739,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
183 } else {
184 struct pcpu_freelist_node *l;
185
186 - l = pcpu_freelist_pop(&htab->freelist);
187 + l = __pcpu_freelist_pop(&htab->freelist);
188 if (!l)
189 return ERR_PTR(-E2BIG);
190 l_new = container_of(l, struct htab_elem, fnode);
191 diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
192 index 673fa6fe2d73..0c1b4ba9e90e 100644
193 --- a/kernel/bpf/percpu_freelist.c
194 +++ b/kernel/bpf/percpu_freelist.c
195 @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
196 free_percpu(s->freelist);
197 }
198
199 -static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
200 - struct pcpu_freelist_node *node)
201 +static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
202 + struct pcpu_freelist_node *node)
203 {
204 raw_spin_lock(&head->lock);
205 node->next = head->first;
206 @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
207 raw_spin_unlock(&head->lock);
208 }
209
210 -void pcpu_freelist_push(struct pcpu_freelist *s,
211 +void __pcpu_freelist_push(struct pcpu_freelist *s,
212 struct pcpu_freelist_node *node)
213 {
214 struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
215
216 - __pcpu_freelist_push(head, node);
217 + ___pcpu_freelist_push(head, node);
218 +}
219 +
220 +void pcpu_freelist_push(struct pcpu_freelist *s,
221 + struct pcpu_freelist_node *node)
222 +{
223 + unsigned long flags;
224 +
225 + local_irq_save(flags);
226 + __pcpu_freelist_push(s, node);
227 + local_irq_restore(flags);
228 }
229
230 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
231 @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
232 for_each_possible_cpu(cpu) {
233 again:
234 head = per_cpu_ptr(s->freelist, cpu);
235 - __pcpu_freelist_push(head, buf);
236 + ___pcpu_freelist_push(head, buf);
237 i++;
238 buf += elem_size;
239 if (i == nr_elems)
240 @@ -74,14 +84,12 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
241 local_irq_restore(flags);
242 }
243
244 -struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
245 +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
246 {
247 struct pcpu_freelist_head *head;
248 struct pcpu_freelist_node *node;
249 - unsigned long flags;
250 int orig_cpu, cpu;
251
252 - local_irq_save(flags);
253 orig_cpu = cpu = raw_smp_processor_id();
254 while (1) {
255 head = per_cpu_ptr(s->freelist, cpu);
256 @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
257 node = head->first;
258 if (node) {
259 head->first = node->next;
260 - raw_spin_unlock_irqrestore(&head->lock, flags);
261 + raw_spin_unlock(&head->lock);
262 return node;
263 }
264 raw_spin_unlock(&head->lock);
265 cpu = cpumask_next(cpu, cpu_possible_mask);
266 if (cpu >= nr_cpu_ids)
267 cpu = 0;
268 - if (cpu == orig_cpu) {
269 - local_irq_restore(flags);
270 + if (cpu == orig_cpu)
271 return NULL;
272 - }
273 }
274 }
275 +
276 +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
277 +{
278 + struct pcpu_freelist_node *ret;
279 + unsigned long flags;
280 +
281 + local_irq_save(flags);
282 + ret = __pcpu_freelist_pop(s);
283 + local_irq_restore(flags);
284 + return ret;
285 +}
286 diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
287 index 3049aae8ea1e..c3960118e617 100644
288 --- a/kernel/bpf/percpu_freelist.h
289 +++ b/kernel/bpf/percpu_freelist.h
290 @@ -22,8 +22,12 @@ struct pcpu_freelist_node {
291 struct pcpu_freelist_node *next;
292 };
293
294 +/* pcpu_freelist_* do spin_lock_irqsave. */
295 void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
296 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
297 +/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
298 +void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
299 +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
300 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
301 u32 nr_elems);
302 int pcpu_freelist_init(struct pcpu_freelist *);
303 --
304 2.19.1
305