1 From 9e9e085effe9b7e342138fde3cf8577d22509932 Mon Sep 17 00:00:00 2001
2 From: Adrian Huang <ahuang12@lenovo.com>
3 Date: Sat, 27 Jul 2024 00:52:46 +0800
4 Subject: mm/vmalloc: combine all TLB flush operations of KASAN shadow virtual address into one operation
6 From: Adrian Huang <ahuang12@lenovo.com>
8 commit 9e9e085effe9b7e342138fde3cf8577d22509932 upstream.
10 When compiling kernel source 'make -j $(nproc)' with the up-and-running
11 KASAN-enabled kernel on a 256-core machine, the following soft lockup is triggered:
14 watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
15 CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
16 Workqueue: events drain_vmap_area_work
17 RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
18 Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75
19 RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202
20 RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949
21 RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50
22 RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800
23 R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39
24 R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003
25 FS: 0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000
26 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
27 CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0
30 ? watchdog_timer_fn+0x2cd/0x390
31 ? __pfx_watchdog_timer_fn+0x10/0x10
32 ? __hrtimer_run_queues+0x300/0x6d0
33 ? sched_clock_cpu+0x69/0x4e0
34 ? __pfx___hrtimer_run_queues+0x10/0x10
35 ? srso_return_thunk+0x5/0x5f
36 ? ktime_get_update_offsets_now+0x7f/0x2a0
37 ? srso_return_thunk+0x5/0x5f
38 ? srso_return_thunk+0x5/0x5f
39 ? hrtimer_interrupt+0x2ca/0x760
40 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0
41 ? sysvec_apic_timer_interrupt+0x6a/0x90
44 ? asm_sysvec_apic_timer_interrupt+0x16/0x20
45 ? smp_call_function_many_cond+0x1d8/0xbb0
46 ? __pfx_do_kernel_range_flush+0x10/0x10
47 on_each_cpu_cond_mask+0x20/0x40
48 flush_tlb_kernel_range+0x19b/0x250
49 ? srso_return_thunk+0x5/0x5f
50 ? kasan_release_vmalloc+0xa7/0xc0
51 purge_vmap_node+0x357/0x820
52 ? __pfx_purge_vmap_node+0x10/0x10
53 __purge_vmap_area_lazy+0x5b8/0xa10
54 drain_vmap_area_work+0x21/0x30
55 process_one_work+0x661/0x10b0
56 worker_thread+0x844/0x10e0
57 ? srso_return_thunk+0x5/0x5f
58 ? __kthread_parkme+0x82/0x140
59 ? __pfx_worker_thread+0x10/0x10
61 ? __pfx_kthread+0x10/0x10
62 ret_from_fork+0x30/0x70
63 ? __pfx_kthread+0x10/0x10
64 ret_from_fork_asm+0x1a/0x30
69 1. The following ftrace log shows that the lockup CPU spends too much
70 time iterating vmap_nodes and flushing TLB when purging vm_area
71 structures. (Some info is trimmed).
73 kworker: funcgraph_entry: | drain_vmap_area_work() {
74 kworker: funcgraph_entry: | mutex_lock() {
75 kworker: funcgraph_entry: 1.092 us | __cond_resched();
76 kworker: funcgraph_exit: 3.306 us | }
78 kworker: funcgraph_entry: | flush_tlb_kernel_range() {
80 kworker: funcgraph_exit: # 7533.649 us | }
82 kworker: funcgraph_entry: 2.344 us | mutex_unlock();
83 kworker: funcgraph_exit: $ 23871554 us | }
85 The drain_vmap_area_work() spends over 23 seconds.
87 There are 2805 flush_tlb_kernel_range() calls in the ftrace log.
88 * One is called in __purge_vmap_area_lazy().
89 * Others are called by purge_vmap_node->kasan_release_vmalloc.
90 purge_vmap_node() iteratively releases kasan vmalloc
91 allocations and flushes TLB for each vmap_area.
92 - [Rough calculation] Each flush_tlb_kernel_range() runs
94 -- 2804 * 7.5ms = 21.03 seconds.
95 -- That's why a soft lockup is triggered.
97 2. Extending the soft lockup time can work around the issue (For example,
98 # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the
99 above-mentioned speculation: drain_vmap_area_work() spends too much time.
102 If we combine all TLB flush operations of the KASAN shadow virtual
103 address into one operation in the call path
104 'purge_vmap_node()->kasan_release_vmalloc()', the running time of
105 drain_vmap_area_work() can be saved greatly. The idea is from the
106 flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the
107 soft lockup won't be triggered.
109 Here is the test result based on 6.10:
112 1. ftrace latency profiling (record a trace if the latency > 20s).
113 echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
114 echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
115 echo function_graph > /sys/kernel/debug/tracing/current_tracer
116 echo 1 > /sys/kernel/debug/tracing/tracing_on
118 2. Run `make -j $(nproc)` to compile the kernel source
120 3. Once the soft lockup is reproduced, check the ftrace log:
121 cat /sys/kernel/debug/tracing/trace
122 # tracer: function_graph
124 # CPU DURATION FUNCTION CALLS
126 76) $ 50412985 us | } /* __purge_vmap_area_lazy */
127 76) $ 50412997 us | } /* drain_vmap_area_work */
128 76) $ 29165911 us | } /* __purge_vmap_area_lazy */
129 76) $ 29165926 us | } /* drain_vmap_area_work */
130 91) $ 53629423 us | } /* __purge_vmap_area_lazy */
131 91) $ 53629434 us | } /* drain_vmap_area_work */
132 91) $ 28121014 us | } /* __purge_vmap_area_lazy */
133 91) $ 28121026 us | } /* drain_vmap_area_work */
136 1. Repeat step 1-2 in "[6.10 wo/ the patch]"
138 2. The soft lockup is not triggered and ftrace log is empty.
139 cat /sys/kernel/debug/tracing/trace
140 # tracer: function_graph
142 # CPU DURATION FUNCTION CALLS
145 3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log.
148 4. Setting 'tracing_thresh' to 1 second gets ftrace log.
149 cat /sys/kernel/debug/tracing/trace
150 # tracer: function_graph
152 # CPU DURATION FUNCTION CALLS
154 23) $ 1074942 us | } /* __purge_vmap_area_lazy */
155 23) $ 1074950 us | } /* drain_vmap_area_work */
157 The worst execution time of drain_vmap_area_work() is about 1 second.
159 Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/
160 Link: https://lkml.kernel.org/r/20240726165246.31326-1-ahuang12@lenovo.com
161 Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree")
162 Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
163 Co-developed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
164 Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
165 Tested-by: Jiwei Sun <sunjw10@lenovo.com>
166 Reviewed-by: Baoquan He <bhe@redhat.com>
167 Cc: Alexander Potapenko <glider@google.com>
168 Cc: Andrey Konovalov <andreyknvl@gmail.com>
169 Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
170 Cc: Christoph Hellwig <hch@infradead.org>
171 Cc: Dmitry Vyukov <dvyukov@google.com>
172 Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
173 Cc: <stable@vger.kernel.org>
174 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
175 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
177 include/linux/kasan.h | 12 +++++++++---
178 mm/kasan/shadow.c | 14 ++++++++++----
179 mm/vmalloc.c | 34 ++++++++++++++++++++++++++--------
180 3 files changed, 45 insertions(+), 15 deletions(-)
182 diff --git a/include/linux/kasan.h b/include/linux/kasan.h
183 index 00a3bf7c0d8f..6bbfc8aa42e8 100644
184 --- a/include/linux/kasan.h
185 +++ b/include/linux/kasan.h
186 @@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
187 #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u)
188 #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u)
190 +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
191 +#define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */
193 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
195 #include <linux/pgtable.h>
196 @@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
197 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
198 void kasan_release_vmalloc(unsigned long start, unsigned long end,
199 unsigned long free_region_start,
200 - unsigned long free_region_end);
201 + unsigned long free_region_end,
202 + unsigned long flags);
204 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
206 @@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
207 static inline void kasan_release_vmalloc(unsigned long start,
209 unsigned long free_region_start,
210 - unsigned long free_region_end) { }
211 + unsigned long free_region_end,
212 + unsigned long flags) { }
214 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
216 @@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
217 static inline void kasan_release_vmalloc(unsigned long start,
219 unsigned long free_region_start,
220 - unsigned long free_region_end) { }
221 + unsigned long free_region_end,
222 + unsigned long flags) { }
224 static inline void *kasan_unpoison_vmalloc(const void *start,
226 diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
227 index d6210ca48dda..88d1c9dcb507 100644
228 --- a/mm/kasan/shadow.c
229 +++ b/mm/kasan/shadow.c
230 @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
232 void kasan_release_vmalloc(unsigned long start, unsigned long end,
233 unsigned long free_region_start,
234 - unsigned long free_region_end)
235 + unsigned long free_region_end,
236 + unsigned long flags)
238 void *shadow_start, *shadow_end;
239 unsigned long region_start, region_end;
240 @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
241 __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
244 - apply_to_existing_page_range(&init_mm,
247 + if (flags & KASAN_VMALLOC_PAGE_RANGE)
248 + apply_to_existing_page_range(&init_mm,
249 (unsigned long)shadow_start,
250 size, kasan_depopulate_vmalloc_pte,
252 - flush_tlb_kernel_range((unsigned long)shadow_start,
253 - (unsigned long)shadow_end);
255 + if (flags & KASAN_VMALLOC_TLB_FLUSH)
256 + flush_tlb_kernel_range((unsigned long)shadow_start,
257 + (unsigned long)shadow_end);
261 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
262 index 634162271c00..5480b77f4167 100644
265 @@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
266 reclaim_list_global(&decay_list);
270 +kasan_release_vmalloc_node(struct vmap_node *vn)
272 + struct vmap_area *va;
273 + unsigned long start, end;
275 + start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
276 + end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
278 + list_for_each_entry(va, &vn->purge_list, list) {
279 + if (is_vmalloc_or_module_addr((void *) va->va_start))
280 + kasan_release_vmalloc(va->va_start, va->va_end,
281 + va->va_start, va->va_end,
282 + KASAN_VMALLOC_PAGE_RANGE);
285 + kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
288 static void purge_vmap_node(struct work_struct *work)
290 struct vmap_node *vn = container_of(work,
291 @@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work)
292 struct vmap_area *va, *n_va;
293 LIST_HEAD(local_list);
295 + if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
296 + kasan_release_vmalloc_node(vn);
300 list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
301 unsigned long nr = va_size(va) >> PAGE_SHIFT;
302 - unsigned long orig_start = va->va_start;
303 - unsigned long orig_end = va->va_end;
304 unsigned int vn_id = decode_vn_id(va->flags);
306 list_del_init(&va->list);
308 - if (is_vmalloc_or_module_addr((void *)orig_start))
309 - kasan_release_vmalloc(orig_start, orig_end,
310 - va->va_start, va->va_end);
312 nr_purged_pages += nr;
315 @@ -4784,7 +4800,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
316 &free_vmap_area_list);
318 kasan_release_vmalloc(orig_start, orig_end,
319 - va->va_start, va->va_end);
320 + va->va_start, va->va_end,
321 + KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
325 @@ -4834,7 +4851,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
326 &free_vmap_area_list);
328 kasan_release_vmalloc(orig_start, orig_end,
329 - va->va_start, va->va_end);
330 + va->va_start, va->va_end,
331 + KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);