#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway.)
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
void clear_asid_other(void)
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

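/*
 * Pick an ASID slot for @next.  If this CPU already has a slot whose
 * ctx_id matches @next, reuse it and only request a flush when that
 * slot's tlb_gen is behind @next_tlb_gen.  Otherwise evict a slot
 * round-robin via cpu_tlbstate.next_asid and request a full flush.
 */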
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

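/*
 * Point CR3 at @pgdir using @new_asid.  With @need_flush set, the matching
 * user ASID is invalidated and the CR3 write flushes the TLB for this
 * ASID; otherwise the no-flush variant of CR3 is written so the existing
 * TLB entries are preserved.
 */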
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
	unsigned long new_mm_cr3;

	if (need_flush) {
		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

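/*
 * Make sure the (potentially vmalloc'ed) stack we are currently running
 * on is mapped in @mm by copying the top-level page-table entry covering
 * the stack pointer from the kernel page tables.
 */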
static void sync_current_stack_to_mm(struct mm_struct *mm)
{
	unsigned long sp = current_stack_pointer;
	pgd_t *pgd = pgd_offset(mm, sp);

	if (CONFIG_PGTABLE_LEVELS > 4) {
		if (unlikely(pgd_none(*pgd))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);

			set_pgd(pgd, *pgd_ref);
		}
	} else {
		/*
		 * "pgd" is faked.  The top level entries are "p4d"s, so sync
		 * the p4d.  This compiles to approximately the same code as
		 * the 5-level case.
		 */
		p4d_t *p4d = p4d_offset(pgd, sp);

		if (unlikely(p4d_none(*p4d))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);
			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);

			set_p4d(p4d, *p4d_ref);
		}
	}
}

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier
	 * before returning to user-space, after storing to rq->curr.
	 * Writing to CR3 provides that full memory barrier.
	 */
	if (real_prev == next) {
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		return;
	} else {
		u16 new_asid;
		bool need_flush;

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			sync_current_stack_to_mm(next);
		}

		/* Stop remote flushes for the previous mm */
		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
				real_prev != &init_mm);
		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			load_new_mm_cr3(next->pgd, new_asid, true);

			/*
			 * NB: This gets called via leave_mm() in the idle path
			 * where RCU functions differently.  Tracing normally
			 * uses RCU, so we need to use the _rcuidle variant.
			 *
			 * (There is no good reason for this.  The idle code should
			 *  be rearranged to call this before rcu_idle_enter().)
			 */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			load_new_mm_cr3(next->pgd, new_asid, false);

			/* See above wrt _rcuidle. */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	if (tlb_defer_switch_to_init_mm()) {
		/*
		 * There's a significant optimization that may be possible
		 * here.  We have accurate enough TLB flush tracking that we
		 * don't need to maintain coherence of TLB per se when we're
		 * lazy.  We do, however, need to maintain coherence of
		 * paging-structure caches.  We could, in principle, leave our
		 * old mm loaded and only switch to init_mm when
		 * tlb_remove_page() happens.
		 */
		this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		switch_mm(NULL, &init_mm, NULL);
	}
}

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear; CPU hotplug can do this).
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm->pgd, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (unlikely(loaded_mm == &init_mm))
		return;

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_single(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

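/*
 * Thin wrappers around flush_tlb_func_common(): the _local variant runs
 * on the CPU that requested the flush, while the _remote variant is the
 * SMP callback executed on other CPUs via native_flush_tlb_others().
 */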
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

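/*
 * Flush the TLB entries of @mm for [start, end).  If @mm is loaded on the
 * local CPU, the flush runs there directly; other CPUs in mm_cpumask(mm)
 * are reached via flush_tlb_others().  Ranges of at most
 * tlb_single_page_flush_ceiling pages (and not hugetlb-backed) are
 * flushed page by page; anything larger becomes a full flush.
 */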
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* This is also a barrier that synchronizes with switch_mm(). */
	info.new_tlb_gen = inc_mm_tlb_gen(mm);

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}

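/* IPI callback used by flush_tlb_all() and flush_tlb_kernel_range(). */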
static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

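/*
 * IPI callback for flush_tlb_kernel_range(): invalidate the pages in
 * [f->start, f->end) one at a time.
 */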
static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* flush the range one page at a time with 'invlpg' */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_one(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Balance like a user space task's flush; be a bit conservative */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;
		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

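/*
 * Flush everything recorded in @batch: run the flush locally if this CPU
 * is in the batch's cpumask, forward it to the remaining CPUs in the mask
 * via flush_tlb_others(), then clear the mask for reuse.
 */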
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

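/*
 * Expose tlb_single_page_flush_ceiling in the arch debugfs directory so
 * the flush ceiling can be read and tuned at runtime.
 */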
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);