git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
x86/mm/tlb: Update mm_cpumask lazily
author Rik van Riel <riel@surriel.com>
Thu, 14 Nov 2024 15:26:16 +0000 (10:26 -0500)
committer Ingo Molnar <mingo@kernel.org>
Tue, 19 Nov 2024 11:02:46 +0000 (12:02 +0100)
On busy multi-threaded workloads, there can be significant contention
on the mm_cpumask at context switch time.

Reduce that contention by updating mm_cpumask lazily, setting the CPU bit
at context switch time (if not already set), and clearing the CPU bit at
the first TLB flush sent to a CPU where the process isn't running.

When a flurry of TLB flushes for a process happens, only the first one
will be sent to CPUs where the process isn't running. The others will
be sent to CPUs where the process is currently running.

On an AMD Milan system with 36 cores, there is a noticeable difference:
$ hackbench --groups 20 --loops 10000

  Before: ~4.5s +/- 0.1s
  After:  ~4.2s +/- 0.1s

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20241114152723.1294686-2-riel@surriel.com
arch/x86/kernel/alternative.c
arch/x86/mm/tlb.c

index d17518ca19b8b82a94678569ceb0f5871ecd07b4..8b66a555d2f03540ec2861c9eacd55f900145950 100644 (file)
@@ -1825,11 +1825,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
        return temp_state;
 }
 
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 {
        lockdep_assert_irqs_disabled();
+
        switch_mm_irqs_off(NULL, prev_state.mm, current);
 
+       /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
+       cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
+
        /*
         * Restore the breakpoints if they were disabled before the temporary mm
         * was loaded.
@@ -1838,9 +1845,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
                hw_breakpoint_restore();
 }
 
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
 static void text_poke_memcpy(void *dst, const void *src, size_t len)
 {
        memcpy(dst, src, len);
index b0d5a644fc84dc2c405d42bb102fd20873441744..cc4e57ae690f5cff6536a9e31cddf60ff646f120 100644 (file)
@@ -606,18 +606,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
                cond_mitigation(tsk);
 
                /*
-                * Stop remote flushes for the previous mm.
-                * Skip kernel threads; we never send init_mm TLB flushing IPIs,
-                * but the bitmap manipulation can cause cache line contention.
+                * Leave this CPU in prev's mm_cpumask. Atomic writes to
+                * mm_cpumask can be expensive under contention. The CPU
+                * will be removed lazily at TLB flush time.
                 */
-               if (prev != &init_mm) {
-                       VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
-                                               mm_cpumask(prev)));
-                       cpumask_clear_cpu(cpu, mm_cpumask(prev));
-               }
+               VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
+                               mm_cpumask(prev)));
 
                /* Start receiving IPIs and then read tlb_gen (and LAM below) */
-               if (next != &init_mm)
+               if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
                        cpumask_set_cpu(cpu, mm_cpumask(next));
                next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
@@ -761,8 +758,10 @@ static void flush_tlb_func(void *info)
                count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 
                /* Can only happen on remote CPUs */
-               if (f->mm && f->mm != loaded_mm)
+               if (f->mm && f->mm != loaded_mm) {
+                       cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
                        return;
+               }
        }
 
        if (unlikely(loaded_mm == &init_mm))