]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
s390/percpu: Infrastructure for more efficient this_cpu operations
authorHeiko Carstens <hca@linux.ibm.com>
Tue, 26 May 2026 05:56:56 +0000 (07:56 +0200)
committerAlexander Gordeev <agordeev@linux.ibm.com>
Wed, 3 Jun 2026 13:32:46 +0000 (15:32 +0200)
With the intended removal of PREEMPT_NONE this_cpu operations based on
atomic instructions, guarded with preempt_disable()/preempt_enable() pairs
become more expensive: the preempt_disable() / preempt_enable() pairs are
not optimized away anymore during compile time.

In particular the conditional call to preempt_schedule_notrace() after
preempt_enable() adds additional code and register pressure.

E.g. this simple C code sequence

DEFINE_PER_CPU(long, foo);
long bar(long a) { return this_cpu_add_return(foo, a); }

generates this code:

  11a976:       eb af f0 68 00 24       stmg    %r10,%r15,104(%r15)
  11a97c:       b9 04 00 ef             lgr     %r14,%r15
  11a980:       b9 04 00 b2             lgr     %r11,%r2
  11a984:       e3 f0 ff c8 ff 71       lay     %r15,-56(%r15)
  11a98a:       e3 e0 f0 98 00 24       stg     %r14,152(%r15)
  11a990:       eb 01 03 a8 00 6a       asi     936,1            <- __preempt_count_add(1)
  11a996:       c0 10 00 d2 ac b5       larl    %r1,1b70300      <- address of percpu var
  11a9a0:       e3 10 23 b8 00 08       ag      %r1,952          <- add percpu offset
  11a9a6:       eb ab 10 00 00 e8       laag    %r10,%r11,0(%r1) <- atomic op
  11a9ac:       eb ff 03 a8 00 6e       alsi    936,-1           <- __preempt_count_dec_and_test()
  11a9b2:       a7 54 00 05             jnhe    11a9bc <bar+0x4c>
  11a9b6:       c0 e5 00 76 d1 bd       brasl   %r14,ff4d30 <preempt_schedule_notrace>
  11a9bc:       b9 e8 b0 2a             agrk    %r2,%r10,%r11
  11a9c0:       eb af f0 a0 00 04       lmg     %r10,%r15,160(%r15)
  11a9c6        07 fe                   br      %r14

Even though the above example is more or less the worst case, since the
branch to preempt_schedule_notrace() requires a stackframe, which
otherwise wouldn't be necessary, there is also the conditional jnhe branch
instruction.

Get rid of the conditional branch with the following code sequence:

  11a8e6:       c0 30 00 d0 c5 0d       larl    %r3,1b33300
  11a8ec:       b9 04 00 43             lgr     %r4,%r3
  11a8f0:       eb 00 43 c0 00 52       mviy    960,4
  11a8f6:       e3 40 03 b8 00 08       ag      %r4,952
  11a8fc:       eb 52 40 00 00 e8       laag    %r5,%r2,0(%r4)
  11a902:       eb 00 03 c0 00 52       mviy    960,0
  11a908:       b9 08 00 25             agr     %r2,%r5
  11a90c        07 fe                   br      %r14

The general idea is that this_cpu operations based on atomic instructions
are guarded with mviy instructions:

- The first mviy instruction writes the register number, which contains
  the percpu address variable to lowcore. This also indicates that a
  percpu code section is executed.

- The first instruction following the mviy instruction must be the ag
  instruction which adds the percpu offset to the percpu address register.

- Afterwards the atomic percpu operation follows.

- Then a second mviy instruction writes a zero to lowcore, which indicates
  the end of the percpu code section.

- In case of an interrupt/exception/nmi the register number which was
  written to lowcore is copied to the exception frame (pt_regs), and a zero
  is written to lowcore.

- On return to the previous context it is checked if a percpu code section
  was executed (saved register number not zero), and if the process was
  migrated to a different cpu. If the percpu offset was already added to
  the percpu address register (instruction address does _not_ point to the
  ag instruction) the content of the percpu address register is adjusted so
  it points to percpu variable of the new cpu.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
arch/s390/include/asm/entry-percpu.h [new file with mode: 0644]
arch/s390/include/asm/lowcore.h
arch/s390/include/asm/percpu.h
arch/s390/include/asm/ptrace.h
arch/s390/kernel/irq.c
arch/s390/kernel/nmi.c
arch/s390/kernel/traps.c

diff --git a/arch/s390/include/asm/entry-percpu.h b/arch/s390/include/asm/entry-percpu.h
new file mode 100644 (file)
index 0000000..4ef4726
--- /dev/null
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_S390_ENTRY_PERCPU_H
+#define ARCH_S390_ENTRY_PERCPU_H
+
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <asm/lowcore.h>
+#include <asm/ptrace.h>
+#include <asm/asm-offsets.h>
+
+static __always_inline void percpu_entry(struct pt_regs *regs)
+{
+       struct lowcore *lc = get_lowcore();
+
+       if (user_mode(regs))
+               return;
+       regs->cpu = lc->cpu_nr;
+       regs->percpu_register = lc->percpu_register;
+       lc->percpu_register = 0;
+}
+
+static __always_inline bool percpu_code_check(struct pt_regs *regs)
+{
+       unsigned int insn, disp;
+       struct kprobe *p;
+
+       if (likely(user_mode(regs) || !regs->percpu_register))
+               return false;
+       /*
+        * Within a percpu code section - check if the percpu base register
+        * needs to be updated. This is the case if the PSW does not point to
+        * the ADD instruction within the section.
+        * - AG %rx,percpu_offset_in_lowcore(%r0,%r0)
+        * which adds the percpu offset to the percpu base register.
+        */
+       lockdep_assert_preemption_disabled();
+again:
+       insn = READ_ONCE(*(u16 *)psw_bits(regs->psw).ia);
+       if (unlikely(insn == BREAKPOINT_INSTRUCTION)) {
+               p = get_kprobe((void *)psw_bits(regs->psw).ia);
+               /*
+                * If the kprobe is concurrently removed on a different CPU
+                * it might not be found anymore. However text must have
+                * been restored - try again.
+                */
+               if (!p)
+                       goto again;
+               insn = p->opcode;
+       }
+       if ((insn & 0xff0f) != 0xe300)
+               return true;
+       disp = offsetof(struct lowcore, percpu_offset);
+       if (machine_has_relocated_lowcore())
+               disp += LOWCORE_ALT_ADDRESS;
+       insn = (disp & 0xff000) >> 4 | (disp & 0x00fff) << 16 | 0x8;
+       if (*(u32 *)(psw_bits(regs->psw).ia + 2) != insn)
+               return true;
+       return false;
+}
+
+static __always_inline void percpu_exit(struct pt_regs *regs, bool needs_fixup)
+{
+       struct lowcore *lc = get_lowcore();
+       unsigned char reg;
+
+       if (user_mode(regs))
+               return;
+       reg = regs->percpu_register;
+       lc->percpu_register = reg;
+       if (likely(!needs_fixup))
+               return;
+       /* Check if process has been migrated to a different CPU. */
+       if (regs->cpu == lc->cpu_nr)
+               return;
+       /* Fixup percpu base register */
+       regs->gprs[reg] -= __per_cpu_offset[regs->cpu];
+       regs->gprs[reg] += lc->percpu_offset;
+}
+
+#endif
index 50ffe75adeb477947357b90d22a809d703e76cbb..cd1ddfdb5d35d5a0e4af21a843c70749ab0c75ed 100644 (file)
@@ -165,7 +165,8 @@ struct lowcore {
        __u32   spinlock_index;                 /* 0x03b0 */
        __u8    pad_0x03b4[0x03b8-0x03b4];      /* 0x03b4 */
        __u64   percpu_offset;                  /* 0x03b8 */
-       __u8    pad_0x03c0[0x0400-0x03c0];      /* 0x03c0 */
+       __u8    percpu_register;                /* 0x03c0 */
+       __u8    pad_0x03c1[0x0400-0x03c1];      /* 0x03c1 */
 
        __u32   return_lpswe;                   /* 0x0400 */
        __u32   return_mcck_lpswe;              /* 0x0404 */
index b18a96f3a3345b8cd500f308408bad0fa98b68fa..78602d2f5ebaad145e4528957e5287e312d78ca6 100644 (file)
 #define this_cpu_or_1(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
 #define this_cpu_or_2(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
 
+/*
+ * Macros to be used for percpu code section based on atomic instructions.
+ *
+ * Avoid the need to use preempt_disable() / preempt_disable() pairs and the
+ * conditional preempt_schedule_notrace() function calls which come with
+ * this. The idea is that this_cpu operations based on atomic instructions are
+ * guarded with mviy instructions:
+ *
+ * - The first mviy instruction writes the register number, which contains the
+ *   percpu address variable to lowcore. This also indicates that a percpu
+ *   code section is executed.
+ *
+ * - The first mviy instruction following the mviy instruction must be the ag
+ *   instruction which adds the percpu offset to the percpu address register.
+ *
+ * - Afterwards the atomic percpu operation follows.
+ *
+ * - Then a second mviy instruction writes a zero to lowcore, which indicates
+ *   the end of the percpu code section.
+ *
+ * - In case of an interrupt/exception/nmi the register number which was
+ *   written to lowcore is copied to the exception frame (pt_regs), and a zero
+ *   is written to lowcore.
+ *
+ * - On return to the previous context it is checked if a percpu code section
+ *   was executed (saved register number not zero), and if the process was
+ *   migrated to a different cpu. If the percpu offset was already added to
+ *   the percpu address register (instruction address does _not_ point to the
+ *   ag instruction) the content of the percpu address register is adjusted so
+ *   it points to percpu variable of the new cpu.
+ *
+ * Inline assemblies making use of this typically have a code sequence like:
+ *
+ *   MVIY_PERCPU(...) <- start of percpu code section
+ *   AG_ALT(...)      <- add percpu offset; must be the second instruction
+ *   atomic_op       <- atomic op
+ *   MVIY_ALT(...)    <- end of percpu code section
+ */
+
+#define MVIY_PERCPU(disp, dispalt, reg)                                                \
+       ".macro GEN_MVIY disp reg\n"                                            \
+       ".irp   rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15\n"                     \
+       "       .ifc \\reg,%%r\\rs\n"                                           \
+       "       mviy    \\disp(%%r0),\\rs\n"                                    \
+       "       .endif\n"                                                       \
+       ".endr\n"                                                               \
+       ".endm\n"                                                               \
+       ALTERNATIVE("GEN_MVIY " __stringify(disp)    " " __stringify(reg) "\n", \
+                   "GEN_MVIY " __stringify(dispalt) " " __stringify(reg) "\n", \
+                   ALT_FEATURE(MFEATURE_LOWCORE))                              \
+       ".purgem GEN_MVIY\n"
+
+#define MVIY_ALT(disp, dispalt)                                                        \
+       ALTERNATIVE("   mviy    " disp    "(%%r0),0\n",                         \
+                   "   mviy    " dispalt "(%%r0),0\n",                         \
+                   ALT_FEATURE(MFEATURE_LOWCORE))
+
+#define AG_ALT(disp, dispalt, reg)                                             \
+       ALTERNATIVE("   ag      " reg ", " disp    "(%%r0)\n",                  \
+                   "   ag      " reg ", " dispalt "(%%r0)\n",                  \
+                   ALT_FEATURE(MFEATURE_LOWCORE))
+
 #ifndef MARCH_HAS_Z196_FEATURES
 
 #define this_cpu_add_4(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, +)
index aaceb1d9110a4dd16d9bec9231a6912044855146..495e310c3d6d714eff3236563538e9be951bd049 100644 (file)
@@ -134,6 +134,8 @@ struct pt_regs {
        };
        unsigned long flags;
        unsigned long last_break;
+       unsigned int cpu;
+       unsigned char percpu_register;
 };
 
 /*
index d10a17e6531dae0d42fac6cd7ffef77f6cb2b131..efb988833c88d1f2bad6975a3d42937b08414f4e 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/softirq_stack.h>
 #include <asm/vtime.h>
 #include <asm/asm.h>
+#include <asm/entry-percpu.h>
 #include "entry.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -142,10 +143,13 @@ static int irq_pending(struct pt_regs *regs)
 
 void noinstr do_io_irq(struct pt_regs *regs)
 {
-       irqentry_state_t state = irqentry_enter(regs);
-       struct pt_regs *old_regs = set_irq_regs(regs);
-       bool from_idle;
+       bool from_idle, percpu_needs_fixup;
+       struct pt_regs *old_regs;
+       irqentry_state_t state;
 
+       percpu_entry(regs);
+       state = irqentry_enter(regs);
+       old_regs = set_irq_regs(regs);
        from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
        if (from_idle)
                update_timer_idle();
@@ -170,21 +174,25 @@ void noinstr do_io_irq(struct pt_regs *regs)
                        do_irq_async(regs, IO_INTERRUPT);
        } while (machine_is_lpar() && irq_pending(regs));
 
+       percpu_needs_fixup = percpu_code_check(regs);
        irq_exit_rcu();
-
        set_irq_regs(old_regs);
        irqentry_exit(regs, state);
 
        if (from_idle)
                regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+       percpu_exit(regs, percpu_needs_fixup);
 }
 
 void noinstr do_ext_irq(struct pt_regs *regs)
 {
-       irqentry_state_t state = irqentry_enter(regs);
-       struct pt_regs *old_regs = set_irq_regs(regs);
-       bool from_idle;
+       bool from_idle, percpu_needs_fixup;
+       struct pt_regs *old_regs;
+       irqentry_state_t state;
 
+       percpu_entry(regs);
+       state = irqentry_enter(regs);
+       old_regs = set_irq_regs(regs);
        from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
        if (from_idle)
                update_timer_idle();
@@ -206,12 +214,14 @@ void noinstr do_ext_irq(struct pt_regs *regs)
 
        do_irq_async(regs, EXT_INTERRUPT);
 
+       percpu_needs_fixup = percpu_code_check(regs);
        irq_exit_rcu();
        set_irq_regs(old_regs);
        irqentry_exit(regs, state);
 
        if (from_idle)
                regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+       percpu_exit(regs, percpu_needs_fixup);
 }
 
 static void show_msi_interrupt(struct seq_file *p, int irq)
index 94fbfad49f62051a901ea719d4ba511befd6de69..e17a59d4d5a41c3a24ba6658381a9e0fb6a0683f 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/sched/signal.h>
 #include <linux/kvm_host.h>
+#include <asm/entry-percpu.h>
 #include <asm/lowcore.h>
 #include <asm/ctlreg.h>
 #include <asm/fpu.h>
@@ -363,6 +364,7 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info);
  */
 void notrace s390_do_machine_check(struct pt_regs *regs)
 {
+       bool percpu_needs_fixup;
        static int ipd_count;
        static DEFINE_SPINLOCK(ipd_lock);
        static unsigned long long last_ipd;
@@ -374,6 +376,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
        unsigned long mcck_dam_code;
        int mcck_pending = 0;
 
+       percpu_entry(regs);
        irq_state = irqentry_nmi_enter(regs);
 
        if (user_mode(regs))
@@ -495,7 +498,9 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
        if (mcck_pending)
                schedule_mcck_handler();
 
+       percpu_needs_fixup = percpu_code_check(regs);
        irqentry_nmi_exit(regs, irq_state);
+       percpu_exit(regs, percpu_needs_fixup);
 }
 NOKPROBE_SYMBOL(s390_do_machine_check);
 
index 1b5c6fc431cc1982028d5f312f4678cc04d42f0c..564403496a7ced12f8273a9119446386cb4c93f9 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/entry-common.h>
 #include <linux/kmsan.h>
 #include <linux/bug.h>
+#include <asm/entry-percpu.h>
 #include <asm/asm-extable.h>
 #include <asm/irqflags.h>
 #include <asm/ptrace.h>
@@ -329,6 +330,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
        struct lowcore *lc = get_lowcore();
+       bool percpu_needs_fixup;
        irqentry_state_t state;
        unsigned int trapnr;
        union teid teid;
@@ -349,6 +351,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
                current->thread.gmap_int_code = regs->int_code & 0xffff;
                return;
        }
+       percpu_entry(regs);
        state = irqentry_enter(regs);
        if (user_mode(regs)) {
                update_timer_sys();
@@ -385,7 +388,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
                pgm_check_table[trapnr](regs);
 out:
        local_irq_disable();
+       percpu_needs_fixup = percpu_code_check(regs);
        irqentry_exit(regs, state);
+       percpu_exit(regs, percpu_needs_fixup);
 }
 
 /*