From: Heiko Carstens Date: Tue, 26 May 2026 05:56:56 +0000 (+0200) Subject: s390/percpu: Infrastructure for more efficient this_cpu operations X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=a737737cdb9c94e40a9926cdc2320f874c05d709;p=thirdparty%2Fkernel%2Flinux.git s390/percpu: Infrastructure for more efficient this_cpu operations With the intended removal of PREEMPT_NONE this_cpu operations based on atomic instructions, guarded with preempt_disable()/preempt_enable() pairs become more expensive: the preempt_disable() / preempt_enable() pairs are not optimized away anymore during compile time. In particular the conditional call to preempt_schedule_notrace() after preempt_enable() adds additional code and register pressure. E.g. this simple C code sequence DEFINE_PER_CPU(long, foo); long bar(long a) { return this_cpu_add_return(foo, a); } generates this code: 11a976: eb af f0 68 00 24 stmg %r10,%r15,104(%r15) 11a97c: b9 04 00 ef lgr %r14,%r15 11a980: b9 04 00 b2 lgr %r11,%r2 11a984: e3 f0 ff c8 ff 71 lay %r15,-56(%r15) 11a98a: e3 e0 f0 98 00 24 stg %r14,152(%r15) 11a990: eb 01 03 a8 00 6a asi 936,1 <- __preempt_count_add(1) 11a996: c0 10 00 d2 ac b5 larl %r1,1b70300 <- address of percpu var 11a9a0: e3 10 23 b8 00 08 ag %r1,952 <- add percpu offset 11a9a6: eb ab 10 00 00 e8 laag %r10,%r11,0(%r1) <- atomic op 11a9ac: eb ff 03 a8 00 6e alsi 936,-1 <- __preempt_count_dec_and_test() 11a9b2: a7 54 00 05 jnhe 11a9bc 11a9b6: c0 e5 00 76 d1 bd brasl %r14,ff4d30 11a9bc: b9 e8 b0 2a agrk %r2,%r10,%r11 11a9c0: eb af f0 a0 00 04 lmg %r10,%r15,160(%r15) 11a9c6 07 fe br %r14 Even though the above example is more or less the worst case, since the branch to preempt_schedule_notrace() requires a stackframe, which otherwise wouldn't be necessary, there is also the conditional jnhe branch instruction. Get rid of the conditional branch with the following code sequence: 11a8e6: c0 30 00 d0 c5 0d larl %r3,1b33300 11a8ec: b9 04 00 43 lgr %r4,%r3 11a8f0: eb 00 43 c0 00 52 mviy 960,4 11a8f6: e3 40 03 b8 00 08 ag %r4,952 11a8fc: eb 52 40 00 00 e8 laag %r5,%r2,0(%r4) 11a902: eb 00 03 c0 00 52 mviy 960,0 11a908: b9 08 00 25 agr %r2,%r5 11a90c 07 fe br %r14 The general idea is that this_cpu operations based on atomic instructions are guarded with mviy instructions: - The first mviy instruction writes the register number, which contains the percpu address variable to lowcore. This also indicates that a percpu code section is executed. - The first instruction following the mviy instruction must be the ag instruction which adds the percpu offset to the percpu address register. - Afterwards the atomic percpu operation follows. - Then a second mviy instruction writes a zero to lowcore, which indicates the end of the percpu code section. - In case of an interrupt/exception/nmi the register number which was written to lowcore is copied to the exception frame (pt_regs), and a zero is written to lowcore. - On return to the previous context it is checked if a percpu code section was executed (saved register number not zero), and if the process was migrated to a different cpu. If the percpu offset was already added to the percpu address register (instruction address does _not_ point to the ag instruction) the content of the percpu address register is adjusted so it points to percpu variable of the new cpu. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- diff --git a/arch/s390/include/asm/entry-percpu.h b/arch/s390/include/asm/entry-percpu.h new file mode 100644 index 0000000000000..4ef47265cfce8 --- /dev/null +++ b/arch/s390/include/asm/entry-percpu.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_S390_ENTRY_PERCPU_H +#define ARCH_S390_ENTRY_PERCPU_H + +#include +#include +#include +#include +#include + +static __always_inline void percpu_entry(struct pt_regs *regs) +{ + struct lowcore *lc = get_lowcore(); + + if (user_mode(regs)) + return; + regs->cpu = lc->cpu_nr; + regs->percpu_register = lc->percpu_register; + lc->percpu_register = 0; +} + +static __always_inline bool percpu_code_check(struct pt_regs *regs) +{ + unsigned int insn, disp; + struct kprobe *p; + + if (likely(user_mode(regs) || !regs->percpu_register)) + return false; + /* + * Within a percpu code section - check if the percpu base register + * needs to be updated. This is the case if the PSW does not point to + * the ADD instruction within the section. + * - AG %rx,percpu_offset_in_lowcore(%r0,%r0) + * which adds the percpu offset to the percpu base register. + */ + lockdep_assert_preemption_disabled(); +again: + insn = READ_ONCE(*(u16 *)psw_bits(regs->psw).ia); + if (unlikely(insn == BREAKPOINT_INSTRUCTION)) { + p = get_kprobe((void *)psw_bits(regs->psw).ia); + /* + * If the kprobe is concurrently removed on a different CPU + * it might not be found anymore. However text must have + * been restored - try again. + */ + if (!p) + goto again; + insn = p->opcode; + } + if ((insn & 0xff0f) != 0xe300) + return true; + disp = offsetof(struct lowcore, percpu_offset); + if (machine_has_relocated_lowcore()) + disp += LOWCORE_ALT_ADDRESS; + insn = (disp & 0xff000) >> 4 | (disp & 0x00fff) << 16 | 0x8; + if (*(u32 *)(psw_bits(regs->psw).ia + 2) != insn) + return true; + return false; +} + +static __always_inline void percpu_exit(struct pt_regs *regs, bool needs_fixup) +{ + struct lowcore *lc = get_lowcore(); + unsigned char reg; + + if (user_mode(regs)) + return; + reg = regs->percpu_register; + lc->percpu_register = reg; + if (likely(!needs_fixup)) + return; + /* Check if process has been migrated to a different CPU. */ + if (regs->cpu == lc->cpu_nr) + return; + /* Fixup percpu base register */ + regs->gprs[reg] -= __per_cpu_offset[regs->cpu]; + regs->gprs[reg] += lc->percpu_offset; +} + +#endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 50ffe75adeb47..cd1ddfdb5d35d 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -165,7 +165,8 @@ struct lowcore { __u32 spinlock_index; /* 0x03b0 */ __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ __u64 percpu_offset; /* 0x03b8 */ - __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ + __u8 percpu_register; /* 0x03c0 */ + __u8 pad_0x03c1[0x0400-0x03c1]; /* 0x03c1 */ __u32 return_lpswe; /* 0x0400 */ __u32 return_mcck_lpswe; /* 0x0404 */ diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index b18a96f3a3345..78602d2f5ebaa 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -60,6 +60,68 @@ #define this_cpu_or_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) #define this_cpu_or_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +/* + * Macros to be used for percpu code section based on atomic instructions. + * + * Avoid the need to use preempt_disable() / preempt_disable() pairs and the + * conditional preempt_schedule_notrace() function calls which come with + * this. The idea is that this_cpu operations based on atomic instructions are + * guarded with mviy instructions: + * + * - The first mviy instruction writes the register number, which contains the + * percpu address variable to lowcore. This also indicates that a percpu + * code section is executed. + * + * - The first mviy instruction following the mviy instruction must be the ag + * instruction which adds the percpu offset to the percpu address register. + * + * - Afterwards the atomic percpu operation follows. + * + * - Then a second mviy instruction writes a zero to lowcore, which indicates + * the end of the percpu code section. + * + * - In case of an interrupt/exception/nmi the register number which was + * written to lowcore is copied to the exception frame (pt_regs), and a zero + * is written to lowcore. + * + * - On return to the previous context it is checked if a percpu code section + * was executed (saved register number not zero), and if the process was + * migrated to a different cpu. If the percpu offset was already added to + * the percpu address register (instruction address does _not_ point to the + * ag instruction) the content of the percpu address register is adjusted so + * it points to percpu variable of the new cpu. + * + * Inline assemblies making use of this typically have a code sequence like: + * + * MVIY_PERCPU(...) <- start of percpu code section + * AG_ALT(...) <- add percpu offset; must be the second instruction + * atomic_op <- atomic op + * MVIY_ALT(...) <- end of percpu code section + */ + +#define MVIY_PERCPU(disp, dispalt, reg) \ + ".macro GEN_MVIY disp reg\n" \ + ".irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15\n" \ + " .ifc \\reg,%%r\\rs\n" \ + " mviy \\disp(%%r0),\\rs\n" \ + " .endif\n" \ + ".endr\n" \ + ".endm\n" \ + ALTERNATIVE("GEN_MVIY " __stringify(disp) " " __stringify(reg) "\n", \ + "GEN_MVIY " __stringify(dispalt) " " __stringify(reg) "\n", \ + ALT_FEATURE(MFEATURE_LOWCORE)) \ + ".purgem GEN_MVIY\n" + +#define MVIY_ALT(disp, dispalt) \ + ALTERNATIVE(" mviy " disp "(%%r0),0\n", \ + " mviy " dispalt "(%%r0),0\n", \ + ALT_FEATURE(MFEATURE_LOWCORE)) + +#define AG_ALT(disp, dispalt, reg) \ + ALTERNATIVE(" ag " reg ", " disp "(%%r0)\n", \ + " ag " reg ", " dispalt "(%%r0)\n", \ + ALT_FEATURE(MFEATURE_LOWCORE)) + #ifndef MARCH_HAS_Z196_FEATURES #define this_cpu_add_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index aaceb1d9110a4..495e310c3d6d7 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -134,6 +134,8 @@ struct pt_regs { }; unsigned long flags; unsigned long last_break; + unsigned int cpu; + unsigned char percpu_register; }; /* diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index d10a17e6531da..efb988833c88d 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -142,10 +143,13 @@ static int irq_pending(struct pt_regs *regs) void noinstr do_io_irq(struct pt_regs *regs) { - irqentry_state_t state = irqentry_enter(regs); - struct pt_regs *old_regs = set_irq_regs(regs); - bool from_idle; + bool from_idle, percpu_needs_fixup; + struct pt_regs *old_regs; + irqentry_state_t state; + percpu_entry(regs); + state = irqentry_enter(regs); + old_regs = set_irq_regs(regs); from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) update_timer_idle(); @@ -170,21 +174,25 @@ void noinstr do_io_irq(struct pt_regs *regs) do_irq_async(regs, IO_INTERRUPT); } while (machine_is_lpar() && irq_pending(regs)); + percpu_needs_fixup = percpu_code_check(regs); irq_exit_rcu(); - set_irq_regs(old_regs); irqentry_exit(regs, state); if (from_idle) regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + percpu_exit(regs, percpu_needs_fixup); } void noinstr do_ext_irq(struct pt_regs *regs) { - irqentry_state_t state = irqentry_enter(regs); - struct pt_regs *old_regs = set_irq_regs(regs); - bool from_idle; + bool from_idle, percpu_needs_fixup; + struct pt_regs *old_regs; + irqentry_state_t state; + percpu_entry(regs); + state = irqentry_enter(regs); + old_regs = set_irq_regs(regs); from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) update_timer_idle(); @@ -206,12 +214,14 @@ void noinstr do_ext_irq(struct pt_regs *regs) do_irq_async(regs, EXT_INTERRUPT); + percpu_needs_fixup = percpu_code_check(regs); irq_exit_rcu(); set_irq_regs(old_regs); irqentry_exit(regs, state); if (from_idle) regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + percpu_exit(regs, percpu_needs_fixup); } static void show_msi_interrupt(struct seq_file *p, int irq) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 94fbfad49f620..e17a59d4d5a41 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -363,6 +364,7 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info); */ void notrace s390_do_machine_check(struct pt_regs *regs) { + bool percpu_needs_fixup; static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); static unsigned long long last_ipd; @@ -374,6 +376,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) unsigned long mcck_dam_code; int mcck_pending = 0; + percpu_entry(regs); irq_state = irqentry_nmi_enter(regs); if (user_mode(regs)) @@ -495,7 +498,9 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (mcck_pending) schedule_mcck_handler(); + percpu_needs_fixup = percpu_code_check(regs); irqentry_nmi_exit(regs, irq_state); + percpu_exit(regs, percpu_needs_fixup); } NOKPROBE_SYMBOL(s390_do_machine_check); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 1b5c6fc431cc1..564403496a7ce 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -329,6 +330,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { struct lowcore *lc = get_lowcore(); + bool percpu_needs_fixup; irqentry_state_t state; unsigned int trapnr; union teid teid; @@ -349,6 +351,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) current->thread.gmap_int_code = regs->int_code & 0xffff; return; } + percpu_entry(regs); state = irqentry_enter(regs); if (user_mode(regs)) { update_timer_sys(); @@ -385,7 +388,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) pgm_check_table[trapnr](regs); out: local_irq_disable(); + percpu_needs_fixup = percpu_code_check(regs); irqentry_exit(regs, state); + percpu_exit(regs, percpu_needs_fixup); } /*