From: Juergen Christ Date: Mon, 11 Aug 2025 15:22:53 +0000 (+0200) Subject: s390/bitops: Optimize inlining X-Git-Tag: v6.18-rc1~207^2~17 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=669bc57e7016cf9d1a9eedb2a984c4fb4fd67f3d;p=thirdparty%2Fkernel%2Fstable.git s390/bitops: Optimize inlining GCC inlining heuristics prevent code growth due to inlining into cold paths. This causes GCC to emit a partially specialized version of __flogr for non-constant input for all occurrences on cold paths. This happens since the overhead seen during inlining includes setting up a union register_pair, calling flogr, and extracting and casting the result. This overhead is not removed until the function is lowered into RTL. But this happens after inlining. For -ftrivial-var-auto-init=zero builds, an additional initialization of the union register_pair adds another statement to be inlinined. This is unneeded since the even register is initialized anyway and the odd register is not an input register. It is only marked as such since the whole pair has to be marked as a read/write output register. Mark the union register_pair as uninitialized to get rid of this statement. This, however, does not change the code since the initialization happens when part of the register pair is written. Nevertheless, GCC function size approximation during inlining is reduced by one statement. Force inlining of flogr and also flatten some other functions that should be leaf functions but are called in cold context, like, e.g., __init functions. Acked-by: Heiko Carstens Signed-off-by: Juergen Christ Signed-off-by: Alexander Gordeev --- diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 10d7573d1582c..9dfb687ba6200 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -130,7 +130,7 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. */ -static inline unsigned char __flogr(unsigned long word) +static __always_inline unsigned char __flogr(unsigned long word) { if (__builtin_constant_p(word)) { unsigned long bit = 0; @@ -163,7 +163,7 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - union register_pair rp; + union register_pair rp __uninitialized; rp.even = word; asm volatile( @@ -179,7 +179,7 @@ static inline unsigned char __flogr(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline __flatten unsigned long __ffs(unsigned long word) { return __flogr(-word & word) ^ (BITS_PER_LONG - 1); } @@ -191,7 +191,7 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static __always_inline __flatten int ffs(int word) { unsigned int val = (unsigned int)word; @@ -204,7 +204,7 @@ static inline int ffs(int word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline __flatten unsigned long __fls(unsigned long word) { return __flogr(word) ^ (BITS_PER_LONG - 1); } @@ -220,7 +220,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ -static inline int fls64(unsigned long word) +static __always_inline __flatten int fls64(unsigned long word) { return BITS_PER_LONG - __flogr(word); } @@ -232,7 +232,7 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int word) +static __always_inline __flatten int fls(unsigned int word) { return fls64(word); }