x86: Remove catomic_* locking primitives
author Uros Bizjak <ubizjak@gmail.com>
Mon, 8 Sep 2025 12:38:20 +0000 (14:38 +0200)
committer H.J. Lu <hjl.tools@gmail.com>
Tue, 9 Sep 2025 14:36:02 +0000 (07:36 -0700)
Remove the obsolete catomic_* locking primitives, which don't map
to standard compiler builtins.

There are still a couple of places in the tree that use them
(malloc/arena.c and malloc/malloc.c).

x86 didn't define the __arch_c_compare_and_exchange_bool_* primitives,
so the fallback code used the __arch_c_compare_and_exchange_val_*
primitives instead.  This resulted in suboptimal code for
catomic_compare_and_exchange_bool_acq, where a superfluous CMP was
emitted after CMPXCHG (see the sketch after the listings below),
e.g. in arena_get2:

   775b8: 48 8d 4a 01           lea    0x1(%rdx),%rcx
   775bc: 48 89 d0              mov    %rdx,%rax
   775bf: 64 83 3c 25 18 00 00  cmpl   $0x0,%fs:0x18
   775c6: 00 00
   775c8: 74 01                 je     775cb <arena_get2+0x35b>
   775ca: f0 48 0f b1 0d 75 3d  lock cmpxchg %rcx,0x163d75(%rip)        # 1db348 <narenas>
   775d1: 16 00
   775d3: 48 39 c2              cmp    %rax,%rdx
   775d6: 74 7f                 je     77657 <arena_get2+0x3e7>

that now becomes:

   775b8: 48 8d 4a 01           lea    0x1(%rdx),%rcx
   775bc: 48 89 d0              mov    %rdx,%rax
   775bf: f0 48 0f b1 0d 80 3d  lock cmpxchg %rcx,0x163d80(%rip)        # 1db348 <narenas>
   775c6: 16 00
   775c8: 74 7f                 je     77649 <arena_get2+0x3d9>
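
For reference, the superfluous CMP comes from the generic fallback in
include/atomic.h, which builds the bool variant on top of the val
variant, roughly as sketched below (temporaries simplified here), so
the compiler has to compare the value returned by CMPXCHG even though
CMPXCHG already sets ZF:

   /* Rough sketch of the generic bool-via-val fallback; the exact
      temporaries in include/atomic.h differ.  */
   #define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)  \
     ({ __typeof (oldval) __old = (oldval);                            \
        catomic_compare_and_exchange_val_acq (mem, newval, __old)      \
          != __old; })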

OTOH, catomic_decrement does not fall back to the __atomic_fetch_add
builtin (adding -1) but to a CMPXCHG loop (sketched after the listings
below), so the generated code in arena_get2 regresses a bit, from
using a LOCK DECQ insn:

   77829: 64 83 3c 25 18 00 00  cmpl   $0x0,%fs:0x18
   77830: 00 00
   77832: 74 01                 je     77835 <arena_get2+0x5c5>
   77834: f0 48 ff 0d 0c 3b 16  lock decq 0x163b0c(%rip)        # 1db348 <narenas>
   7783b: 00

to a cmpxchg loop:

   7783d: 48 8b 0d 04 3b 16 00  mov    0x163b04(%rip),%rcx        # 1db348 <narenas>
   77844: 48 8d 71 ff           lea    -0x1(%rcx),%rsi
   77848: 48 89 c8              mov    %rcx,%rax
   7784b: f0 48 0f b1 35 f4 3a  lock cmpxchg %rsi,0x163af4(%rip)        # 1db348 <narenas>
   77852: 16 00
   77854: 0f 84 c9 fa ff ff     je     77323 <arena_get2+0xb3>
   7785a: eb e1                 jmp    7783d <arena_get2+0x5cd>
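
That loop comes from the generic fallback for catomic_exchange_and_add
(catomic_decrement (mem) expands to catomic_exchange_and_add (mem, -1)),
roughly as sketched below with simplified temporaries:

   /* Rough sketch of the generic CAS-loop fallback in include/atomic.h.  */
   #define catomic_exchange_and_add(mem, value)                        \
     ({ __typeof (*(mem)) __oldv;                                      \
        do                                                             \
          __oldv = *(mem);                                             \
        while (catomic_compare_and_exchange_bool_acq (mem,             \
                                                      __oldv + (value),\
                                                      __oldv));        \
        __oldv; })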

Defining catomic_exchange_and_add using __atomic_fetch_add solves the
above issue and generates optimal code:

   77809: f0 48 83 2d 36 3b 16  lock subq $0x1,0x163b36(%rip)        # 1db348 <narenas>
   77810: 00 01

Depending on the target processor, the compiler may emit either a
'LOCK ADD/SUB $1, m' or a 'LOCK INC/DEC m' instruction, due to the
partial flag register stall issue (INC/DEC do not update the carry flag).
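
For illustration, assuming the generic fallbacks in include/atomic.h,
the effect on a caller such as the narenas bookkeeping in malloc is
(a sketch, not the exact malloc code):

   /* catomic_decrement (&narenas) expands to
      catomic_exchange_and_add (&narenas, -1), which with this change is
      __atomic_fetch_add (&narenas, -1, __ATOMIC_ACQUIRE) and compiles
      to the single locked RMW instruction shown above.  */
   catomic_decrement (&narenas);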

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>
Cc: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: Collin Funk <collin.funk1@gmail.com>
Cc: H.J. Lu <hjl.tools@gmail.com>
Cc: Carlos O'Donell <carlos@redhat.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
sysdeps/x86/atomic-machine.h

index 9b5019c4f1f2957c508713dc1fdfef1f863b2982..d5b2d49031fc5f3bc6da0547e7c9a721bd9657da 100644
 #define _X86_ATOMIC_MACHINE_H 1
 
 #include <stdint.h>
-#include <tls.h>                       /* For tcbhead_t.  */
 #include <libc-pointer-arith.h>                /* For cast_to_integer.  */
 
-#define LOCK_PREFIX "lock;"
-
 #define USE_ATOMIC_COMPILER_BUILTINS   1
 
 #ifdef __x86_64__
 # define __HAVE_64B_ATOMICS            1
 # define SP_REG                                "rsp"
-# define SEG_REG                       "fs"
-# define BR_CONSTRAINT                 "q"
 #else
 /* Since the Pentium, i386 CPUs have supported 64-bit atomics, but the
    i386 psABI supplement provides only 4-byte alignment for uint64_t
@@ -39,8 +34,6 @@
    atomics on this platform.  */
 # define __HAVE_64B_ATOMICS            0
 # define SP_REG                                "esp"
-# define SEG_REG                       "gs"
-# define BR_CONSTRAINT                 "r"
 #endif
 #define ATOMIC_EXCHANGE_USES_CAS       0
 
 #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
-
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;                                                    \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
-                      "je 0f\n\t"                                            \
-                      "lock\n"                                               \
-                      "0:\tcmpxchgb %b2, %1"                                 \
-                      : "=a" (ret), "=m" (*mem)                              \
-                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;                                                    \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
-                      "je 0f\n\t"                                            \
-                      "lock\n"                                               \
-                      "0:\tcmpxchgw %w2, %1"                                 \
-                      : "=a" (ret), "=m" (*mem)                              \
-                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;                                                    \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
-                      "je 0f\n\t"                                            \
-                      "lock\n"                                               \
-                      "0:\tcmpxchgl %2, %1"                                  \
-                      : "=a" (ret), "=m" (*mem)                              \
-                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
-#ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;                                                    \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"                               \
-                      "je 0f\n\t"                                            \
-                      "lock\n"                                               \
-                      "0:\tcmpxchgq %q2, %1"                                 \
-                      : "=a" (ret), "=m" (*mem)                              \
-                      : "q" ((int64_t) cast_to_integer (newval)),            \
-                        "m" (*mem),                                          \
-                        "0" ((int64_t) cast_to_integer (oldval)),            \
-                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
-#else
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem);                                           \
-     __atomic_link_error ();                                                 \
-     ret = (newval);                                                         \
-     ret = (oldval);                                                         \
-     ret; })
-
-# define do_add_val_64_acq(pfx, mem, value) \
-  {                                                                          \
-    __typeof (value) __addval = (value);                                     \
-    __typeof (mem) __memp = (mem);                                           \
-    __typeof (*mem) __oldval = *__memp;                                              \
-    __typeof (*mem) __tmpval;                                                \
-    do                                                                       \
-      __tmpval = __oldval;                                                   \
-    while ((__oldval = pfx##_compare_and_exchange_val_64_acq                 \
-           (__memp, __oldval + __addval, __oldval)) == __tmpval);            \
-  }
-#endif
-
-
 /* Note that we need no lock prefix.  */
 #define atomic_exchange_acq(mem, newvalue) \
   ({ __typeof (*mem) result;                                                 \
        }                                                                     \
      result; })
 
-#define __arch_decrement_body(lock, pfx, mem) \
-  do {                                                                       \
-    if (sizeof (*mem) == 1)                                                  \
-      __asm __volatile (lock "decb %b0"                                              \
-                       : "=m" (*mem)                                         \
-                       : "m" (*mem),                                         \
-                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)                                             \
-      __asm __volatile (lock "decw %w0"                                              \
-                       : "=m" (*mem)                                         \
-                       : "m" (*mem),                                         \
-                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)                                             \
-      __asm __volatile (lock "decl %0"                                       \
-                       : "=m" (*mem)                                         \
-                       : "m" (*mem),                                         \
-                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)                                             \
-      __asm __volatile (lock "decq %q0"                                              \
-                       : "=m" (*mem)                                         \
-                       : "m" (*mem),                                         \
-                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else                                                                     \
-      do_add_val_64_acq (pfx, mem, -1);                                              \
-  } while (0)
-
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+/* ??? Remove when catomic_exchange_and_add
+   fallback uses __atomic_fetch_add.  */
+#define catomic_exchange_and_add(mem, value) \
+  __atomic_fetch_add (mem, value, __ATOMIC_ACQUIRE)
 
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */