From: Uros Bizjak
Date: Mon, 8 Sep 2025 12:38:20 +0000 (+0200)
Subject: x86: Remove catomic_* locking primitives
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4eef002328ddf70f6d5f4af856f923e701ffe7e3;p=thirdparty%2Fglibc.git

x86: Remove catomic_* locking primitives

Remove the obsolete catomic_* locking primitives, which don't map to
standard compiler builtins.  There are still a couple of places in the
tree that use them (malloc/arena.c and malloc/malloc.c).

x86 didn't define the __arch_c_compare_and_exchange_bool_* primitives,
so the fallback code used the __arch_c_compare_and_exchange_val_*
primitives instead.  This resulted in suboptimal code for
catomic_compare_and_exchange_bool_acq, where a superfluous CMP was
emitted after CMPXCHG, e.g. in arena_get2:

   775b8:  48 8d 4a 01             lea    0x1(%rdx),%rcx
   775bc:  48 89 d0                mov    %rdx,%rax
   775bf:  64 83 3c 25 18 00 00    cmpl   $0x0,%fs:0x18
   775c6:  00 00
   775c8:  74 01                   je     775cb
   775ca:  f0 48 0f b1 0d 75 3d    lock cmpxchg %rcx,0x163d75(%rip)        # 1db348
   775d1:  16 00
   775d3:  48 39 c2                cmp    %rax,%rdx
   775d6:  74 7f                   je     77657

that now becomes:

   775b8:  48 8d 4a 01             lea    0x1(%rdx),%rcx
   775bc:  48 89 d0                mov    %rdx,%rax
   775bf:  f0 48 0f b1 0d 80 3d    lock cmpxchg %rcx,0x163d80(%rip)        # 1db348
   775c6:  16 00
   775c8:  74 7f                   je     77649

OTOH, catomic_decrement does not fall back to the atomic_fetch_add
builtin (adding -1) but to a cmpxchg loop, so the generated code in
arena_get2 regresses a bit, from using a LOCK DECQ insn:

   77829:  64 83 3c 25 18 00 00    cmpl   $0x0,%fs:0x18
   77830:  00 00
   77832:  74 01                   je     77835
   77834:  f0 48 ff 0d 0c 3b 16    lock decq 0x163b0c(%rip)        # 1db348
   7783b:  00

to a cmpxchg loop:

   7783d:  48 8b 0d 04 3b 16 00    mov    0x163b04(%rip),%rcx        # 1db348
   77844:  48 8d 71 ff             lea    -0x1(%rcx),%rsi
   77848:  48 89 c8                mov    %rcx,%rax
   7784b:  f0 48 0f b1 35 f4 3a    lock cmpxchg %rsi,0x163af4(%rip)        # 1db348
   77852:  16 00
   77854:  0f 84 c9 fa ff ff       je     77323
   7785a:  eb e1                   jmp    7783d

Defining catomic_exchange_and_add using __atomic_fetch_add solves the
above issue and generates optimal code:

   77809:  f0 48 83 2d 36 3b 16    lock subq $0x1,0x163b36(%rip)        # 1db348
   77810:  00 01

Depending on the target processor, the compiler may emit either a
'LOCK ADD/SUB $1, m' or a 'LOCK INC/DEC m' instruction, due to the
partial flag register stall issue that affects INC/DEC on some
processors.

Signed-off-by: Uros Bizjak
Cc: Florian Weimer
Cc: Adhemerval Zanella Netto
Cc: Wilco Dijkstra
Cc: Collin Funk
Cc: H.J. Lu
Cc: Carlos O'Donell
Reviewed-by: Adhemerval Zanella
---
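A minimal stand-alone sketch of the code-generation effect described
above (not glibc code; the counter and counter_* names below are made
up for illustration).  Compiled with gcc -O2 on x86-64, the
__atomic_fetch_add helper typically lowers to a single locked ADD/SUB
(or INC/DEC), the boolean compare-exchange branches directly on the
flags set by CMPXCHG, and the explicit CAS loop mirrors what the
generic catomic fallback produced before catomic_exchange_and_add was
mapped to __atomic_fetch_add:

/* Hypothetical example, not from glibc: shows why the builtin forms
   generate better code than the removed catomic_* fallbacks.  */
#include <stdint.h>

static uint64_t counter;	/* Stand-in for e.g. a malloc arena counter.  */

/* Builtin fetch-and-add: with value == -1 this typically compiles to a
   single "lock subq $0x1, counter(%rip)" (or "lock decq") on x86-64.  */
static inline void
counter_add (int64_t value)
{
  __atomic_fetch_add (&counter, value, __ATOMIC_ACQUIRE);
}

/* Boolean compare-and-exchange: the builtin already yields the
   comparison result, so the branch uses the flags from CMPXCHG
   directly and no extra CMP is emitted (the problem the old
   *_val_*-based fallback had).  */
static inline int
counter_bump_if_equal (uint64_t oldval)
{
  uint64_t expected = oldval;
  return __atomic_compare_exchange_n (&counter, &expected, oldval + 1,
				      0, __ATOMIC_ACQUIRE,
				      __ATOMIC_RELAXED);
}

/* Hand-written CAS loop: roughly what the generic catomic_decrement
   fallback expanded to (MOV + LEA + LOCK CMPXCHG + branch) before
   catomic_exchange_and_add was defined via __atomic_fetch_add.  */
static inline void
counter_add_cas_loop (int64_t value)
{
  uint64_t old = __atomic_load_n (&counter, __ATOMIC_RELAXED);
  while (!__atomic_compare_exchange_n (&counter, &old, old + value,
				       0, __ATOMIC_ACQUIRE,
				       __ATOMIC_RELAXED))
    ;	/* 'old' was reloaded by the failed CAS; retry.  */
}

int
main (void)
{
  counter_add (-1);
  counter_add_cas_loop (-1);
  return counter_bump_if_equal (0);
}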
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index 9b5019c4f1..d5b2d49031 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -20,18 +20,13 @@
 #define _X86_ATOMIC_MACHINE_H 1
 
 #include <stdint.h>
-#include <tls.h>			/* For tcbhead_t.  */
 #include <libc-pointer-arith.h>		/* For cast_to_integer.  */
 
-#define LOCK_PREFIX "lock;"
-
 #define USE_ATOMIC_COMPILER_BUILTINS 1
 
 #ifdef __x86_64__
 # define __HAVE_64B_ATOMICS 1
 # define SP_REG "rsp"
-# define SEG_REG "fs"
-# define BR_CONSTRAINT "q"
 #else
 /* Since the Pentium, i386 CPUs have supported 64-bit atomics, but the
    i386 psABI supplement provides only 4-byte alignment for uint64_t
@@ -39,8 +34,6 @@
    atomics on this platform.  */
 # define __HAVE_64B_ATOMICS 0
 # define SP_REG "esp"
-# define SEG_REG "gs"
-# define BR_CONSTRAINT "r"
 #endif
 
 #define ATOMIC_EXCHANGE_USES_CAS 0
@@ -49,76 +42,6 @@
 #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
-
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret; \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t" \
-                       "je 0f\n\t" \
-                       "lock\n" \
-                       "0:\tcmpxchgb %b2, %1" \
-                       : "=a" (ret), "=m" (*mem) \
-                       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval), \
-                         "i" (offsetof (tcbhead_t, multiple_threads))); \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret; \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t" \
-                       "je 0f\n\t" \
-                       "lock\n" \
-                       "0:\tcmpxchgw %w2, %1" \
-                       : "=a" (ret), "=m" (*mem) \
-                       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval), \
-                         "i" (offsetof (tcbhead_t, multiple_threads))); \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret; \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t" \
-                       "je 0f\n\t" \
-                       "lock\n" \
-                       "0:\tcmpxchgl %2, %1" \
-                       : "=a" (ret), "=m" (*mem) \
-                       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval), \
-                         "i" (offsetof (tcbhead_t, multiple_threads))); \
-     ret; })
-
-#ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret; \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t" \
-                       "je 0f\n\t" \
-                       "lock\n" \
-                       "0:\tcmpxchgq %q2, %1" \
-                       : "=a" (ret), "=m" (*mem) \
-                       : "q" ((int64_t) cast_to_integer (newval)), \
-                         "m" (*mem), \
-                         "0" ((int64_t) cast_to_integer (oldval)), \
-                         "i" (offsetof (tcbhead_t, multiple_threads))); \
-     ret; })
-# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
-#else
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem); \
-     __atomic_link_error (); \
-     ret = (newval); \
-     ret = (oldval); \
-     ret; })
-
-# define do_add_val_64_acq(pfx, mem, value) \
-  { \
-    __typeof (value) __addval = (value); \
-    __typeof (mem) __memp = (mem); \
-    __typeof (*mem) __oldval = *__memp; \
-    __typeof (*mem) __tmpval; \
-    do \
-      __tmpval = __oldval; \
-    while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
-            (__memp, __oldval + __addval, __oldval)) == __tmpval); \
-  }
-#endif
-
-
 /* Note that we need no lock prefix.  */
 #define atomic_exchange_acq(mem, newvalue) \
   ({ __typeof (*mem) result; \
@@ -146,37 +69,10 @@
       } \
     result; })
 
-#define __arch_decrement_body(lock, pfx, mem) \
-  do { \
-    if (sizeof (*mem) == 1) \
-      __asm __volatile (lock "decb %b0" \
-                        : "=m" (*mem) \
-                        : "m" (*mem), \
-                          "i" (offsetof (tcbhead_t, multiple_threads))); \
-    else if (sizeof (*mem) == 2) \
-      __asm __volatile (lock "decw %w0" \
-                        : "=m" (*mem) \
-                        : "m" (*mem), \
-                          "i" (offsetof (tcbhead_t, multiple_threads))); \
-    else if (sizeof (*mem) == 4) \
-      __asm __volatile (lock "decl %0" \
-                        : "=m" (*mem) \
-                        : "m" (*mem), \
-                          "i" (offsetof (tcbhead_t, multiple_threads))); \
-    else if (__HAVE_64B_ATOMICS) \
-      __asm __volatile (lock "decq %q0" \
-                        : "=m" (*mem) \
-                        : "m" (*mem), \
-                          "i" (offsetof (tcbhead_t, multiple_threads))); \
-    else \
-      do_add_val_64_acq (pfx, mem, -1); \
-  } while (0)
-
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+/* ??? Remove when catomic_exchange_and_add
+   fallback uses __atomic_fetch_add.  */
+#define catomic_exchange_and_add(mem, value) \
+  __atomic_fetch_add (mem, value, __ATOMIC_ACQUIRE)
 
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */