]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
PR target/103069: Relax cmpxchg loop for x86 target
authorHongyu Wang <hongyu.wang@intel.com>
Fri, 12 Nov 2021 02:50:46 +0000 (10:50 +0800)
committerHongyu Wang <hongyu.wang@intel.com>
Mon, 15 Nov 2021 11:09:38 +0000 (19:09 +0800)
From the CPU's point of view, getting a cache line for writing is more
expensive than reading.  See Appendix A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/
xeon-lock-scaling-analysis-paper.pdf

The full compare and swap will grab the cache line exclusive and causes
excessive cache line bouncing.

The atomic_fetch_{or,xor,and,nand} builtins generates cmpxchg loop under
-march=x86-64 like:

movl v(%rip), %eax
.L2:
movl %eax, %ecx
movl %eax, %edx
orl $1, %ecx
lock cmpxchgl %ecx, v(%rip)
jne .L2
movl %edx, %eax
andl $1, %eax
ret

To relax above loop, GCC should first emit a normal load, check and jump to
.L2 if cmpxchgl may fail. Before jump to .L2, PAUSE should be inserted to
yield the CPU to another hyperthread and to save power, so the code is
like

.L84:
        movl    (%rdi), %ecx
        movl    %eax, %edx
        orl     %esi, %edx
        cmpl    %eax, %ecx
        jne     .L82
        lock cmpxchgl   %edx, (%rdi)
        jne     .L84
.L82:
        rep nop
        jmp     .L84

This patch adds corresponding atomic_fetch_op expanders to insert load/
compare and pause for all the atomic logic fetch builtins. Add flag
-mrelax-cmpxchg-loop to control whether to generate relaxed loop.

gcc/ChangeLog:

PR target/103069
* config/i386/i386-expand.c (ix86_expand_atomic_fetch_op_loop):
New expand function.
* config/i386/i386-options.c (ix86_target_string): Add
-mrelax-cmpxchg-loop flag.
(ix86_valid_target_attribute_inner_p): Likewise.
* config/i386/i386-protos.h (ix86_expand_atomic_fetch_op_loop):
New expand function prototype.
* config/i386/i386.opt: Add -mrelax-cmpxchg-loop.
* config/i386/sync.md (atomic_fetch_<logic><mode>): New expander
for SI,HI,QI modes.
(atomic_<logic>_fetch<mode>): Likewise.
(atomic_fetch_nand<mode>): Likewise.
(atomic_nand_fetch<mode>): Likewise.
(atomic_fetch_<logic><mode>): New expander for DI,TI modes.
(atomic_<logic>_fetch<mode>): Likewise.
(atomic_fetch_nand<mode>): Likewise.
(atomic_nand_fetch<mode>): Likewise.
* doc/invoke.texi: Document -mrelax-cmpxchg-loop.

gcc/testsuite/ChangeLog:

PR target/103069
* gcc.target/i386/pr103069-1.c: New test.
* gcc.target/i386/pr103069-2.c: Ditto.

gcc/config/i386/i386-expand.c
gcc/config/i386/i386-options.c
gcc/config/i386/i386-protos.h
gcc/config/i386/i386.opt
gcc/config/i386/sync.md
gcc/doc/invoke.texi
gcc/testsuite/gcc.target/i386/pr103069-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/pr103069-2.c [new file with mode: 0644]

index 088e6af2258a56bb4eeb1b0402624a48d080d88a..3e4de64ec24365b8c7c6be2cb6e1fecd0c2cf638 100644 (file)
@@ -23138,4 +23138,80 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   *rem_p = rem;
 }
 
+void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+                                      enum rtx_code code, bool after,
+                                      bool doubleword)
+{
+  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
+  rtx_code_label *loop_label, *pause_label;
+  machine_mode mode = GET_MODE (target);
+
+  old_reg = gen_reg_rtx (mode);
+  new_reg = old_reg;
+  loop_label = gen_label_rtx ();
+  pause_label = gen_label_rtx ();
+  old_mem = copy_to_reg (mem);
+  emit_label (loop_label);
+  emit_move_insn (old_reg, old_mem);
+
+  /* return value for atomic_fetch_op.  */
+  if (!after)
+    emit_move_insn (target, old_reg);
+
+  if (code == NOT)
+    {
+      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
+                                    true, OPTAB_LIB_WIDEN);
+      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
+    }
+  else
+    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
+                                  true, OPTAB_LIB_WIDEN);
+
+  /* return value for atomic_op_fetch.  */
+  if (after)
+    emit_move_insn (target, new_reg);
+
+  /* Load memory again inside loop.  */
+  new_mem = copy_to_reg (mem);
+  /* Compare mem value with expected value.  */
+
+  if (doubleword)
+    {
+      machine_mode half_mode = (mode == DImode)? SImode : DImode;
+      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
+      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
+      rtx high_new_mem = gen_highpart (half_mode, new_mem);
+      rtx high_old_mem = gen_highpart (half_mode, old_mem);
+      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
+                              half_mode, 1, pause_label,
+                              profile_probability::guessed_never ());
+      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
+                              half_mode, 1, pause_label,
+                              profile_probability::guessed_never ());
+    }
+  else
+    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
+                            GET_MODE (old_mem), 1, pause_label,
+                            profile_probability::guessed_never ());
+
+  success = NULL_RTX;
+  oldval = old_mem;
+  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
+                                 new_reg, false, MEMMODEL_SYNC_SEQ_CST,
+                                 MEMMODEL_RELAXED);
+  if (oldval != old_mem)
+    emit_move_insn (old_mem, oldval);
+
+  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
+                          GET_MODE (success), 1, loop_label,
+                          profile_probability::guessed_never ());
+
+  /* If mem is not expected, pause and loop back.  */
+  emit_label (pause_label);
+  emit_insn (gen_pause ());
+  emit_jump_insn (gen_jump (loop_label));
+  emit_barrier ();
+}
+
 #include "gt-i386-expand.h"
index a8cc0664f11cb2f20eb967f7409e0620e638a93f..feff2584f41a2c5b7e501d3e8415b8d29708e056 100644 (file)
@@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
     { "-mstv",                         MASK_STV },
     { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
     { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE },
-    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES }
+    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES },
+    { "-mrelax-cmpxchg-loop",          MASK_RELAX_CMPXCHG_LOOP }
   };
 
   /* Additional flag options.  */
@@ -1092,6 +1093,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_IX86_YES ("general-regs-only",
                        OPT_mgeneral_regs_only,
                        OPTION_MASK_GENERAL_REGS_ONLY),
+
+    IX86_ATTR_YES ("relax-cmpxchg-loop",
+                  OPT_mrelax_cmpxchg_loop,
+                  MASK_RELAX_CMPXCHG_LOOP),
   };
 
   location_t loc
index bd52450a148eb84c67c49a409409a9b550856819..7e05510c679d054b08c5ddac6c85d532940b7ec2 100644 (file)
@@ -217,6 +217,8 @@ extern void ix86_move_vector_high_sse_to_mmx (rtx);
 extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
 extern void ix86_split_mmx_punpck (rtx[], bool);
 extern void ix86_expand_avx_vzeroupper (void);
+extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
+                                             bool, bool);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
index ad366974b5b867c437b5c39983bcf5bab2b30013..46fad3cc03808b170514585ea17e152c3b3149d1 100644 (file)
@@ -404,6 +404,10 @@ momit-leaf-frame-pointer
 Target Mask(OMIT_LEAF_FRAME_POINTER) Save
 Omit the frame pointer in leaf functions.
 
+mrelax-cmpxchg-loop
+Target Mask(RELAX_CMPXCHG_LOOP) Save
+Relax cmpxchg loop for atomic_fetch_{or,xor,and,nand} by adding load and cmp before cmpxchg, execute pause and loop back to load and compare if load value is not expected.
+
 mpc32
 Target RejectNegative
 Set 80387 floating-point precision to 32-bit.
index 9716a0b2f2c1b95ca353a5a212fe07322e20402c..cc4fe727bd908379044d286a0bd425cdd1c5fbec 100644 (file)
              (set (reg:CCZ FLAGS_REG)
                   (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))])])
 
+(define_expand "atomic_fetch_<logic><mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")
+    (match_operand:SWI124 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], <CODE>, false,
+                                   false);
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")
+    (match_operand:SWI124 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], <CODE>, true,
+                                   false);
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (match_operand:SWI124 1 "memory_operand")
+   (match_operand:SWI124 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], NOT, false,
+                                   false);
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (match_operand:SWI124 1 "memory_operand")
+   (match_operand:SWI124 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], NOT, true,
+                                   false);
+  DONE;
+})
+
+(define_expand "atomic_fetch_<logic><mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")
+    (match_operand:CASMODE 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                   || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], <CODE>, false,
+                                   doubleword);
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")
+    (match_operand:CASMODE 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                   || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], <CODE>, true,
+                                   doubleword);
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (match_operand:CASMODE 1 "memory_operand")
+   (match_operand:CASMODE 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                   || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], NOT, false,
+                                   doubleword);
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (match_operand:CASMODE 1 "memory_operand")
+   (match_operand:CASMODE 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                   || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                   operands[2], NOT, true,
+                                   doubleword);
+  DONE;
+})
+
+
 ;; For operand 2 nonmemory_operand predicate is used instead of
 ;; register_operand to allow combiner to better optimize atomic
 ;; additions of constants.
index 2d9c1782f332433b7949b74c37e15b9df42a04d8..6070288856c02bc6e76ad1496424332a58719e78 100644 (file)
@@ -1423,7 +1423,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard-reg=@var{reg} @gol
 -mstack-protector-guard-offset=@var{offset} @gol
 -mstack-protector-guard-symbol=@var{symbol} @gol
--mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
+-mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
 -mindirect-branch-register -mneeded}
 
@@ -32330,6 +32330,13 @@ Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mrelax-cmpxchg-loop
+@opindex mrelax-cmpxchg-loop
+Relax cmpxchg loop by emitting an early load and compare before cmpxchg,
+execute pause if load value is not expected. This reduces excessive
+cachline bouncing when and works for all atomic logic fetch builtins
+that generates compare and swap loop.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-1.c b/gcc/testsuite/gcc.target/i386/pr103069-1.c
new file mode 100644 (file)
index 0000000..f819af4
--- /dev/null
@@ -0,0 +1,35 @@
+/* PR target/103068 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic -mrelax-cmpxchg-loop" } */ 
+/* { dg-final { scan-assembler-times "rep;?\[ \\t\]+nop" 32 } } */
+
+#include <stdint.h>
+
+#define FUNC_ATOMIC(TYPE, OP) \
+__attribute__ ((noinline, noclone))    \
+TYPE f_##TYPE##_##OP##_fetch (TYPE *a, TYPE b) \
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED);  \
+} \
+__attribute__ ((noinline, noclone))    \
+TYPE f_##TYPE##_fetch_##OP (TYPE *a, TYPE b)   \
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED);  \
+}
+
+FUNC_ATOMIC (int64_t, and)
+FUNC_ATOMIC (int64_t, nand)
+FUNC_ATOMIC (int64_t, or)
+FUNC_ATOMIC (int64_t, xor)
+FUNC_ATOMIC (int, and)
+FUNC_ATOMIC (int, nand)
+FUNC_ATOMIC (int, or)
+FUNC_ATOMIC (int, xor)
+FUNC_ATOMIC (short, and)
+FUNC_ATOMIC (short, nand)
+FUNC_ATOMIC (short, or)
+FUNC_ATOMIC (short, xor)
+FUNC_ATOMIC (char, and)
+FUNC_ATOMIC (char, nand)
+FUNC_ATOMIC (char, or)
+FUNC_ATOMIC (char, xor)
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c b/gcc/testsuite/gcc.target/i386/pr103069-2.c
new file mode 100644 (file)
index 0000000..8ac824c
--- /dev/null
@@ -0,0 +1,70 @@
+/* PR target/103068 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */ 
+
+#include <stdlib.h>
+#include "pr103069-1.c"
+
+#define FUNC_ATOMIC_RELAX(TYPE, OP) \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))     \
+TYPE relax_##TYPE##_##OP##_fetch (TYPE *a, TYPE b)     \
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED);  \
+} \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))     \
+TYPE relax_##TYPE##_fetch_##OP (TYPE *a, TYPE b)       \
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED);  \
+}
+
+FUNC_ATOMIC_RELAX (int64_t, and)
+FUNC_ATOMIC_RELAX (int64_t, nand)
+FUNC_ATOMIC_RELAX (int64_t, or)
+FUNC_ATOMIC_RELAX (int64_t, xor)
+FUNC_ATOMIC_RELAX (int, and)
+FUNC_ATOMIC_RELAX (int, nand)
+FUNC_ATOMIC_RELAX (int, or)
+FUNC_ATOMIC_RELAX (int, xor)
+FUNC_ATOMIC_RELAX (short, and)
+FUNC_ATOMIC_RELAX (short, nand)
+FUNC_ATOMIC_RELAX (short, or)
+FUNC_ATOMIC_RELAX (short, xor)
+FUNC_ATOMIC_RELAX (char, and)
+FUNC_ATOMIC_RELAX (char, nand)
+FUNC_ATOMIC_RELAX (char, or)
+FUNC_ATOMIC_RELAX (char, xor)
+
+#define TEST_ATOMIC_FETCH_LOGIC(TYPE, OP) \
+{ \
+  TYPE a = 11, b = 101, res, exp; \
+  res = relax_##TYPE##_##OP##_fetch (&a, b); \
+  exp = f_##TYPE##_##OP##_fetch (&a, b);  \
+  if (res != exp) \
+    abort (); \
+  a = 21, b = 92; \
+  res = relax_##TYPE##_fetch_##OP (&a, b); \
+  exp = f_##TYPE##_fetch_##OP (&a, b);  \
+  if (res != exp) \
+    abort (); \
+}
+
+int main (void)
+{
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, and)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, or)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, xor)
+  TEST_ATOMIC_FETCH_LOGIC (int, and)
+  TEST_ATOMIC_FETCH_LOGIC (int, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int, or)
+  TEST_ATOMIC_FETCH_LOGIC (int, xor)
+  TEST_ATOMIC_FETCH_LOGIC (short, and)
+  TEST_ATOMIC_FETCH_LOGIC (short, nand)
+  TEST_ATOMIC_FETCH_LOGIC (short, or)
+  TEST_ATOMIC_FETCH_LOGIC (short, xor)
+  TEST_ATOMIC_FETCH_LOGIC (char, and)
+  TEST_ATOMIC_FETCH_LOGIC (char, nand)
+  TEST_ATOMIC_FETCH_LOGIC (char, or)
+  TEST_ATOMIC_FETCH_LOGIC (char, xor)
+  return 0;
+}