From: Roger Sayle <roger@nextmovesoftware.com>
Date: Mon, 15 Jun 2026 19:09:55 +0000 (+0100)
Subject: i386: Tweak cost of SSE fabs/fneg in ix86_insn_cost.
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=398d9979ec07ac3f478d6e2d941609cd4eeccf12;p=thirdparty%2Fgcc.git

i386: Tweak cost of SSE fabs/fneg in ix86_insn_cost.

This patch fixes a poor interaction between the splitters for SSE
floating point abs/neg in the i386 backend, and the late-combine pass.
Before reload, these patterns exist as a PARALLEL containing the USE
of a value (pseudo) holding the sign-bit.  Currently late-combine
propagates this sign-bit mask from the constant pool, changing the
USE of a REG to the USE of a MEM.  This unCSE is reasonable if this
MEM is used only once, but less than optimal if this MEM is accessed
many times.

The problem is that this USE doesn't currently have a cost in
ix86_insn_cost, so propagating this load from memory into the USE
makes it free (to combine's profitable replacement calculation).
This patch improves things by providing a nominal cost for USEs of
MEM.

As an example, consider the following function:

float x, y, z;
void foo()
{
  x = -x;
  y = -y;
  z = -z;
}

Currently with -O2 GCC generates three loads from the constant pool:

        movss   x(%rip), %xmm0
        xorps   .LC0(%rip), %xmm0
        movss   %xmm0, x(%rip)
        movss   y(%rip), %xmm0
        xorps   .LC0(%rip), %xmm0
        movss   %xmm0, y(%rip)
        movss   z(%rip), %xmm0
        xorps   .LC0(%rip), %xmm0
        movss   %xmm0, z(%rip)
        ret

With the patch below, this load remains CSEd.

        movss   x(%rip), %xmm0
        movss   .LC0(%rip), %xmm1
        xorps   %xmm1, %xmm0
        movss   %xmm0, x(%rip)
        movss   y(%rip), %xmm0
        xorps   %xmm1, %xmm0
        movss   %xmm0, y(%rip)
        movss   z(%rip), %xmm0
        xorps   %xmm1, %xmm0
        movss   %xmm0, z(%rip)
        ret

Note this is one more instruction, but code size is smaller and
the total cost (as calculated by the i386 backend) is lower.
For a single neg/abs the memory address is still propagated.

2026-06-15  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386.cc (ix86_insn_cost): Add a suitable penalty
	for USE of a MEM in a PARALLEL (for *<absneg>[sd]f2_1 splitter).

gcc/testsuite/ChangeLog
	* gcc.target/i386/fabsneg-2.c: New test case.
---

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 2945081234b..b1fd86c2b32 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22462,7 +22462,22 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
       == AVX_PARTIAL_XMM_UPDATE_TRUE)
     insn_cost += COSTS_N_INSNS (3);
 
-  return insn_cost + pattern_cost (PATTERN (insn), speed);
+  rtx pat = PATTERN (insn);
+  /* A USE of a memory is more expensive than a use of a REG.
+     For example *<absneg>mode2_1's use of a signbit mask.  */
+  if (GET_CODE (pat) == PARALLEL)
+    {
+      for (int i = 0; i < XVECLEN (pat, 0); i++)
+	{
+	  rtx x = XVECEXP (pat, 0, i);
+	  if (GET_CODE (x) == USE && MEM_P (XEXP (x, 0)))
+	    insn_cost += !speed ? COSTS_N_BYTES (4)
+				: TARGET_64BIT ? COSTS_N_INSNS (1) + 1
+					       : COSTS_N_INSNS (3) + 1;
+	}
+    }
+
+  return insn_cost + pattern_cost (pat, speed);
 }
 
 /* Return cost of SSE/AVX FP->FP conversion (extensions and truncates).  */
diff --git a/gcc/testsuite/gcc.target/i386/fabsneg-2.c b/gcc/testsuite/gcc.target/i386/fabsneg-2.c
new file mode 100644
index 00000000000..dd40c755a49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fabsneg-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfpmath=sse -march=skylake" } */
+
+float x;
+float y;
+float z;
+
+void foo()
+{
+  x = -x;
+  y = -y;
+  z = -z;
+}
+
+void bar()
+{
+  x = __builtin_fabsf(x);
+  y = __builtin_fabsf(y);
+  z = __builtin_fabsf(z);
+}
+
+/* { dg-final { scan-assembler-times "LC0\[,(\]" 1 } } */
+/* { dg-final { scan-assembler-times "LC1\[,(\]" 1 } } */