This patch fixes a poor interaction between the splitters for SSE
floating point abs/neg in the i386 backend, and the late-combine pass.
Before reload, these patterns exist as a PARALLEL containing the USE
of a value (pseudo) holding the sign-bit. Currently late-combine
propagates this sign-bit mask from the constant pool, changing the
USE of a REG to the USE of a MEM. This unCSE is reasonable if this
MEM is used only once, but less than optimal if this MEM is accessed
many times.
The problem is that this USE doesn't currently have a cost in
ix86_insn_cost, so propagating this load from memory into the USE
makes it free (to combine's profitable replacement calculation).
This patch improves things by providing a nominal cost for USEs of
MEM.
As an example, consider the following function:
float x, y, z;
void foo()
{
x = -x;
y = -y;
z = -z;
}
Currently with -O2 GCC generates three loads from the constant pool:
movss x(%rip), %xmm0
xorps .LC0(%rip), %xmm0
movss %xmm0, x(%rip)
movss y(%rip), %xmm0
xorps .LC0(%rip), %xmm0
movss %xmm0, y(%rip)
movss z(%rip), %xmm0
xorps .LC0(%rip), %xmm0
movss %xmm0, z(%rip)
ret
With the patch below, this load remains CSEd.
movss x(%rip), %xmm0
movss .LC0(%rip), %xmm1
xorps %xmm1, %xmm0
movss %xmm0, x(%rip)
movss y(%rip), %xmm0
xorps %xmm1, %xmm0
movss %xmm0, y(%rip)
movss z(%rip), %xmm0
xorps %xmm1, %xmm0
movss %xmm0, z(%rip)
ret
Note this is one more instruction, but code size is smaller and
the total cost (as calculated by the i386 backend) is lower.
For a single neg/abs the memory address is still propagated.
2026-06-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386.cc (ix86_insn_cost): Add a suitable penalty
for USE of a MEM in a PARALLEL (for *<absneg>[sd]f2_1 splitter).
gcc/testsuite/ChangeLog
* gcc.target/i386/fabsneg-2.c: New test case.
== AVX_PARTIAL_XMM_UPDATE_TRUE)
insn_cost += COSTS_N_INSNS (3);
- return insn_cost + pattern_cost (PATTERN (insn), speed);
+ rtx pat = PATTERN (insn);
+ /* A USE of a memory is more expensive than a use of a REG.
+ For example *<absneg>mode2_1's use of a signbit mask. */
+ if (GET_CODE (pat) == PARALLEL)
+ {
+ for (int i = 0; i < XVECLEN (pat, 0); i++)
+ {
+ rtx x = XVECEXP (pat, 0, i);
+ if (GET_CODE (x) == USE && MEM_P (XEXP (x, 0)))
+ insn_cost += !speed ? COSTS_N_BYTES (4)
+ : TARGET_64BIT ? COSTS_N_INSNS (1) + 1
+ : COSTS_N_INSNS (3) + 1;
+ }
+ }
+
+ return insn_cost + pattern_cost (pat, speed);
}
/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates). */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfpmath=sse -march=skylake" } */
+
+float x;
+float y;
+float z;
+
+void foo()
+{
+ x = -x;
+ y = -y;
+ z = -z;
+}
+
+void bar()
+{
+ x = __builtin_fabsf(x);
+ y = __builtin_fabsf(y);
+ z = __builtin_fabsf(z);
+}
+
+/* { dg-final { scan-assembler-times "LC0\[,(\]" 1 } } */
+/* { dg-final { scan-assembler-times "LC1\[,(\]" 1 } } */