This patch fixes two issues with the handling of 128-bit TImode integer
constants in the x86_64 backend. The main issue is that GCC always
tries to load 128-bit integer constants via broadcasts to vector SSE
registers, even if the result is required in general registers. This
is seen in the two closely related functions below:
__int128 m;
void foo() { m &= CONST; }
void bar() { m = CONST; }
When compiled with -O2 -mavx, we currently generate:
foo:    movabsq $81985529216486895, %rax
        vmovq   %rax, %xmm0
        vpunpcklqdq %xmm0, %xmm0, %xmm0
        vmovq   %xmm0, %rax
        vpextrq $1, %xmm0, %rdx
        andq    %rax, m(%rip)
        andq    %rdx, m+8(%rip)
        ret
bar:    movabsq $81985529216486895, %rax
        vmovq   %rax, %xmm1
        vpunpcklqdq %xmm1, %xmm1, %xmm0
        vpextrq $1, %xmm0, %rdx
        vmovq   %xmm0, m(%rip)
        movq    %rdx, m+8(%rip)
        ret
With this patch we defer the decision to use a vector broadcast for
TImode until we know that we actually want an SSE register result,
by moving the call to ix86_convert_const_wide_int_to_broadcast from
the RTL expansion pass to the scalar-to-vector (STV) pass. With
this change (and a minor tweak described below) we now generate:
foo:    movabsq $81985529216486895, %rax
        andq    %rax, m(%rip)
        andq    %rax, m+8(%rip)
        ret

bar:    movabsq $81985529216486895, %rax
        vmovq   %rax, %xmm0
        vpunpcklqdq %xmm0, %xmm0, %xmm0
        vmovdqa %xmm0, m(%rip)
        ret
showing that we now use vector broadcasts only where appropriate.
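
For readers less familiar with the idiom: the broadcast in bar
materializes a 128-bit constant whose two 64-bit halves are identical
by duplicating a single quadword in an SSE register. A minimal
intrinsics sketch of the same movabsq/vmovq/vpunpcklqdq pattern
(purely illustrative, not part of the patch; the function name is
invented for the example):

#include <immintrin.h>

/* Duplicate one 64-bit immediate into both halves of an XMM register,
   yielding the 128-bit constant used by foo and bar above.  With -mavx
   this typically compiles to movabsq + vmovq + vpunpcklqdq.  */
__m128i
broadcast_halves (void)
{
  return _mm_set1_epi64x (0x0123456789abcdefLL);
}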
The one minor tweak mentioned above is to enable the un-cprop hi/lo
optimization, which I originally contributed back in September 2004
(https://gcc.gnu.org/pipermail/gcc-patches/2004-September/148756.html),
even when not optimizing for size. Without this (and currently with
just -O2) the function foo above generates:
foo:    movabsq $81985529216486895, %rax
        movabsq $81985529216486895, %rdx
        andq    %rax, m(%rip)
        andq    %rdx, m+8(%rip)
        ret
I'm not sure why (back in 2004) I thought that avoiding the implicit
"movq %rax, %rdx" by using a second load was faster; perhaps it avoided
a dependency and allowed better scheduling. Nowadays "movq %rax, %rdx"
is either eliminated by GCC's hardreg cprop pass or special-cased by
modern hardware, making the first foo above preferable: not only shorter
but also faster.
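
To make the un-cprop transformation concrete, here is a self-contained
sketch of the idea (my own illustration with invented names, not the
GCC implementation; the real code in ix86_split_long_move below
operates on its operands[] array):

#include <stdint.h>

/* A split multi-word move copies src[i] into dst[i] for each part.
   Each part is either a register (reg >= 0) or a constant (reg < 0).  */
struct part { int reg; int64_t val; };

/* Whenever a later part re-materializes the same nonzero constant that
   an earlier part already loaded into a register, reuse that register
   instead: one movabsq instead of two.  */
void
uncse_parts (struct part *dst, struct part *src, int nparts)
{
  for (int j = 0; j < nparts - 1; j++)
    if (src[j].reg < 0 && src[j].val != 0 && dst[j].reg >= 0)
      for (int i = j; i < nparts - 1; i++)
        if (src[i + 1].reg < 0 && src[i + 1].val == src[j].val)
          src[i + 1] = dst[j];
}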
2023-12-19  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc
        (ix86_convert_const_wide_int_to_broadcast): Remove static.
        (ix86_expand_move): Don't attempt to convert wide constants
        to SSE using ix86_convert_const_wide_int_to_broadcast here.
        (ix86_split_long_move): Always un-cprop multi-word constants.
        * config/i386/i386-expand.h
        (ix86_convert_const_wide_int_to_broadcast): Prototype here.
        * config/i386/i386-features.cc: Include i386-expand.h.
        (timode_scalar_chain::convert_insn): When converting TImode to
        V1TImode, try ix86_convert_const_wide_int_to_broadcast.

gcc/testsuite/ChangeLog
        * gcc.target/i386/movti-2.c: New test case.
        * gcc.target/i386/movti-3.c: Likewise.
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
-static rtx
+rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */

              return;
            }
        }
-      else if (CONST_WIDE_INT_P (op1)
-              && GET_MODE_SIZE (mode) >= 16)
-        {
-          rtx tmp = ix86_convert_const_wide_int_to_broadcast
-                      (GET_MODE (op0), op1);
-          if (tmp != nullptr)
-            op1 = tmp;
-        }
            }
        }
    }
}

-  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
-  if (optimize_insn_for_size_p ())
-    {
-      for (j = 0; j < nparts - 1; j++)
-        if (CONST_INT_P (operands[6 + j])
-            && operands[6 + j] != const0_rtx
-            && REG_P (operands[2 + j]))
-          for (i = j; i < nparts - 1; i++)
-            if (CONST_INT_P (operands[7 + i])
-                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
-              operands[7 + i] = operands[2 + j];
-    }
+  /* Attempt to locally unCSE nonzero constants.  */
+  for (j = 0; j < nparts - 1; j++)
+    if (CONST_INT_P (operands[6 + j])
+        && operands[6 + j] != const0_rtx
+        && REG_P (operands[2 + j]))
+      for (i = j; i < nparts - 1; i++)
+        if (CONST_INT_P (operands[7 + i])
+            && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
+          operands[7 + i] = operands[2 + j];

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

machine_mode ix86_split_reduction (machine_mode mode);
void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0,
                                 rtx op1, rtx *quot_p, rtx *rem_p);
+rtx ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op);
#endif /* GCC_I386_EXPAND_H */
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
+#include "i386-expand.h"
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
"savms64",
        {
          /* Since there are no instructions to store 128-bit constant,
             temporary register usage is required.  */
+         bool use_move;
          start_sequence ();
-         src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
-         src = validize_mem (force_const_mem (V1TImode, src));
+         tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
+         if (tmp)
+           {
+             src = lowpart_subreg (V1TImode, tmp, TImode);
+             use_move = true;
+           }
+         else
+           {
+             src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
+             src = validize_mem (force_const_mem (V1TImode, src));
+             use_move = MEM_P (dst);
+           }
          rtx_insn *seq = get_insns ();
          end_sequence ();
          if (seq)
            emit_insn_before (seq, insn);
-         if (MEM_P (dst))
+         if (use_move)
            {
              tmp = gen_reg_rtx (V1TImode);
              emit_insn_before (gen_rtx_SET (tmp, src), insn);
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/movti-2.c
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+__int128 m;
+
+void foo()
+{
+ m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler-times "movabsq" 1 } } */
+/* { dg-final { scan-assembler-not "vmovq" } } */
+/* { dg-final { scan-assembler-not "vpunpcklqdq" } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/movti-3.c
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+
+__int128 m;
+
+void bar()
+{
+ m = ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vpextrq" } } */