From: Roger Sayle Date: Tue, 19 Dec 2023 11:24:36 +0000 (+0000) Subject: i386: Improved TImode (128-bit) integer constants on x86_64. X-Git-Tag: basepoints/gcc-15~3457 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e1c440bc84c02e67b1cf338579a3274cdc337e0;p=thirdparty%2Fgcc.git i386: Improved TImode (128-bit) integer constants on x86_64. This patch fixes two issues with the handling of 128-bit TImode integer constants in the x86_64 backend. The main issue is that GCC always tries to load 128-bit integer constants via broadcasts to vector SSE registers, even if the result is required in general registers. This is seen in the two closely related functions below: __int128 m; void foo() { m &= CONST; } void bar() { m = CONST; } When compiled with -O2 -mavx, we currently generate: foo: movabsq $81985529216486895, %rax vmovq %rax, %xmm0 vpunpcklqdq %xmm0, %xmm0, %xmm0 vmovq %xmm0, %rax vpextrq $1, %xmm0, %rdx andq %rax, m(%rip) andq %rdx, m+8(%rip) ret bar: movabsq $81985529216486895, %rax vmovq %rax, %xmm1 vpunpcklqdq %xmm1, %xmm1, %xmm0 vpextrq $1, %xmm0, %rdx vmovq %xmm0, m(%rip) movq %rdx, m+8(%rip) ret With this patch we defer the decision to use vector broadcast for TImode until we know that we actually want a SSE register result, by moving the call to ix86_convert_const_wide_int_to_broadcast from the RTL expansion pass, to the scalar-to-vector (STV) pass. With this change (and a minor tweak described below) we now generate: foo: movabsq $81985529216486895, %rax andq %rax, m(%rip) andq %rax, m+8(%rip) ret bar: movabsq $81985529216486895, %rax vmovq %rax, %xmm0 vpunpcklqdq %xmm0, %xmm0, %xmm0 vmovdqa %xmm0, m(%rip) ret showing that we now correctly use vector mode broadcasts (only) where appropriate. The one minor tweak mentioned above is to enable the un-cprop hi/lo optimization, that I originally contributed back in September 2004 https://gcc.gnu.org/pipermail/gcc-patches/2004-September/148756.html even when not optimizing for size. Without this (and currently with just -O2) the function foo above generates: foo: movabsq $81985529216486895, %rax movabsq $81985529216486895, %rdx andq %rax, m(%rip) andq %rdx, m+8(%rip) ret I'm not sure why (back in 2004) I thought that avoiding the implicit "movq %rax, %rdx" instead of a second load was faster, perhaps avoiding a dependency to allow better scheduling, but nowadays "movq %rax, %rdx" is either eliminated by GCC's hardreg cprop pass, or special cased by modern hardware, making the first foo preferrable, not only shorter but also faster. 2023-12-19 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_convert_const_wide_int_to_broadcast): Remove static. (ix86_expand_move): Don't attempt to convert wide constants to SSE using ix86_convert_const_wide_int_to_broadcast here. (ix86_split_long_move): Always un-cprop multi-word constants. * config/i386/i386-expand.h (ix86_convert_const_wide_int_to_broadcast): Prototype here. * config/i386/i386-features.cc: Include i386-expand.h. (timode_scalar_chain::convert_insn): When converting TImode to V1TImode, try ix86_convert_const_wide_int_to_broadcast. gcc/testsuite/ChangeLog * gcc.target/i386/movti-2.c: New test case. * gcc.target/i386/movti-3.c: Likewise. --- diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index fad4f34f9055..57a108ae4a76 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -289,7 +289,7 @@ ix86_broadcast (HOST_WIDE_INT v, unsigned int width, /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */ -static rtx +rtx ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op) { /* Don't use integer vector broadcast if we can't move from GPR to SSE @@ -541,14 +541,6 @@ ix86_expand_move (machine_mode mode, rtx operands[]) return; } } - else if (CONST_WIDE_INT_P (op1) - && GET_MODE_SIZE (mode) >= 16) - { - rtx tmp = ix86_convert_const_wide_int_to_broadcast - (GET_MODE (op0), op1); - if (tmp != nullptr) - op1 = tmp; - } } } @@ -6323,18 +6315,15 @@ ix86_split_long_move (rtx operands[]) } } - /* If optimizing for size, attempt to locally unCSE nonzero constants. */ - if (optimize_insn_for_size_p ()) - { - for (j = 0; j < nparts - 1; j++) - if (CONST_INT_P (operands[6 + j]) - && operands[6 + j] != const0_rtx - && REG_P (operands[2 + j])) - for (i = j; i < nparts - 1; i++) - if (CONST_INT_P (operands[7 + i]) - && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) - operands[7 + i] = operands[2 + j]; - } + /* Attempt to locally unCSE nonzero constants. */ + for (j = 0; j < nparts - 1; j++) + if (CONST_INT_P (operands[6 + j]) + && operands[6 + j] != const0_rtx + && REG_P (operands[2 + j])) + for (i = j; i < nparts - 1; i++) + if (CONST_INT_P (operands[7 + i]) + && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) + operands[7 + i] = operands[2 + j]; for (i = 0; i < nparts; i++) emit_move_insn (operands[2 + i], operands[6 + i]); diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h index 997cb7db0592..e9e94bf92ae9 100644 --- a/gcc/config/i386/i386-expand.h +++ b/gcc/config/i386/i386-expand.h @@ -57,5 +57,6 @@ bool ix86_notrack_prefixed_insn_p (rtx_insn *); machine_mode ix86_split_reduction (machine_mode mode); void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, rtx op1, rtx *quot_p, rtx *rem_p); +rtx ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op); #endif /* GCC_I386_EXPAND_H */ diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index e6fc135f32f0..3fcbb81254cf 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -90,6 +90,7 @@ along with GCC; see the file COPYING3. If not see #include "dwarf2out.h" #include "i386-builtins.h" #include "i386-features.h" +#include "i386-expand.h" const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { "savms64", @@ -1853,14 +1854,25 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) { /* Since there are no instructions to store 128-bit constant, temporary register usage is required. */ + bool use_move; start_sequence (); - src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); - src = validize_mem (force_const_mem (V1TImode, src)); + tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src); + if (tmp) + { + src = lowpart_subreg (V1TImode, tmp, TImode); + use_move = true; + } + else + { + src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); + src = validize_mem (force_const_mem (V1TImode, src)); + use_move = MEM_P (dst); + } rtx_insn *seq = get_insns (); end_sequence (); if (seq) emit_insn_before (seq, insn); - if (MEM_P (dst)) + if (use_move) { tmp = gen_reg_rtx (V1TImode); emit_insn_before (gen_rtx_SET (tmp, src), insn); diff --git a/gcc/testsuite/gcc.target/i386/movti-2.c b/gcc/testsuite/gcc.target/i386/movti-2.c new file mode 100644 index 000000000000..73f69d290cbd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/movti-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx" } */ +__int128 m; + +void foo() +{ + m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL; +} + +/* { dg-final { scan-assembler-times "movabsq" 1 } } */ +/* { dg-final { scan-assembler-not "vmovq" } } */ +/* { dg-final { scan-assembler-not "vpunpcklqdq" } } */ diff --git a/gcc/testsuite/gcc.target/i386/movti-3.c b/gcc/testsuite/gcc.target/i386/movti-3.c new file mode 100644 index 000000000000..535e5dc2118b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/movti-3.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx" } */ + +__int128 m; + +void bar() +{ + m = ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL; +} + +/* { dg-final { scan-assembler "vmovdqa" } } */ +/* { dg-final { scan-assembler-not "vpextrq" } } */