From: Roger Sayle Date: Tue, 21 Oct 2025 12:14:58 +0000 (+0100) Subject: x86_64: Start TImode STV chains from zero-extension or *concatditi. X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f4afefbbbee1414e130ca2f1552216bb702a985c;p=thirdparty%2Fgcc.git x86_64: Start TImode STV chains from zero-extension or *concatditi. Currently x86_64's TImode STV pass has the restriction that candidate chains must start with a TImode load from memory. This patch improves the functionality of STV to allow zero-extensions and construction of TImode pseudos from two DImode values (i.e. *concatditi) to both be considered candidate chain initiators. For example, this allows chains starting from an __int128 function argument to be processed by STV. Compiled with -O2 on x86_64: __int128 m0,m1,m2,m3; void foo(__int128 m) { m0 = m; m1 = m; m2 = m; m3 = m; } Previously generated: foo: xchgq %rdi, %rsi movq %rsi, m0(%rip) movq %rdi, m0+8(%rip) movq %rsi, m1(%rip) movq %rdi, m1+8(%rip) movq %rsi, m2(%rip) movq %rdi, m2+8(%rip) movq %rsi, m3(%rip) movq %rdi, m3+8(%rip) ret With the patch, we now generate: foo: movq %rdi, %xmm0 movq %rsi, %xmm1 punpcklqdq %xmm1, %xmm0 movaps %xmm0, m0(%rip) movaps %xmm0, m1(%rip) movaps %xmm0, m2(%rip) movaps %xmm0, m3(%rip) ret or with -mavx2: foo: vmovq %rdi, %xmm1 vpinsrq $1, %rsi, %xmm1, %xmm0 vmovdqa %xmm0, m0(%rip) vmovdqa %xmm0, m1(%rip) vmovdqa %xmm0, m2(%rip) vmovdqa %xmm0, m3(%rip) ret Likewise, for zero-extension: __int128 m0,m1,m2,m3; void bar(unsigned long x) { __int128 m = x; m0 = m; m1 = m; m2 = m; m3 = m; } Previously with -O2: bar: movq %rdi, m0(%rip) movq $0, m0+8(%rip) movq %rdi, m1(%rip) movq $0, m1+8(%rip) movq %rdi, m2(%rip) movq $0, m2+8(%rip) movq %rdi, m3(%rip) movq $0, m3+8(%rip) ret with this patch: bar: movq %rdi, %xmm0 movaps %xmm0, m0(%rip) movaps %xmm0, m1(%rip) movaps %xmm0, m2(%rip) movaps %xmm0, m3(%rip) ret As shown in the examples above, the scalar-to-vector (STV) conversion of *concatditi has an overhead [treating 
two DImode registers as a TImode value is free on x86_64], but specifying this penalty allows the STV pass to make an informed decision if the total cost/gain of the chain is a net win. 2025-10-21 Roger Sayle gcc/ChangeLog * config/i386/i386-features.cc (timode_concatdi_p): New function to recognize the various variants of *concatditi3_[1-7]. (scalar_chain::add_insn): Like VEC_SELECT, ZERO_EXTEND and timode_concatdi_p instructions don't require their input operands to be converted (to TImode). (timode_scalar_chain::compute_convert_gain): Split/clone XOR and IOR cases from AND case, to handle timode_concatdi_p costs. <case PLUS>: Handle timode_concatdi_p conversion costs. <case ZERO_EXTEND>: Provide costs of DImode to TImode extension. (timode_convert_concatdi): Helper function to transform a *concatditi3 instruction into a vec_concatv2di instruction. (timode_scalar_chain::convert_insn): Split/clone XOR and IOR cases from AND case, to handle timode_concatdi_p using the new timode_convert_concatdi helper function. <case ZERO_EXTEND>: Convert zero_extendditi2 to *vec_concatv2di_0. <case PLUS>: Handle timode_concatdi_p using the new timode_convert_concatdi helper function. (timode_scalar_to_vector_candidate_p): Support timode_concatdi_p instructions in IOR, XOR and PLUS cases. <case ZERO_EXTEND>: Consider zero extension of a register from DImode to TImode to be a candidate. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-10.c: New test case. * gcc.target/i386/sse4_1-stv-11.c: Likewise. * gcc.target/i386/sse4_1-stv-12.c: Likewise. --- diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 9348f55c2cd..8e277843f23 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref, return true; } +/* Check whether X is a convertible *concatditi_? variant. X is known + to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. 
*/ + +static bool +timode_concatdi_p (rtx x) +{ + rtx op0 = XEXP (x, 0); + rtx op1 = XEXP (x, 1); + + if (GET_CODE (op1) == ASHIFT) + std::swap (op0, op1); + + return GET_CODE (op0) == ASHIFT + && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND + && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode + && REG_P (XEXP (XEXP (op0, 0), 0)) + && CONST_INT_P (XEXP (op0, 1)) + && INTVAL (XEXP (op0, 1)) == 64 + && GET_CODE (op1) == ZERO_EXTEND + && GET_MODE (XEXP (op1, 0)) == DImode + && REG_P (XEXP (op1, 0)); +} + + /* Add instruction into a chain. Return true if OK, false if the search was aborted. */ @@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid, if (!analyze_register_chain (candidates, ref, disallowed)) return false; - /* The operand(s) of VEC_SELECT don't need to be converted/convertible. */ - if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT) - return true; + /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need + to be converted/convertible. */ + if (def_set) + switch (GET_CODE (SET_SRC (def_set))) + { + case VEC_SELECT: + return true; + case ZERO_EXTEND: + if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode) + return true; + break; + case PLUS: + case IOR: + case XOR: + if (smode == TImode && timode_concatdi_p (SET_SRC (def_set))) + return true; + break; + default: + break; + } for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) if (!DF_REF_REG_MEM_P (ref)) @@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain () break; case AND: + if (!MEM_P (dst)) + igain = COSTS_N_INSNS (1); + if (CONST_SCALAR_INT_P (XEXP (src, 1))) + igain += timode_immed_const_gain (XEXP (src, 1), bb); + break; + case XOR: case IOR: + if (timode_concatdi_p (src)) + { + /* vmovq;vpinsrq (11 bytes). */ + igain = speed_p ? 
-2 * ix86_cost->sse_to_integer + : -COSTS_N_BYTES (11); + break; + } if (!MEM_P (dst)) igain = COSTS_N_INSNS (1); if (CONST_SCALAR_INT_P (XEXP (src, 1))) igain += timode_immed_const_gain (XEXP (src, 1), bb); break; + case PLUS: + if (timode_concatdi_p (src)) + /* vmovq;vpinsrq (11 bytes). */ + igain = speed_p ? -2 * ix86_cost->sse_to_integer + : -COSTS_N_BYTES (11); + break; + case ASHIFT: case LSHIFTRT: /* See ix86_expand_v1ti_shift. */ @@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain () igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1); break; + case ZERO_EXTEND: + if (GET_MODE (XEXP (src, 0)) == DImode) + /* xor (2 bytes) vs. vmovq (5 bytes). */ + igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer + : -COSTS_N_BYTES (3); + break; + default: break; } @@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg) } } +/* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction. + Insert this before INSN, and return the result as a V1TImode subreg. */ + +static rtx +timode_convert_concatdi (rtx src, rtx_insn *insn) +{ + rtx hi, lo; + rtx tmp = gen_reg_rtx (V2DImode); + if (GET_CODE (XEXP (src, 0)) == ASHIFT) + { + hi = XEXP (XEXP (XEXP (src, 0), 0), 0); + lo = XEXP (XEXP (src, 1), 0); + } + else + { + hi = XEXP (XEXP (XEXP (src, 1), 0), 0); + lo = XEXP (XEXP (src, 0), 0); + } + emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn); + return gen_rtx_SUBREG (V1TImode, tmp, 0); +} + /* Convert INSN from TImode to V1T1mode. 
*/ void @@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) PUT_MODE (src, V1TImode); break; } - /* FALLTHRU */ + convert_op (&XEXP (src, 0), insn); + convert_op (&XEXP (src, 1), insn); + PUT_MODE (src, V1TImode); + if (MEM_P (dst)) + { + tmp = gen_reg_rtx (V1TImode); + emit_insn_before (gen_rtx_SET (tmp, src), insn); + src = tmp; + } + break; case XOR: case IOR: + if (timode_concatdi_p (src)) + { + src = timode_convert_concatdi (src, insn); + break; + } convert_op (&XEXP (src, 0), insn); convert_op (&XEXP (src, 1), insn); PUT_MODE (src, V1TImode); @@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) PUT_MODE (src, V1TImode); break; + case ZERO_EXTEND: + if (GET_MODE (XEXP (src, 0)) == DImode) + { + /* Convert to *vec_concatv2di_0. */ + rtx tmp = gen_reg_rtx (V2DImode); + rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx); + emit_insn_before (gen_move_insn (tmp, pat), insn); + src = gen_rtx_SUBREG (vmode, tmp, 0); + } + else + gcc_unreachable (); + break; + + case PLUS: + if (timode_concatdi_p (src)) + src = timode_convert_concatdi (src, insn); + else + gcc_unreachable (); + break; + default: gcc_unreachable (); } @@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn) case IOR: case XOR: + if (timode_concatdi_p (src)) + return true; return (REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0))) && (REG_P (XEXP (src, 1)) @@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn) && CONST_INT_P (XEXP (src, 1)) && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0; + case PLUS: + return timode_concatdi_p (src); + + case ZERO_EXTEND: + return REG_P (XEXP (src, 0)) + && GET_MODE (XEXP (src, 0)) == DImode; + default: return false; } diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c new file mode 100644 index 00000000000..229bc459747 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c @@ -0,0 +1,13 @@ +/* { dg-do 
compile { target int128 } } */ +/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */ + +__int128 m0,m1,m2,m3; +void foo(__int128 m) +{ + m0 = m; + m1 = m; + m2 = m; + m3 = m; +} + +/* { dg-final { scan-assembler-times "movaps" 4 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c new file mode 100644 index 00000000000..3508bfb6726 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c @@ -0,0 +1,14 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */ + +__int128 m0,m1,m2,m3; +void foo(unsigned long x) +{ + __int128 m = x; + m0 = m; + m1 = m; + m2 = m; + m3 = m; +} + +/* { dg-final { scan-assembler-times "movaps" 4 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c new file mode 100644 index 00000000000..9587b6405d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c @@ -0,0 +1,14 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */ + +__int128 m0,m1,m2,m3; +void foo(unsigned int x) +{ + __int128 m = x; + m0 = m; + m1 = m; + m2 = m; + m3 = m; +} + +/* { dg-final { scan-assembler-times "movaps" 4 } } */