From: Roger Sayle Date: Wed, 1 Apr 2026 22:52:26 +0000 (+0100) Subject: PR target/123238: VCOND_MASK regression on x86_64. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=48c2ea1750efe48723dbb04a77d0deb3e242f35f;p=thirdparty%2Fgcc.git PR target/123238: VCOND_MASK regression on x86_64. This patch is my revised fix for (the regression aspects of) PR123238, a code quality regression on x86_64 triggered by the generation of VCOND_MASK. The regression is actually just bad luck. From gimple, VCOND_MASK(a==b,c,d) is equivalent to VCOND_MASK(a!=b,d,c), and which form gets generated was previously arbitrary. This is reasonable for many (most?) targets, but on x86_64 there's an asymmetry: equality can be performed in 1 instruction, but inequality requires three. Teaching the middle-end's expand pass which form is preferred could in theory be done with a new (very specific) target hook, which would require documentation, but a more generic solution is for expand's expand_vec_cond_mask_optab_fn to make use of rtx_costs, and reverse the sense of VCOND_MASK if that would be an improvement. This has the convenient property that the default rtx_costs of all comparison operators is the same, resulting in no change unless explicitly specified by the backend. This revision incorporates the feedback from both Andrew Pinski and Richard Biener, using get_gimple_for_ssa_name instead of SSA_NAME_DEF_STMT, and Andrew's suggestion to log expand's decision to the dump file, which now contains lines such as: ;; swapping operands of .VCOND_MASK ;; cost of original ne: 8 ;; cost of replacement eq: 4 or (for failure) ;; not swapping operands of .VCOND_MASK ;; cost of original eq: 4 ;; cost of replacement ne: 8 Thanks to Richard and Hongtao for approvals. 2026-04-01 Roger Sayle gcc/ChangeLog PR target/123238 * expr.cc (convert_tree_comp_to_rtx): Make global. * expr.h (convert_tree_comp_to_rtx): Prototype here. 
* internal-fn.cc (expand_vec_cond_mask_optab_fn): Use rtx_costs to determine whether swapping operands would result in better code. * config/i386/i386-expand.cc (ix86_expand_int_vec_cmp): On AVX512 targets use a ternlog instead of a comparison to negate the mask (requires one instruction instead of two). * config/i386/i386.cc (ix86_rtx_costs): Refactor code for UNSPEC. Provide costs for UNSPEC_BLENDV and UNSPEC_MOVMSK. Provide costs for comparison operators of integer vector modes. gcc/testsuite/ChangeLog PR target/123238 * gcc.target/i386/pr123238.c: New test case. --- diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a82bb4399c9..366ad513da9 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -5282,11 +5282,17 @@ ix86_expand_int_vec_cmp (rtx operands[]) return false; if (negate) - cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, - CONST0_RTX (GET_MODE (cmp)), - NULL, NULL, &negate); - - gcc_assert (!negate); + { + if (TARGET_AVX512F && GET_MODE_SIZE (GET_MODE (cmp)) >= 16) + cmp = gen_rtx_XOR (GET_MODE (cmp), cmp, CONSTM1_RTX (GET_MODE (cmp))); + else + { + cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, + CONST0_RTX (GET_MODE (cmp)), + NULL, NULL, &negate); + gcc_assert (!negate); + } + } if (operands[0] != cmp) emit_move_insn (operands[0], cmp); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 588e7c9d81d..39136ce5042 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23163,36 +23163,71 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return false; case UNSPEC: - if (XINT (x, 1) == UNSPEC_TP) - *total = 0; - else if (XINT (x, 1) == UNSPEC_VTERNLOG) + switch (XINT (x, 1)) { + case UNSPEC_TP: + *total = 0; + break; + + case UNSPEC_VTERNLOG: *total = cost->sse_op; - *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed); - *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed); - *total += rtx_cost (XVECEXP (x, 0, 2), 
mode, code, 2, speed); + if (!REG_P (XVECEXP (x, 0, 0))) + *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed); + if (!REG_P (XVECEXP (x, 0, 1))) + *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed); + if (!REG_P (XVECEXP (x, 0, 2))) + *total += rtx_cost (XVECEXP (x, 0, 2), mode, code, 2, speed); return true; - } - else if (XINT (x, 1) == UNSPEC_PTEST) - { + + case UNSPEC_PTEST: + { + *total = cost->sse_op; + rtx test_op0 = XVECEXP (x, 0, 0); + if (!rtx_equal_p (test_op0, XVECEXP (x, 0, 1))) + return false; + if (GET_CODE (test_op0) == AND) + { + rtx and_op0 = XEXP (test_op0, 0); + if (GET_CODE (and_op0) == NOT) + and_op0 = XEXP (and_op0, 0); + *total += rtx_cost (and_op0, GET_MODE (and_op0), + AND, 0, speed) + + rtx_cost (XEXP (test_op0, 1), GET_MODE (and_op0), + AND, 1, speed); + } + else + *total = rtx_cost (test_op0, GET_MODE (test_op0), + UNSPEC, 0, speed); + } + return true; + + case UNSPEC_BLENDV: *total = cost->sse_op; - rtx test_op0 = XVECEXP (x, 0, 0); - if (!rtx_equal_p (test_op0, XVECEXP (x, 0, 1))) - return false; - if (GET_CODE (test_op0) == AND) + if (!REG_P (XVECEXP (x, 0, 0))) + *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed); + if (!REG_P (XVECEXP (x, 0, 1))) + *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed); + if (!REG_P (XVECEXP (x, 0, 2))) { - rtx and_op0 = XEXP (test_op0, 0); - if (GET_CODE (and_op0) == NOT) - and_op0 = XEXP (and_op0, 0); - *total += rtx_cost (and_op0, GET_MODE (and_op0), - AND, 0, speed) - + rtx_cost (XEXP (test_op0, 1), GET_MODE (and_op0), - AND, 1, speed); + rtx cond = XVECEXP (x, 0, 2); + if ((GET_CODE (cond) == LT || GET_CODE (cond) == GT) + && CONST_VECTOR_P (XEXP (cond, 1))) + { + /* avx2_blendvpd256_gt and friends. 
*/ + if (!REG_P (XEXP (cond, 0))) + *total += rtx_cost (XEXP (cond, 0), mode, code, 2, speed); + } + else + *total += rtx_cost (cond, mode, code, 2, speed); } - else - *total = rtx_cost (test_op0, GET_MODE (test_op0), - UNSPEC, 0, speed); return true; + + case UNSPEC_MOVMSK: + *total = cost->sse_op; + return true; + + default: + break; } return false; @@ -23409,6 +23444,70 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } return false; + case EQ: + case GT: + case GTU: + case LT: + case LTU: + if (TARGET_SSE2 + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) >= 8) + { + /* vpcmpeq */ + *total = speed ? COSTS_N_INSNS (1) : COSTS_N_BYTES (4); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + return true; + } + if (TARGET_XOP + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) <= 16) + { + /* vpcomeq */ + *total = speed ? COSTS_N_INSNS (1) : COSTS_N_BYTES (6); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + return true; + } + return false; + + case NE: + case GE: + case GEU: + if (TARGET_XOP + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) <= 16) + { + /* vpcomneq */ + *total = speed ? COSTS_N_INSNS (1) : COSTS_N_BYTES (6); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + return true; + } + if (TARGET_SSE2 + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) >= 8) + { + if (TARGET_AVX512F && GET_MODE_SIZE (mode) >= 16) + /* vpcmpeq + vpternlog */ + *total = speed ? COSTS_N_INSNS (2) : COSTS_N_BYTES (11); + else + /* vpcmpeq + pxor + vpcmpeq */ + *total = speed ? 
COSTS_N_INSNS (3) : COSTS_N_BYTES (12); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + return true; + } + return false; + default: return false; } diff --git a/gcc/expr.cc b/gcc/expr.cc index ba00226c4e7..7ca5dd84ce9 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -9121,7 +9121,7 @@ highest_pow2_factor_for_target (const_tree target, const_tree exp) /* Convert the tree comparison code TCODE to the rtl one where the signedness is UNSIGNEDP. */ -static enum rtx_code +enum rtx_code convert_tree_comp_to_rtx (enum tree_code tcode, int unsignedp) { enum rtx_code code; diff --git a/gcc/expr.h b/gcc/expr.h index ddd47cb4ecc..1e89a142d8c 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -338,6 +338,7 @@ extern tree string_constant (tree, tree *, tree *, tree *); a constant. */ extern tree byte_representation (tree, tree *, tree *, tree *); +extern enum rtx_code convert_tree_comp_to_rtx (enum tree_code, int); extern enum tree_code maybe_optimize_mod_cmp (enum tree_code, tree *, tree *); extern void maybe_optimize_sub_cmp_0 (enum tree_code, tree *, tree *); diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index d879568c6e3..65be5aed35c 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -3229,6 +3229,74 @@ expand_vec_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) gcc_assert (icode != CODE_FOR_nothing); + /* Find the comparison generating the mask OP0. 
*/ + tree cmp_op0 = NULL_TREE; + tree cmp_op1 = NULL_TREE; + enum tree_code cmp_code = TREE_CODE (op0); + if (TREE_CODE_CLASS (cmp_code) == tcc_comparison) + { + cmp_op0 = TREE_OPERAND (op0, 0); + cmp_op1 = TREE_OPERAND (op0, 1); + } + else if (cmp_code == SSA_NAME) + { + gimple *def_stmt = get_gimple_for_ssa_name (op0); + if (def_stmt && is_gimple_assign (def_stmt)) + { + cmp_code = gimple_assign_rhs_code (def_stmt); + if (TREE_CODE_CLASS (cmp_code) == tcc_comparison) + { + cmp_op0 = gimple_assign_rhs1 (def_stmt); + cmp_op1 = gimple_assign_rhs2 (def_stmt); + } + } + } + + /* Decide whether to invert comparison based on rtx_cost. */ + if (cmp_op0) + { + enum tree_code rev_code; + tree op_type = TREE_TYPE (cmp_op0); + int unsignedp = TYPE_UNSIGNED (op_type); + rev_code = invert_tree_comparison (cmp_code, HONOR_NANS (op_type)); + + if (rev_code != ERROR_MARK) + { + tree cmp_type = TREE_TYPE (op0); + machine_mode cmp_mode = TYPE_MODE (cmp_type); + machine_mode op_mode = TYPE_MODE (op_type); + bool speed_p = optimize_insn_for_speed_p (); + rtx reg = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1); + enum rtx_code cmp_rtx_code = convert_tree_comp_to_rtx (cmp_code, + unsignedp); + rtx veccmp = gen_rtx_fmt_ee (cmp_rtx_code, cmp_mode, reg, reg); + int old_cost = rtx_cost (veccmp, cmp_mode, SET, 0, speed_p); + enum rtx_code rev_rtx_code = convert_tree_comp_to_rtx (rev_code, + unsignedp); + PUT_CODE (veccmp, rev_rtx_code); + int new_cost = rtx_cost (veccmp, cmp_mode, SET, 0, speed_p); + if (new_cost < old_cost) + { + op0 = fold_build2_loc (EXPR_LOCATION (op0), rev_code, + cmp_type, cmp_op0, cmp_op1); + std::swap (op1, op2); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + ";; %sswapping operands of .VCOND_MASK\n", + new_cost >= old_cost ? 
"not " : ""); + fprintf (dump_file, + ";; cost of original %s: %d\n", + GET_RTX_NAME (cmp_rtx_code), old_cost); + fprintf (dump_file, + ";; cost of replacement %s: %d\n", + GET_RTX_NAME (rev_rtx_code), new_cost); + } + } + } + mask = expand_normal (op0); rtx_op1 = expand_normal (op1); rtx_op2 = expand_normal (op2); diff --git a/gcc/testsuite/gcc.target/i386/pr123238.c b/gcc/testsuite/gcc.target/i386/pr123238.c new file mode 100644 index 00000000000..63906ae0fb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr123238.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ + +void f(char c[]) +{ + for (int i = 0; i < 8; i++) + c[i] = c[i] ? 'a' : 'c'; +} + +/* { dg-final { scan-assembler-times "pcmpeqb" 1 } } */