From: Roger Sayle Date: Sun, 7 Jan 2024 17:42:00 +0000 (+0000) Subject: i386: PR target/113231: Improved costs in Scalar-To-Vector (STV) pass. X-Git-Tag: basepoints/gcc-15~3154 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0a8aba760f62e9d66cc5610ecc276c1f0befc651;p=thirdparty%2Fgcc.git i386: PR target/113231: Improved costs in Scalar-To-Vector (STV) pass. This patch improves the cost/gain calculation used during the i386 backend's SImode/DImode scalar-to-vector (STV) conversion pass. The current code handles loads and stores, but doesn't consider that converting other scalar operations with a memory destination requires an explicit load before and an explicit store after the vector equivalent. To ease the review, the significant change looks like: /* For operations on memory operands, include the overhead of explicit load and store instructions. */ if (MEM_P (dst)) igain += optimize_insn_for_size_p () ? -COSTS_N_BYTES (8) : (m * (ix86_cost->int_load[2] + ix86_cost->int_store[2]) - (ix86_cost->sse_load[sse_cost_idx] + ix86_cost->sse_store[sse_cost_idx])); however the patch itself is complicated by a change in indentation which leads to a number of lines with only whitespace changes. For architectures where integer load/store costs are the same as vector load/store costs, there should be no change without -Os/-Oz. 2024-01-07 Roger Sayle Uros Bizjak gcc/ChangeLog PR target/113231 * config/i386/i386-features.cc (compute_convert_gain): Include the overhead of explicit load and store (movd) instructions when converting non-store scalar operations with memory destinations. Various indentation whitespace fixes. gcc/testsuite/ChangeLog PR target/113231 * gcc.target/i386/pr113231.c: New test case. 
--- diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 4ae3e7564d81..4020b271328a 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -563,183 +563,195 @@ general_scalar_chain::compute_convert_gain () else if (MEM_P (src) && REG_P (dst)) igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; else - switch (GET_CODE (src)) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - if (m == 2) - { - if (INTVAL (XEXP (src, 1)) >= 32) - igain += ix86_cost->add; - /* Gain for extend highpart case. */ - else if (GET_CODE (XEXP (src, 0)) == ASHIFT) - igain += ix86_cost->shift_const - ix86_cost->sse_op; - else - igain += ix86_cost->shift_const; - } + { + /* For operations on memory operands, include the overhead + of explicit load and store instructions. */ + if (MEM_P (dst)) + igain += optimize_insn_for_size_p () + ? -COSTS_N_BYTES (8) + : (m * (ix86_cost->int_load[2] + + ix86_cost->int_store[2]) + - (ix86_cost->sse_load[sse_cost_idx] + + ix86_cost->sse_store[sse_cost_idx])); - igain += ix86_cost->shift_const - ix86_cost->sse_op; + switch (GET_CODE (src)) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + if (m == 2) + { + if (INTVAL (XEXP (src, 1)) >= 32) + igain += ix86_cost->add; + /* Gain for extend highpart case. 
*/ + else if (GET_CODE (XEXP (src, 0)) == ASHIFT) + igain += ix86_cost->shift_const - ix86_cost->sse_op; + else + igain += ix86_cost->shift_const; + } - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - break; + igain += ix86_cost->shift_const - ix86_cost->sse_op; - case ROTATE: - case ROTATERT: - igain += m * ix86_cost->shift_const; - if (TARGET_AVX512VL) - igain -= ix86_cost->sse_op; - else if (smode == DImode) - { - int bits = INTVAL (XEXP (src, 1)); - if ((bits & 0x0f) == 0) - igain -= ix86_cost->sse_op; - else if ((bits & 0x07) == 0) - igain -= 2 * ix86_cost->sse_op; - else - igain -= 3 * ix86_cost->sse_op; - } - else if (INTVAL (XEXP (src, 1)) == 16) - igain -= ix86_cost->sse_op; - else - igain -= 2 * ix86_cost->sse_op; - break; - - case AND: - case IOR: - case XOR: - case PLUS: - case MINUS: - igain += m * ix86_cost->add - ix86_cost->sse_op; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - igain += m * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - igain -= vector_const_cost (XEXP (src, 1)); - if (MEM_P (XEXP (src, 1))) - { - if (optimize_insn_for_size_p ()) - igain -= COSTS_N_BYTES (m == 2 ? 
3 : 5); - else - igain += m * ix86_cost->int_load[2] - - ix86_cost->sse_load[sse_cost_idx]; - } - break; + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + break; - case NEG: - case NOT: - igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); + case ROTATE: + case ROTATERT: + igain += m * ix86_cost->shift_const; + if (TARGET_AVX512VL) + igain -= ix86_cost->sse_op; + else if (smode == DImode) + { + int bits = INTVAL (XEXP (src, 1)); + if ((bits & 0x0f) == 0) + igain -= ix86_cost->sse_op; + else if ((bits & 0x07) == 0) + igain -= 2 * ix86_cost->sse_op; + else + igain -= 3 * ix86_cost->sse_op; + } + else if (INTVAL (XEXP (src, 1)) == 16) + igain -= ix86_cost->sse_op; + else + igain -= 2 * ix86_cost->sse_op; + break; - if (GET_CODE (XEXP (src, 0)) != ABS) - { + case AND: + case IOR: + case XOR: + case PLUS: + case MINUS: + igain += m * ix86_cost->add - ix86_cost->sse_op; + /* Additional gain for andnot for targets without BMI. */ + if (GET_CODE (XEXP (src, 0)) == NOT + && !TARGET_BMI) igain += m * ix86_cost->add; - break; - } - /* FALLTHRU */ - - case ABS: - case SMAX: - case SMIN: - case UMAX: - case UMIN: - /* We do not have any conditional move cost, estimate it as a - reg-reg move. Comparisons are costed as adds. */ - igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); - /* Integer SSE ops are all costed the same. */ - igain -= ix86_cost->sse_op; - break; - case COMPARE: - if (XEXP (src, 1) != const0_rtx) - { - /* cmp vs. pxor;pshufd;ptest. */ - igain += COSTS_N_INSNS (m - 3); - } - else if (GET_CODE (XEXP (src, 0)) != AND) - { - /* test vs. pshufd;ptest. */ - igain += COSTS_N_INSNS (m - 2); - } - else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT) - { - /* and;test vs. pshufd;ptest. */ - igain += COSTS_N_INSNS (2 * m - 2); - } - else if (TARGET_BMI) - { - /* andn;test vs. pandn;pshufd;ptest. */ - igain += COSTS_N_INSNS (2 * m - 3); - } - else - { - /* not;and;test vs. pandn;pshufd;ptest. 
*/ - igain += COSTS_N_INSNS (3 * m - 3); - } - break; + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + if (CONST_INT_P (XEXP (src, 1))) + igain -= vector_const_cost (XEXP (src, 1)); + if (MEM_P (XEXP (src, 1))) + { + if (optimize_insn_for_size_p ()) + igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); + else + igain += m * ix86_cost->int_load[2] + - ix86_cost->sse_load[sse_cost_idx]; + } + break; - case CONST_INT: - if (REG_P (dst)) - { - if (optimize_insn_for_size_p ()) - { - /* xor (2 bytes) vs. xorps (3 bytes). */ - if (src == const0_rtx) - igain -= COSTS_N_BYTES (1); - /* movdi_internal vs. movv2di_internal. */ - /* => mov (5 bytes) vs. movaps (7 bytes). */ - else if (x86_64_immediate_operand (src, SImode)) - igain -= COSTS_N_BYTES (2); - else - /* ??? Larger immediate constants are placed in the - constant pool, where the size benefit/impact of - STV conversion is affected by whether and how - often each constant pool entry is shared/reused. - The value below is empirically derived from the - CSiBE benchmark (and the optimal value may drift - over time). */ - igain += COSTS_N_BYTES (0); - } - else - { - /* DImode can be immediate for TARGET_64BIT - and SImode always. */ - igain += m * COSTS_N_INSNS (1); - igain -= vector_const_cost (src); - } - } - else if (MEM_P (dst)) - { - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); - } - break; + case NEG: + case NOT: + igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); - case VEC_SELECT: - if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) - { - // movd (4 bytes) replaced with movdqa (4 bytes). - if (!optimize_insn_for_size_p ()) - igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; - } - else - { - // pshufd; movd replaced with pshufd. 
- if (optimize_insn_for_size_p ()) - igain += COSTS_N_BYTES (4); - else - igain += ix86_cost->sse_to_integer; - } - break; + if (GET_CODE (XEXP (src, 0)) != ABS) + { + igain += m * ix86_cost->add; + break; + } + /* FALLTHRU */ + + case ABS: + case SMAX: + case SMIN: + case UMAX: + case UMIN: + /* We do not have any conditional move cost, estimate it as a + reg-reg move. Comparisons are costed as adds. */ + igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); + /* Integer SSE ops are all costed the same. */ + igain -= ix86_cost->sse_op; + break; - default: - gcc_unreachable (); - } + case COMPARE: + if (XEXP (src, 1) != const0_rtx) + { + /* cmp vs. pxor;pshufd;ptest. */ + igain += COSTS_N_INSNS (m - 3); + } + else if (GET_CODE (XEXP (src, 0)) != AND) + { + /* test vs. pshufd;ptest. */ + igain += COSTS_N_INSNS (m - 2); + } + else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT) + { + /* and;test vs. pshufd;ptest. */ + igain += COSTS_N_INSNS (2 * m - 2); + } + else if (TARGET_BMI) + { + /* andn;test vs. pandn;pshufd;ptest. */ + igain += COSTS_N_INSNS (2 * m - 3); + } + else + { + /* not;and;test vs. pandn;pshufd;ptest. */ + igain += COSTS_N_INSNS (3 * m - 3); + } + break; + + case CONST_INT: + if (REG_P (dst)) + { + if (optimize_insn_for_size_p ()) + { + /* xor (2 bytes) vs. xorps (3 bytes). */ + if (src == const0_rtx) + igain -= COSTS_N_BYTES (1); + /* movdi_internal vs. movv2di_internal. */ + /* => mov (5 bytes) vs. movaps (7 bytes). */ + else if (x86_64_immediate_operand (src, SImode)) + igain -= COSTS_N_BYTES (2); + else + /* ??? Larger immediate constants are placed in the + constant pool, where the size benefit/impact of + STV conversion is affected by whether and how + often each constant pool entry is shared/reused. + The value below is empirically derived from the + CSiBE benchmark (and the optimal value may drift + over time). */ + igain += COSTS_N_BYTES (0); + } + else + { + /* DImode can be immediate for TARGET_64BIT + and SImode always. 
*/ + igain += m * COSTS_N_INSNS (1); + igain -= vector_const_cost (src); + } + } + else if (MEM_P (dst)) + { + igain += (m * ix86_cost->int_store[2] + - ix86_cost->sse_store[sse_cost_idx]); + igain -= vector_const_cost (src); + } + break; + + case VEC_SELECT: + if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) + { + // movd (4 bytes) replaced with movdqa (4 bytes). + if (!optimize_insn_for_size_p ()) + igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; + } + else + { + // pshufd; movd replaced with pshufd. + if (optimize_insn_for_size_p ()) + igain += COSTS_N_BYTES (4); + else + igain += ix86_cost->sse_to_integer; + } + break; + + default: + gcc_unreachable (); + } + } if (igain != 0 && dump_file) { @@ -757,7 +769,7 @@ general_scalar_chain::compute_convert_gain () { cost += n_sse_to_integer * ix86_cost->sse_to_integer; /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they + Assume sse_to_integer/integer_to_sse are the same which they are at the moment. */ cost += n_integer_to_sse * ix86_cost->sse_to_integer; } @@ -3147,7 +3159,7 @@ remove_partial_avx_dependency (void) insn = NEXT_INSN (insn); } if (insn == BB_HEAD (bb)) - set_insn = emit_insn_before (set, insn); + set_insn = emit_insn_before (set, insn); else set_insn = emit_insn_after (set, insn ? 
PREV_INSN (insn) : BB_END (bb)); @@ -3321,7 +3333,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, predicate_chain = TREE_CHAIN (predicate_chain); if (and_expr_var == NULL) - and_expr_var = cond_var; + and_expr_var = cond_var; else { gimple *assign_stmt; @@ -3338,7 +3350,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, } if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, - integer_zero_node, + integer_zero_node, NULL_TREE, NULL_TREE); gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); gimple_set_bb (if_else_stmt, new_bb); @@ -3435,10 +3447,10 @@ dispatch_function_versions (tree dispatch_decl, tree predicate_chain = NULL_TREE; unsigned int priority; /* Get attribute string, parse it and find the right predicate decl. - The predicate function could be a lengthy combination of many + The predicate function could be a lengthy combination of many features, like arch-type and various isa-variants. */ priority = get_builtin_code_for_version (version_decl, - &predicate_chain); + &predicate_chain); if (predicate_chain == NULL_TREE) continue; @@ -3456,7 +3468,7 @@ dispatch_function_versions (tree dispatch_decl, to execute, which one should be dispatched? In future, allow the user to specify a dispatch priority next to the version. 
*/ qsort (function_version_info, actual_versions, - sizeof (struct _function_version_info), feature_compare); + sizeof (struct _function_version_info), feature_compare); for (i = 0; i < actual_versions; ++i) *empty_bb = add_condition_to_bb (dispatch_decl, @@ -3573,7 +3585,7 @@ ix86_get_function_versions_dispatcher (void *decl) { if (is_function_default_version (default_version_info->this_node->decl)) - break; + break; default_version_info = default_version_info->next; } @@ -3586,7 +3598,7 @@ ix86_get_function_versions_dispatcher (void *decl) { default_version_info->prev->next = default_version_info->next; if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; + default_version_info->next->prev = default_version_info->prev; first_v->prev = default_version_info; default_version_info->next = first_v; default_version_info->prev = NULL; diff --git a/gcc/testsuite/gcc.target/i386/pr113231.c b/gcc/testsuite/gcc.target/i386/pr113231.c new file mode 100644 index 000000000000..f9dcd9a3137d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113231.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-Os" } */ + +void foo(int *i) { *i *= 2; } +void bar(int *i) { *i <<= 2; } +void baz(int *i) { *i >>= 2; } + +/* { dg-final { scan-assembler-not "movd" } } */