i386: PR target/113231: Improved costs in Scalar-To-Vector (STV) pass.
author     Roger Sayle <roger@nextmovesoftware.com>
           Sun, 7 Jan 2024 17:42:00 +0000 (17:42 +0000)
committer  Roger Sayle <roger@nextmovesoftware.com>
           Sun, 7 Jan 2024 17:42:00 +0000 (17:42 +0000)
This patch improves the cost/gain calculation used during the i386 backend's
SImode/DImode scalar-to-vector (STV) conversion pass.  The current code
handles loads and stores, but doesn't consider that converting other
scalar operations with a memory destination requires an explicit load
before and an explicit store after the vector equivalent.
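
To illustrate (a hedged sketch; the function name and the exact assembly
are only indicative, not taken from the PR), consider an SImode shift
whose destination is memory.  The vector replacement needs an explicit
movd load before and an explicit movd store after the packed shift, and
it is this overhead that the gain calculation previously ignored:

    /* SImode operation with a memory destination.  */
    void
    shift_mem (int *p)
    {
      *p <<= 2;    /* scalar:         sall  $2, (%rdi)
                      STV (roughly):  movd  (%rdi), %xmm0   <- extra load
                                      pslld $2, %xmm0
                                      movd  %xmm0, (%rdi)   <- extra store */
    }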

To ease the review, the significant change looks like:

         /* For operations on memory operands, include the overhead
            of explicit load and store instructions.  */
         if (MEM_P (dst))
           igain += optimize_insn_for_size_p ()
                    ? -COSTS_N_BYTES (8)
                    : (m * (ix86_cost->int_load[2]
                            + ix86_cost->int_store[2])
                       - (ix86_cost->sse_load[sse_cost_idx] +
                          ix86_cost->sse_store[sse_cost_idx]));

however the patch itself is complicated by a change in indentation, which
leads to a number of lines with only whitespace changes.

For architectures where integer load/store costs are the same as vector
load/store costs, there should be no change in code generation except
when optimizing for size (-Os/-Oz).
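
As a concrete illustration of that claim (the cost numbers below are
hypothetical, chosen only to make the arithmetic visible), for SImode
(m == 1) on a tuning where scalar and vector load/store costs match,
the speed branch of the new term cancels out, while the -Os/-Oz branch
charges a flat byte penalty for the extra movd pair:

    /* Hypothetical costs: int_load[2] = int_store[2] = 6,
       sse_load[sse_cost_idx] = sse_store[sse_cost_idx] = 6, m = 1.  */
    igain += 1 * (6 + 6) - (6 + 6);   /* == 0, gain unchanged.  */

    /* Optimizing for size, the adjustment is instead:  */
    igain += -COSTS_N_BYTES (8);      /* ~4 bytes each for the extra
                                         movd load and movd store.  */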

2024-01-07  Roger Sayle  <roger@nextmovesoftware.com>
    Uros Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog
PR target/113231
* config/i386/i386-features.cc (compute_convert_gain): Include
the overhead of explicit load and store (movd) instructions when
converting non-store scalar operations with memory destinations.
Various indentation whitespace fixes.

gcc/testsuite/ChangeLog
PR target/113231
* gcc.target/i386/pr113231.c: New test case.

gcc/config/i386/i386-features.cc
gcc/testsuite/gcc.target/i386/pr113231.c [new file with mode: 0644]

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4ae3e7564d81ce51274c2df66f78e56da1d3eed1..4020b271328af9cce477d861bc916bcae7476dc7 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -563,183 +563,195 @@ general_scalar_chain::compute_convert_gain ()
       else if (MEM_P (src) && REG_P (dst))
        igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
       else
-       switch (GET_CODE (src))
-         {
-         case ASHIFT:
-         case ASHIFTRT:
-         case LSHIFTRT:
-           if (m == 2)
-             {
-               if (INTVAL (XEXP (src, 1)) >= 32)
-                 igain += ix86_cost->add;
-               /* Gain for extend highpart case.  */
-               else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
-                 igain += ix86_cost->shift_const - ix86_cost->sse_op;
-               else
-                 igain += ix86_cost->shift_const;
-             }
+       {
+         /* For operations on memory operands, include the overhead
+            of explicit load and store instructions.  */
+         if (MEM_P (dst))
+           igain += optimize_insn_for_size_p ()
+                    ? -COSTS_N_BYTES (8)
+                    : (m * (ix86_cost->int_load[2]
+                            + ix86_cost->int_store[2])
+                       - (ix86_cost->sse_load[sse_cost_idx] +
+                          ix86_cost->sse_store[sse_cost_idx]));
 
-           igain += ix86_cost->shift_const - ix86_cost->sse_op;
+         switch (GET_CODE (src))
+           {
+           case ASHIFT:
+           case ASHIFTRT:
+           case LSHIFTRT:
+             if (m == 2)
+               {
+                 if (INTVAL (XEXP (src, 1)) >= 32)
+                   igain += ix86_cost->add;
+                 /* Gain for extend highpart case.  */
+                 else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
+                   igain += ix86_cost->shift_const - ix86_cost->sse_op;
+                 else
+                   igain += ix86_cost->shift_const;
+               }
 
-           if (CONST_INT_P (XEXP (src, 0)))
-             igain -= vector_const_cost (XEXP (src, 0));
-           break;
+             igain += ix86_cost->shift_const - ix86_cost->sse_op;
 
-         case ROTATE:
-         case ROTATERT:
-           igain += m * ix86_cost->shift_const;
-           if (TARGET_AVX512VL)
-             igain -= ix86_cost->sse_op;
-           else if (smode == DImode)
-             {
-               int bits = INTVAL (XEXP (src, 1));
-               if ((bits & 0x0f) == 0)
-                 igain -= ix86_cost->sse_op;
-               else if ((bits & 0x07) == 0)
-                 igain -= 2 * ix86_cost->sse_op;
-               else
-                 igain -= 3 * ix86_cost->sse_op;
-             }
-           else if (INTVAL (XEXP (src, 1)) == 16)
-             igain -= ix86_cost->sse_op;
-           else
-             igain -= 2 * ix86_cost->sse_op;
-           break;
-
-         case AND:
-         case IOR:
-         case XOR:
-         case PLUS:
-         case MINUS:
-           igain += m * ix86_cost->add - ix86_cost->sse_op;
-           /* Additional gain for andnot for targets without BMI.  */
-           if (GET_CODE (XEXP (src, 0)) == NOT
-               && !TARGET_BMI)
-             igain += m * ix86_cost->add;
-
-           if (CONST_INT_P (XEXP (src, 0)))
-             igain -= vector_const_cost (XEXP (src, 0));
-           if (CONST_INT_P (XEXP (src, 1)))
-             igain -= vector_const_cost (XEXP (src, 1));
-           if (MEM_P (XEXP (src, 1)))
-             {
-               if (optimize_insn_for_size_p ())
-                 igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
-               else
-                 igain += m * ix86_cost->int_load[2]
-                          - ix86_cost->sse_load[sse_cost_idx];
-             }
-           break;
+             if (CONST_INT_P (XEXP (src, 0)))
+               igain -= vector_const_cost (XEXP (src, 0));
+             break;
 
-         case NEG:
-         case NOT:
-           igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
+           case ROTATE:
+           case ROTATERT:
+             igain += m * ix86_cost->shift_const;
+             if (TARGET_AVX512VL)
+               igain -= ix86_cost->sse_op;
+             else if (smode == DImode)
+               {
+                 int bits = INTVAL (XEXP (src, 1));
+                 if ((bits & 0x0f) == 0)
+                   igain -= ix86_cost->sse_op;
+                 else if ((bits & 0x07) == 0)
+                   igain -= 2 * ix86_cost->sse_op;
+                 else
+                   igain -= 3 * ix86_cost->sse_op;
+               }
+             else if (INTVAL (XEXP (src, 1)) == 16)
+               igain -= ix86_cost->sse_op;
+             else
+               igain -= 2 * ix86_cost->sse_op;
+             break;
 
-           if (GET_CODE (XEXP (src, 0)) != ABS)
-             {
+           case AND:
+           case IOR:
+           case XOR:
+           case PLUS:
+           case MINUS:
+             igain += m * ix86_cost->add - ix86_cost->sse_op;
+             /* Additional gain for andnot for targets without BMI.  */
+             if (GET_CODE (XEXP (src, 0)) == NOT
+                 && !TARGET_BMI)
                igain += m * ix86_cost->add;
-               break;
-             }
-           /* FALLTHRU */
-
-         case ABS:
-         case SMAX:
-         case SMIN:
-         case UMAX:
-         case UMIN:
-           /* We do not have any conditional move cost, estimate it as a
-              reg-reg move.  Comparisons are costed as adds.  */
-           igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
-           /* Integer SSE ops are all costed the same.  */
-           igain -= ix86_cost->sse_op;
-           break;
 
-         case COMPARE:
-           if (XEXP (src, 1) != const0_rtx)
-             {
-               /* cmp vs. pxor;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (m - 3);
-             }
-           else if (GET_CODE (XEXP (src, 0)) != AND)
-             {
-               /* test vs. pshufd;ptest.  */
-               igain += COSTS_N_INSNS (m - 2);
-             }
-           else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
-             {
-               /* and;test vs. pshufd;ptest.  */
-               igain += COSTS_N_INSNS (2 * m - 2);
-             }
-           else if (TARGET_BMI)
-             {
-               /* andn;test vs. pandn;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (2 * m - 3);
-             }
-           else
-             {
-               /* not;and;test vs. pandn;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (3 * m - 3);
-             }
-           break;
+             if (CONST_INT_P (XEXP (src, 0)))
+               igain -= vector_const_cost (XEXP (src, 0));
+             if (CONST_INT_P (XEXP (src, 1)))
+               igain -= vector_const_cost (XEXP (src, 1));
+             if (MEM_P (XEXP (src, 1)))
+               {
+                 if (optimize_insn_for_size_p ())
+                   igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
+                 else
+                   igain += m * ix86_cost->int_load[2]
+                            - ix86_cost->sse_load[sse_cost_idx];
+               }
+             break;
 
-         case CONST_INT:
-           if (REG_P (dst))
-             {
-               if (optimize_insn_for_size_p ())
-                 {
-                   /* xor (2 bytes) vs. xorps (3 bytes).  */
-                   if (src == const0_rtx)
-                     igain -= COSTS_N_BYTES (1);
-                   /* movdi_internal vs. movv2di_internal.  */
-                   /* => mov (5 bytes) vs. movaps (7 bytes).  */
-                   else if (x86_64_immediate_operand (src, SImode))
-                     igain -= COSTS_N_BYTES (2);
-                   else
-                     /* ??? Larger immediate constants are placed in the
-                        constant pool, where the size benefit/impact of
-                        STV conversion is affected by whether and how
-                        often each constant pool entry is shared/reused.
-                        The value below is empirically derived from the
-                        CSiBE benchmark (and the optimal value may drift
-                        over time).  */
-                     igain += COSTS_N_BYTES (0);
-                 }
-               else
-                 {
-                   /* DImode can be immediate for TARGET_64BIT
-                      and SImode always.  */
-                   igain += m * COSTS_N_INSNS (1);
-                   igain -= vector_const_cost (src);
-                 }
-             }
-           else if (MEM_P (dst))
-             {
-               igain += (m * ix86_cost->int_store[2]
-                         - ix86_cost->sse_store[sse_cost_idx]);
-               igain -= vector_const_cost (src);
-             }
-           break;
+           case NEG:
+           case NOT:
+             igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
 
-         case VEC_SELECT:
-           if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
-             {
-               // movd (4 bytes) replaced with movdqa (4 bytes).
-               if (!optimize_insn_for_size_p ())
-                 igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
-             }
-           else
-             {
-               // pshufd; movd replaced with pshufd.
-               if (optimize_insn_for_size_p ())
-                 igain += COSTS_N_BYTES (4);
-               else
-                 igain += ix86_cost->sse_to_integer;
-             }
-           break;
+             if (GET_CODE (XEXP (src, 0)) != ABS)
+               {
+                 igain += m * ix86_cost->add;
+                 break;
+               }
+             /* FALLTHRU */
+
+           case ABS:
+           case SMAX:
+           case SMIN:
+           case UMAX:
+           case UMIN:
+             /* We do not have any conditional move cost, estimate it as a
+                reg-reg move.  Comparisons are costed as adds.  */
+             igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
+             /* Integer SSE ops are all costed the same.  */
+             igain -= ix86_cost->sse_op;
+             break;
 
-         default:
-           gcc_unreachable ();
-         }
+           case COMPARE:
+             if (XEXP (src, 1) != const0_rtx)
+               {
+                 /* cmp vs. pxor;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (m - 3);
+               }
+             else if (GET_CODE (XEXP (src, 0)) != AND)
+               {
+                 /* test vs. pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (m - 2);
+               }
+             else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
+               {
+                 /* and;test vs. pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (2 * m - 2);
+               }
+             else if (TARGET_BMI)
+               {
+                 /* andn;test vs. pandn;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (2 * m - 3);
+               }
+             else
+               {
+                 /* not;and;test vs. pandn;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (3 * m - 3);
+               }
+             break;
+
+           case CONST_INT:
+             if (REG_P (dst))
+               {
+                 if (optimize_insn_for_size_p ())
+                   {
+                     /* xor (2 bytes) vs. xorps (3 bytes).  */
+                     if (src == const0_rtx)
+                       igain -= COSTS_N_BYTES (1);
+                     /* movdi_internal vs. movv2di_internal.  */
+                     /* => mov (5 bytes) vs. movaps (7 bytes).  */
+                     else if (x86_64_immediate_operand (src, SImode))
+                       igain -= COSTS_N_BYTES (2);
+                     else
+                       /* ??? Larger immediate constants are placed in the
+                          constant pool, where the size benefit/impact of
+                          STV conversion is affected by whether and how
+                          often each constant pool entry is shared/reused.
+                          The value below is empirically derived from the
+                          CSiBE benchmark (and the optimal value may drift
+                          over time).  */
+                       igain += COSTS_N_BYTES (0);
+                   }
+                 else
+                   {
+                     /* DImode can be immediate for TARGET_64BIT
+                        and SImode always.  */
+                     igain += m * COSTS_N_INSNS (1);
+                     igain -= vector_const_cost (src);
+                   }
+               }
+             else if (MEM_P (dst))
+               {
+                 igain += (m * ix86_cost->int_store[2]
+                           - ix86_cost->sse_store[sse_cost_idx]);
+                 igain -= vector_const_cost (src);
+               }
+             break;
+
+           case VEC_SELECT:
+             if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+               {
+                 // movd (4 bytes) replaced with movdqa (4 bytes).
+                 if (!optimize_insn_for_size_p ())
+                   igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+               }
+             else
+               {
+                 // pshufd; movd replaced with pshufd.
+                 if (optimize_insn_for_size_p ())
+                   igain += COSTS_N_BYTES (4);
+                 else
+                   igain += ix86_cost->sse_to_integer;
+               }
+             break;
+
+           default:
+             gcc_unreachable ();
+           }
+       }
 
       if (igain != 0 && dump_file)
        {
@@ -757,7 +769,7 @@ general_scalar_chain::compute_convert_gain ()
     {
       cost += n_sse_to_integer * ix86_cost->sse_to_integer;
       /* ???  integer_to_sse but we only have that in the RA cost table.
-              Assume sse_to_integer/integer_to_sse are the same which they
+             Assume sse_to_integer/integer_to_sse are the same which they
              are at the moment.  */
       cost += n_integer_to_sse * ix86_cost->sse_to_integer;
     }
@@ -3147,7 +3159,7 @@ remove_partial_avx_dependency (void)
          insn = NEXT_INSN (insn);
        }
       if (insn == BB_HEAD (bb))
-        set_insn = emit_insn_before (set, insn);
+       set_insn = emit_insn_before (set, insn);
       else
        set_insn = emit_insn_after (set,
                                    insn ? PREV_INSN (insn) : BB_END (bb));
@@ -3321,7 +3333,7 @@ add_condition_to_bb (tree function_decl, tree version_decl,
       predicate_chain = TREE_CHAIN (predicate_chain);
       
       if (and_expr_var == NULL)
-        and_expr_var = cond_var;
+       and_expr_var = cond_var;
       else
        {
          gimple *assign_stmt;
@@ -3338,7 +3350,7 @@ add_condition_to_bb (tree function_decl, tree version_decl,
     }
 
   if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
-                                   integer_zero_node,
+                                   integer_zero_node,
                                    NULL_TREE, NULL_TREE);
   gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
   gimple_set_bb (if_else_stmt, new_bb);
@@ -3435,10 +3447,10 @@ dispatch_function_versions (tree dispatch_decl,
       tree predicate_chain = NULL_TREE;
       unsigned int priority;
       /* Get attribute string, parse it and find the right predicate decl.
-         The predicate function could be a lengthy combination of many
+        The predicate function could be a lengthy combination of many
         features, like arch-type and various isa-variants.  */
       priority = get_builtin_code_for_version (version_decl,
-                                              &predicate_chain);
+                                              &predicate_chain);
 
       if (predicate_chain == NULL_TREE)
        continue;
@@ -3456,7 +3468,7 @@ dispatch_function_versions (tree dispatch_decl,
      to execute,  which one should be dispatched?  In future, allow the user
      to specify a dispatch  priority next to the version.  */
   qsort (function_version_info, actual_versions,
-         sizeof (struct _function_version_info), feature_compare);
+        sizeof (struct _function_version_info), feature_compare);
 
   for  (i = 0; i < actual_versions; ++i)
     *empty_bb = add_condition_to_bb (dispatch_decl,
@@ -3573,7 +3585,7 @@ ix86_get_function_versions_dispatcher (void *decl)
     {
       if (is_function_default_version
            (default_version_info->this_node->decl))
-        break;
+       break;
       default_version_info = default_version_info->next;
     }
 
@@ -3586,7 +3598,7 @@ ix86_get_function_versions_dispatcher (void *decl)
     {
       default_version_info->prev->next = default_version_info->next;
       if (default_version_info->next)
-        default_version_info->next->prev = default_version_info->prev;
+       default_version_info->next->prev = default_version_info->prev;
       first_v->prev = default_version_info;
       default_version_info->next = first_v;
       default_version_info->prev = NULL;
diff --git a/gcc/testsuite/gcc.target/i386/pr113231.c b/gcc/testsuite/gcc.target/i386/pr113231.c
new file mode 100644
index 0000000..f9dcd9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113231.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+void foo(int *i) { *i *= 2; }
+void bar(int *i) { *i <<= 2; }
+void baz(int *i) { *i >>= 2; }
+
+/* { dg-final { scan-assembler-not "movd" } } */