From: Roger Sayle Date: Sun, 7 Jan 2024 17:42:00 +0000 (+0000) Subject: i386: PR target/113231: Improved costs in Scalar-To-Vector (STV) pass. X-Git-Tag: basepoints/gcc-15~3154 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0a8aba760f62e9d66cc5610ecc276c1f0befc651;p=thirdparty%2Fgcc.git i386: PR target/113231: Improved costs in Scalar-To-Vector (STV) pass. This patch improves the cost/gain calculation used during the i386 backend's SImode/DImode scalar-to-vector (STV) conversion pass. The current code handles loads and stores, but doesn't consider that converting other scalar operations with a memory destination requires an explicit load before and an explicit store after the vector equivalent. To ease the review, the significant change looks like: /* For operations on memory operands, include the overhead of explicit load and store instructions. */ if (MEM_P (dst)) igain += optimize_insn_for_size_p () ? -COSTS_N_BYTES (8) : (m * (ix86_cost->int_load[2] + ix86_cost->int_store[2]) - (ix86_cost->sse_load[sse_cost_idx] + ix86_cost->sse_store[sse_cost_idx])); however the patch itself is complicated by a change in indentation which leads to a number of lines with only whitespace changes. For architectures where integer load/store costs are the same as vector load/store costs, there should be no change without -Os/-Oz. 2024-01-07 Roger Sayle Uros Bizjak gcc/ChangeLog PR target/113231 * config/i386/i386-features.cc (compute_convert_gain): Include the overhead of explicit load and store (movd) instructions when converting non-store scalar operations with memory destinations. Various indentation whitespace fixes. gcc/testsuite/ChangeLog PR target/113231 * gcc.target/i386/pr113231.c: New test case. 
--- diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 4ae3e7564d81..4020b271328a 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -563,183 +563,195 @@ general_scalar_chain::compute_convert_gain () else if (MEM_P (src) && REG_P (dst)) igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; else - switch (GET_CODE (src)) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - if (m == 2) - { - if (INTVAL (XEXP (src, 1)) >= 32) - igain += ix86_cost->add; - /* Gain for extend highpart case. */ - else if (GET_CODE (XEXP (src, 0)) == ASHIFT) - igain += ix86_cost->shift_const - ix86_cost->sse_op; - else - igain += ix86_cost->shift_const; - } + { + /* For operations on memory operands, include the overhead + of explicit load and store instructions. */ + if (MEM_P (dst)) + igain += optimize_insn_for_size_p () + ? -COSTS_N_BYTES (8) + : (m * (ix86_cost->int_load[2] + + ix86_cost->int_store[2]) + - (ix86_cost->sse_load[sse_cost_idx] + + ix86_cost->sse_store[sse_cost_idx])); - igain += ix86_cost->shift_const - ix86_cost->sse_op; + switch (GET_CODE (src)) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + if (m == 2) + { + if (INTVAL (XEXP (src, 1)) >= 32) + igain += ix86_cost->add; + /* Gain for extend highpart case. 
*/ + else if (GET_CODE (XEXP (src, 0)) == ASHIFT) + igain += ix86_cost->shift_const - ix86_cost->sse_op; + else + igain += ix86_cost->shift_const; + } - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - break; + igain += ix86_cost->shift_const - ix86_cost->sse_op; - case ROTATE: - case ROTATERT: - igain += m * ix86_cost->shift_const; - if (TARGET_AVX512VL) - igain -= ix86_cost->sse_op; - else if (smode == DImode) - { - int bits = INTVAL (XEXP (src, 1)); - if ((bits & 0x0f) == 0) - igain -= ix86_cost->sse_op; - else if ((bits & 0x07) == 0) - igain -= 2 * ix86_cost->sse_op; - else - igain -= 3 * ix86_cost->sse_op; - } - else if (INTVAL (XEXP (src, 1)) == 16) - igain -= ix86_cost->sse_op; - else - igain -= 2 * ix86_cost->sse_op; - break; - - case AND: - case IOR: - case XOR: - case PLUS: - case MINUS: - igain += m * ix86_cost->add - ix86_cost->sse_op; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - igain += m * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - igain -= vector_const_cost (XEXP (src, 1)); - if (MEM_P (XEXP (src, 1))) - { - if (optimize_insn_for_size_p ()) - igain -= COSTS_N_BYTES (m == 2 ? 
3 : 5); - else - igain += m * ix86_cost->int_load[2] - - ix86_cost->sse_load[sse_cost_idx]; - } - break; + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + break; - case NEG: - case NOT: - igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); + case ROTATE: + case ROTATERT: + igain += m * ix86_cost->shift_const; + if (TARGET_AVX512VL) + igain -= ix86_cost->sse_op; + else if (smode == DImode) + { + int bits = INTVAL (XEXP (src, 1)); + if ((bits & 0x0f) == 0) + igain -= ix86_cost->sse_op; + else if ((bits & 0x07) == 0) + igain -= 2 * ix86_cost->sse_op; + else + igain -= 3 * ix86_cost->sse_op; + } + else if (INTVAL (XEXP (src, 1)) == 16) + igain -= ix86_cost->sse_op; + else + igain -= 2 * ix86_cost->sse_op; + break; - if (GET_CODE (XEXP (src, 0)) != ABS) - { + case AND: + case IOR: + case XOR: + case PLUS: + case MINUS: + igain += m * ix86_cost->add - ix86_cost->sse_op; + /* Additional gain for andnot for targets without BMI. */ + if (GET_CODE (XEXP (src, 0)) == NOT + && !TARGET_BMI) igain += m * ix86_cost->add; - break; - } - /* FALLTHRU */ - - case ABS: - case SMAX: - case SMIN: - case UMAX: - case UMIN: - /* We do not have any conditional move cost, estimate it as a - reg-reg move. Comparisons are costed as adds. */ - igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); - /* Integer SSE ops are all costed the same. */ - igain -= ix86_cost->sse_op; - break; - case COMPARE: - if (XEXP (src, 1) != const0_rtx) - { - /* cmp vs. pxor;pshufd;ptest. */ - igain += COSTS_N_INSNS (m - 3); - } - else if (GET_CODE (XEXP (src, 0)) != AND) - { - /* test vs. pshufd;ptest. */ - igain += COSTS_N_INSNS (m - 2); - } - else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT) - { - /* and;test vs. pshufd;ptest. */ - igain += COSTS_N_INSNS (2 * m - 2); - } - else if (TARGET_BMI) - { - /* andn;test vs. pandn;pshufd;ptest. */ - igain += COSTS_N_INSNS (2 * m - 3); - } - else - { - /* not;and;test vs. pandn;pshufd;ptest. 
*/ - igain += COSTS_N_INSNS (3 * m - 3); - } - break; + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + if (CONST_INT_P (XEXP (src, 1))) + igain -= vector_const_cost (XEXP (src, 1)); + if (MEM_P (XEXP (src, 1))) + { + if (optimize_insn_for_size_p ()) + igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); + else + igain += m * ix86_cost->int_load[2] + - ix86_cost->sse_load[sse_cost_idx]; + } + break; - case CONST_INT: - if (REG_P (dst)) - { - if (optimize_insn_for_size_p ()) - { - /* xor (2 bytes) vs. xorps (3 bytes). */ - if (src == const0_rtx) - igain -= COSTS_N_BYTES (1); - /* movdi_internal vs. movv2di_internal. */ - /* => mov (5 bytes) vs. movaps (7 bytes). */ - else if (x86_64_immediate_operand (src, SImode)) - igain -= COSTS_N_BYTES (2); - else - /* ??? Larger immediate constants are placed in the - constant pool, where the size benefit/impact of - STV conversion is affected by whether and how - often each constant pool entry is shared/reused. - The value below is empirically derived from the - CSiBE benchmark (and the optimal value may drift - over time). */ - igain += COSTS_N_BYTES (0); - } - else - { - /* DImode can be immediate for TARGET_64BIT - and SImode always. */ - igain += m * COSTS_N_INSNS (1); - igain -= vector_const_cost (src); - } - } - else if (MEM_P (dst)) - { - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); - } - break; + case NEG: + case NOT: + igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); - case VEC_SELECT: - if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) - { - // movd (4 bytes) replaced with movdqa (4 bytes). - if (!optimize_insn_for_size_p ()) - igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; - } - else - { - // pshufd; movd replaced with pshufd. 
- if (optimize_insn_for_size_p ()) - igain += COSTS_N_BYTES (4); - else - igain += ix86_cost->sse_to_integer; - } - break; + if (GET_CODE (XEXP (src, 0)) != ABS) + { + igain += m * ix86_cost->add; + break; + } + /* FALLTHRU */ + + case ABS: + case SMAX: + case SMIN: + case UMAX: + case UMIN: + /* We do not have any conditional move cost, estimate it as a + reg-reg move. Comparisons are costed as adds. */ + igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); + /* Integer SSE ops are all costed the same. */ + igain -= ix86_cost->sse_op; + break; - default: - gcc_unreachable (); - } + case COMPARE: + if (XEXP (src, 1) != const0_rtx) + { + /* cmp vs. pxor;pshufd;ptest. */ + igain += COSTS_N_INSNS (m - 3); + } + else if (GET_CODE (XEXP (src, 0)) != AND) + { + /* test vs. pshufd;ptest. */ + igain += COSTS_N_INSNS (m - 2); + } + else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT) + { + /* and;test vs. pshufd;ptest. */ + igain += COSTS_N_INSNS (2 * m - 2); + } + else if (TARGET_BMI) + { + /* andn;test vs. pandn;pshufd;ptest. */ + igain += COSTS_N_INSNS (2 * m - 3); + } + else + { + /* not;and;test vs. pandn;pshufd;ptest. */ + igain += COSTS_N_INSNS (3 * m - 3); + } + break; + + case CONST_INT: + if (REG_P (dst)) + { + if (optimize_insn_for_size_p ()) + { + /* xor (2 bytes) vs. xorps (3 bytes). */ + if (src == const0_rtx) + igain -= COSTS_N_BYTES (1); + /* movdi_internal vs. movv2di_internal. */ + /* => mov (5 bytes) vs. movaps (7 bytes). */ + else if (x86_64_immediate_operand (src, SImode)) + igain -= COSTS_N_BYTES (2); + else + /* ??? Larger immediate constants are placed in the + constant pool, where the size benefit/impact of + STV conversion is affected by whether and how + often each constant pool entry is shared/reused. + The value below is empirically derived from the + CSiBE benchmark (and the optimal value may drift + over time). */ + igain += COSTS_N_BYTES (0); + } + else + { + /* DImode can be immediate for TARGET_64BIT + and SImode always. 
*/ + igain += m * COSTS_N_INSNS (1); + igain -= vector_const_cost (src); + } + } + else if (MEM_P (dst)) + { + igain += (m * ix86_cost->int_store[2] + - ix86_cost->sse_store[sse_cost_idx]); + igain -= vector_const_cost (src); + } + break; + + case VEC_SELECT: + if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) + { + // movd (4 bytes) replaced with movdqa (4 bytes). + if (!optimize_insn_for_size_p ()) + igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; + } + else + { + // pshufd; movd replaced with pshufd. + if (optimize_insn_for_size_p ()) + igain += COSTS_N_BYTES (4); + else + igain += ix86_cost->sse_to_integer; + } + break; + + default: + gcc_unreachable (); + } + } if (igain != 0 && dump_file) { @@ -757,7 +769,7 @@ general_scalar_chain::compute_convert_gain () { cost += n_sse_to_integer * ix86_cost->sse_to_integer; /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they + Assume sse_to_integer/integer_to_sse are the same which they are at the moment. */ cost += n_integer_to_sse * ix86_cost->sse_to_integer; } @@ -3147,7 +3159,7 @@ remove_partial_avx_dependency (void) insn = NEXT_INSN (insn); } if (insn == BB_HEAD (bb)) - set_insn = emit_insn_before (set, insn); + set_insn = emit_insn_before (set, insn); else set_insn = emit_insn_after (set, insn ? 
PREV_INSN (insn) : BB_END (bb)); @@ -3321,7 +3333,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, predicate_chain = TREE_CHAIN (predicate_chain); if (and_expr_var == NULL) - and_expr_var = cond_var; + and_expr_var = cond_var; else { gimple *assign_stmt; @@ -3338,7 +3350,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, } if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, - integer_zero_node, + integer_zero_node, NULL_TREE, NULL_TREE); gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); gimple_set_bb (if_else_stmt, new_bb); @@ -3435,10 +3447,10 @@ dispatch_function_versions (tree dispatch_decl, tree predicate_chain = NULL_TREE; unsigned int priority; /* Get attribute string, parse it and find the right predicate decl. - The predicate function could be a lengthy combination of many + The predicate function could be a lengthy combination of many features, like arch-type and various isa-variants. */ priority = get_builtin_code_for_version (version_decl, - &predicate_chain); + &predicate_chain); if (predicate_chain == NULL_TREE) continue; @@ -3456,7 +3468,7 @@ dispatch_function_versions (tree dispatch_decl, to execute, which one should be dispatched? In future, allow the user to specify a dispatch priority next to the version. 
*/ qsort (function_version_info, actual_versions, - sizeof (struct _function_version_info), feature_compare); + sizeof (struct _function_version_info), feature_compare); for (i = 0; i < actual_versions; ++i) *empty_bb = add_condition_to_bb (dispatch_decl, @@ -3573,7 +3585,7 @@ ix86_get_function_versions_dispatcher (void *decl) { if (is_function_default_version (default_version_info->this_node->decl)) - break; + break; default_version_info = default_version_info->next; } @@ -3586,7 +3598,7 @@ ix86_get_function_versions_dispatcher (void *decl) { default_version_info->prev->next = default_version_info->next; if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; + default_version_info->next->prev = default_version_info->prev; first_v->prev = default_version_info; default_version_info->next = first_v; default_version_info->prev = NULL; diff --git a/gcc/testsuite/gcc.target/i386/pr113231.c b/gcc/testsuite/gcc.target/i386/pr113231.c new file mode 100644 index 000000000000..f9dcd9a3137d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113231.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-Os" } */ + +void foo(int *i) { *i *= 2; } +void bar(int *i) { *i <<= 2; } +void baz(int *i) { *i >>= 2; } + +/* { dg-final { scan-assembler-not "movd" } } */