instead of using a scalar one. */
int
-general_scalar_chain::vector_const_cost (rtx exp)
+general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
{
gcc_assert (CONST_INT_P (exp));
if (standard_sse_constant_p (exp, vmode))
return ix86_cost->sse_op;
+ if (optimize_bb_for_size_p (bb))
+ return COSTS_N_BYTES (8);
/* We have separate costs for SImode and DImode, use SImode costs
for smaller modes. */
- return ix86_cost->sse_load[smode == DImode ? 1 : 0];
+ return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
}
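For reference, here is a minimal standalone sketch (not part of the patch) of the unit conventions the converted costs rely on: move, load and store entries in the cost tables are expressed relative to a reg-reg move of cost 2 (see the "Relative to reg-reg move (2)" comments in the tables below), so COSTS_N_INSNS (table_value) / 2 puts them into the same instruction-count units the gain computation accumulates, while the -Os paths compare encoding sizes via COSTS_N_BYTES. The macro definitions and the tuning value used here are assumptions for illustration, not taken from this patch.

#include <cstdio>

/* Assumed stock definitions (COSTS_N_INSNS from rtl.h, COSTS_N_BYTES as the
   local helper in i386.cc); verify against the tree in use.  */
#define COSTS_N_INSNS(N) ((N) * 4)
#define COSTS_N_BYTES(N) ((N) * 2)

int
main ()
{
  /* Hypothetical tuning value: sse_load[1] (DImode load), expressed
     relative to a reg-reg move of cost 2.  */
  int sse_load_di = 10;

  /* Speed: dividing by 2 converts the table value into COSTS_N_INSNS
     units, matching the arithmetic costs used elsewhere in the gain.  */
  int speed_cost = COSTS_N_INSNS (sse_load_di) / 2;  /* == COSTS_N_INSNS (5) */

  /* Size: the patch charges a flat 8 bytes for a non-standard vector
     constant instead of consulting the load table.  */
  int size_cost = COSTS_N_BYTES (8);

  printf ("speed %d, size %d\n", speed_cost, size_cost);
  return 0;
}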
/* Compute a gain for chain conversion. */
smaller modes than SImode the int load/store costs need to be
adjusted as well. */
unsigned sse_cost_idx = smode == DImode ? 1 : 0;
- unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
+ int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
{
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
+ basic_block bb = BLOCK_FOR_INSN (insn);
int igain = 0;
if (REG_P (src) && REG_P (dst))
- igain += 2 * m - ix86_cost->xmm_move;
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* An integer reg-reg move is 2 bytes, while an SSE move is 3. */
+ igain += COSTS_N_BYTES (2 * m - 3);
+ else
+ /* Move costs are normalized to reg-reg move having cost 2. */
+ igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
+ }
else if (REG_P (src) && MEM_P (dst))
- igain
- += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* An integer load/store is 3+ bytes, while an SSE one is 4+. */
+ igain += COSTS_N_BYTES (3 * m - 4);
+ else
+ igain
+ += COSTS_N_INSNS (m * ix86_cost->int_store[2]
+ - ix86_cost->sse_store[sse_cost_idx]) / 2;
+ }
else if (MEM_P (src) && REG_P (dst))
- igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
+ {
+ if (optimize_bb_for_size_p (bb))
+ igain += COSTS_N_BYTES (3 * m - 4);
+ else
+ igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
+ - ix86_cost->sse_load[sse_cost_idx]) / 2;
+ }
else
{
/* For operations on memory operands, include the overhead
of explicit load and store instructions. */
if (MEM_P (dst))
- igain += optimize_insn_for_size_p ()
- ? -COSTS_N_BYTES (8)
- : (m * (ix86_cost->int_load[2]
- + ix86_cost->int_store[2])
- - (ix86_cost->sse_load[sse_cost_idx] +
- ix86_cost->sse_store[sse_cost_idx]));
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* ??? This should probably account for the size difference
+ between an SSE and an integer load rather than the full
+ SSE load. */
+ igain -= COSTS_N_BYTES (8);
+ else
+ {
+ int cost = (m * (ix86_cost->int_load[2]
+ + ix86_cost->int_store[2])
+ - (ix86_cost->sse_load[sse_cost_idx]
+ + ix86_cost->sse_store[sse_cost_idx]));
+ igain += COSTS_N_INSNS (cost) / 2;
+ }
+ }
switch (GET_CODE (src))
{
igain += ix86_cost->shift_const - ix86_cost->sse_op;
if (CONST_INT_P (XEXP (src, 0)))
- igain -= vector_const_cost (XEXP (src, 0));
+ igain -= vector_const_cost (XEXP (src, 0), bb);
break;
case ROTATE:
igain += m * ix86_cost->add;
if (CONST_INT_P (XEXP (src, 0)))
- igain -= vector_const_cost (XEXP (src, 0));
+ igain -= vector_const_cost (XEXP (src, 0), bb);
if (CONST_INT_P (XEXP (src, 1)))
- igain -= vector_const_cost (XEXP (src, 1));
+ igain -= vector_const_cost (XEXP (src, 1), bb);
if (MEM_P (XEXP (src, 1)))
{
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
else
- igain += m * ix86_cost->int_load[2]
- - ix86_cost->sse_load[sse_cost_idx];
+ igain += COSTS_N_INSNS
+ (m * ix86_cost->int_load[2]
+ - ix86_cost->sse_load[sse_cost_idx]) / 2;
}
break;
case CONST_INT:
if (REG_P (dst))
{
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
/* xor (2 bytes) vs. xorps (3 bytes). */
if (src == const0_rtx)
/* DImode can be immediate for TARGET_64BIT
and SImode always. */
igain += m * COSTS_N_INSNS (1);
- igain -= vector_const_cost (src);
+ igain -= vector_const_cost (src, bb);
}
}
else if (MEM_P (dst))
{
igain += (m * ix86_cost->int_store[2]
- ix86_cost->sse_store[sse_cost_idx]);
- igain -= vector_const_cost (src);
+ igain -= vector_const_cost (src, bb);
}
break;
if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
{
// movd (4 bytes) replaced with movdqa (4 bytes).
- if (!optimize_insn_for_size_p ())
- igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+ if (!optimize_bb_for_size_p (bb))
+ igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
+ - ix86_cost->xmm_move) / 2;
}
else
{
// pshufd; movd replaced with pshufd.
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain += COSTS_N_BYTES (4);
else
igain += ix86_cost->sse_to_integer;
/* Cost the integer to sse and sse to integer moves. */
if (!optimize_function_for_size_p (cfun))
{
- cost += n_sse_to_integer * ix86_cost->sse_to_integer;
+ cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
- /* ??? integer_to_sse but we only have that in the RA cost table.
- Assume sse_to_integer/integer_to_sse are the same which they
- are at the moment. */
- cost += n_integer_to_sse * ix86_cost->sse_to_integer;
+ cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
}
else if (TARGET_64BIT || smode == SImode)
{
with numerous special cases. */
static int
-timode_immed_const_gain (rtx cst)
+timode_immed_const_gain (rtx cst, basic_block bb)
{
/* movabsq vs. movabsq+vmovq+vunpacklqdq. */
if (CONST_WIDE_INT_P (cst)
&& CONST_WIDE_INT_NUNITS (cst) == 2
&& CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
- return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9)
+ return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
: -COSTS_N_INSNS (2);
/* 2x movabsq ~ vmovdqa. */
return 0;
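To make the classification above concrete, here is a standalone sketch (not part of the patch) of the two cases visible in this hunk, with the byte and insn deltas copied from it; the constants, the helper name and the omission of the function's other special cases are illustrative assumptions only.

#include <cstdint>
#include <cstdio>

#define COSTS_N_INSNS(N) ((N) * 4)  /* assumed stock definition */
#define COSTS_N_BYTES(N) ((N) * 2)  /* assumed stock definition */

/* A 128-bit immediate modeled as its two 64-bit halves.  */
static int
timode_immed_const_gain_sketch (uint64_t lo, uint64_t hi, bool for_size)
{
  /* Equal halves: the scalar code needs a single movabsq, the vector
     code additionally needs the GPR->XMM move and the unpack, so the
     gain is negative (and counted in bytes when optimizing for size).  */
  if (lo == hi)
    return for_size ? -COSTS_N_BYTES (9) : -COSTS_N_INSNS (2);
  /* Fallback mirrored from the hunk: 2x movabsq ~ vmovdqa, no gain.
     The real function has further special cases elided here.  */
  return 0;
}

int
main ()
{
  printf ("%d %d\n",
          timode_immed_const_gain_sketch (0x11, 0x11, true),
          timode_immed_const_gain_sketch (0x11, 0x22, false));
  return 0;
}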
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
HOST_WIDE_INT op1val;
+ basic_block bb = BLOCK_FOR_INSN (insn);
int scost, vcost;
int igain = 0;
switch (GET_CODE (src))
{
case REG:
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
else
igain = COSTS_N_INSNS (1);
break;
case MEM:
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7)
: COSTS_N_INSNS (1);
break;
case CONST_INT:
if (MEM_P (dst)
&& standard_sse_constant_p (src, V1TImode))
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1;
break;
case CONST_WIDE_INT:
/* 2 x mov vs. vmovdqa. */
if (MEM_P (dst))
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3)
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3)
: COSTS_N_INSNS (1);
break;
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
- igain += timode_immed_const_gain (XEXP (src, 1));
+ igain += timode_immed_const_gain (XEXP (src, 1), bb);
break;
case ASHIFT:
case LSHIFTRT:
/* See ix86_expand_v1ti_shift. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
if (op1val == 64 || op1val == 65)
scost = COSTS_N_BYTES (5);
case ASHIFTRT:
/* See ix86_expand_v1ti_ashiftrt. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
if (op1val == 64 || op1val == 127)
scost = COSTS_N_BYTES (7);
case ROTATERT:
/* See ix86_expand_v1ti_rotate. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
scost = COSTS_N_BYTES (13);
if ((op1val & 31) == 0)
{
if (GET_CODE (XEXP (src, 0)) == AND)
/* and;and;or (9 bytes) vs. ptest (5 bytes). */
- igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4)
- : COSTS_N_INSNS (2);
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4)
+ : COSTS_N_INSNS (2);
/* or (3 bytes) vs. ptest (5 bytes). */
- else if (optimize_insn_for_size_p ())
+ else if (optimize_bb_for_size_p (bb))
igain = -COSTS_N_BYTES (2);
}
else if (XEXP (src, 1) == const1_rtx)
/* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
- igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6)
- : -COSTS_N_INSNS (1);
+ igain = optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (6)
+ : -COSTS_N_INSNS (1);
break;
default:
in 128bit, 256bit and 512bit */
4, 4, 6, /* cost of moving XMM,YMM,ZMM register */
4, /* cost of moving SSE register to integer. */
+ 4, /* cost of moving integer register to SSE. */
COSTS_N_BYTES (5), 0, /* Gather load static, per_elt. */
COSTS_N_BYTES (5), 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
+ 5, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
+ 5, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{10, 10, 10, 40, 60}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
16, /* cost of moving SSE register to integer. */
+ 16, /* cost of moving integer register to SSE. */
12, 12, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
{8, 8, 8, 16, 32}, /* cost of unaligned stores. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
throughput 9. Approx 7 uops do not depend on vector size and every load
is 4 uops. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
throughput 5. Approx 7 uops do not depend on vector size and every load
is 5 uops. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* TODO: gather and scatter instructions are currently disabled in
x86-tune.def. In some cases they are however a win, see PR116582
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{8, 8, 8, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
+ 14, /* cost of moving integer register to SSE. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
+ 14, /* cost of moving integer register to SSE. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
+ 20, /* cost of moving integer register to SSE. */
16, 16, /* Gather load static, per_elt. */
16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
+ 20, /* cost of moving integer register to SSE. */
12, 12, /* Gather load static, per_elt. */
12, 12, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
8, 8, /* Gather load static, per_elt. */
8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
8, 8, /* Gather load static, per_elt. */
8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
4, /* cost of moving SSE register to integer. */
+ 4, /* cost of moving integer register to SSE. */
6, 6, /* Gather load static, per_elt. */
6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
- {6, 6, 6}, /* cost of storing integer registers. */
+ {6, 6, 6}, /* cost of storing integer registers. */
{6, 6, 6, 10, 15}, /* cost of loading SSE register
- in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
{6, 6, 6, 10, 15}, /* cost of storing SSE register
- in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
{6, 6, 6, 10, 15}, /* cost of unaligned loads. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
- 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
- 6, /* cost of moving SSE register to integer. */
+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
+ 6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{8, 8, 8, 12, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{8, 8, 8, 12, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2, /* cost of moving SSE register to integer. */
+ 2, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
rec. throughput 6.
So 5 uops statically and one uops per load. */