i386: Use 2x-wider modes when emulating QImode vector instructions

author Uros Bizjak <ubizjak@gmail.com>

Thu, 25 May 2023 17:40:26 +0000 (19:40 +0200)

committer Uros Bizjak <ubizjak@gmail.com>

Thu, 25 May 2023 17:41:09 +0000 (19:41 +0200)
author Uros Bizjak <ubizjak@gmail.com>
Thu, 25 May 2023 17:40:26 +0000 (19:40 +0200)
committer Uros Bizjak <ubizjak@gmail.com>
Thu, 25 May 2023 17:41:09 +0000 (19:41 +0200)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc

index 5a57be82e98af8a0702b58f98bd914fa37f3c7c7..7302f2780c29682a938cb1017eb443d9caf208f8 100644 (file)
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23106,68 +23106,6 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
    gcc_assert (ok);
  }
  
-/* This function is similar as ix86_expand_vecop_qihi,
-   but optimized under AVX512BW by using vpmovwb.
-   For example, optimize vector MUL generation like
-
-   vpmovzxbw ymm2, xmm0
-   vpmovzxbw ymm3, xmm1
-   vpmullw   ymm4, ymm2, ymm3
-   vpmovwb   xmm0, ymm4
-
-   it would take less instructions than ix86_expand_vecop_qihi.
-   Return true if success.  */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
-  machine_mode himode, qimode = GET_MODE (dest);
-  rtx hop1, hop2, hdest;
-  rtx (*gen_truncate)(rtx, rtx);
-  bool uns_p = (code == ASHIFTRT) ? false : true;
-
-  /* There are no V64HImode instructions.  */
-  if (qimode == V64QImode)
-    return false;
-
-  /* vpmovwb only available under AVX512BW.  */
-  if (!TARGET_AVX512BW)
-    return false;
-
-  if (qimode == V16QImode && !TARGET_AVX512VL)
-    return false;
-
-  /* Do not generate ymm/zmm instructions when
-     target prefers 128/256 bit vector width.  */
-  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
-      || (qimode == V32QImode && TARGET_PREFER_AVX256))
-    return false;
-
-  switch (qimode)
-    {
-    case E_V16QImode:
-      himode = V16HImode;
-      gen_truncate = gen_truncv16hiv16qi2;
-      break;
-    case E_V32QImode:
-      himode = V32HImode;
-      gen_truncate = gen_truncv32hiv32qi2;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-
-  hop1 = gen_reg_rtx (himode);
-  hop2 = gen_reg_rtx (himode);
-  hdest = gen_reg_rtx (himode);
-  emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
-  emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
-  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
-                                                     hop1, hop2)));
-  emit_insn (gen_truncate (dest, hdest));
-  return true;
-}
-
  /* Expand a vector operation shift by constant for a V*QImode in terms of the
     same operation on V*HImode. Return true if success. */
  static bool
@@ -23272,9 +23210,9 @@ void
  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
  {
    machine_mode qimode = GET_MODE (dest);
-  rtx qop1, qop2, hop1, hop2, qdest, hres;
+  rtx qop1, qop2, hop1, hop2, qdest, hdest;
    bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
  
    switch (qimode)
      {
@@ -23306,24 +23244,25 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
      {
      case MULT:
        gcc_assert (op2vec);
-      /* Unpack data such that we've got a source byte in each low byte of
-        each word.  We don't care what goes into the high byte of each word.
-        Rather than trying to get zero in there, most convenient is to let
-        it be a copy of the low byte.  */
-      hop1 = copy_to_reg (qop1);
-      hop2 = copy_to_reg (qop2);
-      emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
-      emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
-      break;
-
-    case ASHIFTRT:
-      uns_p = false;
+      if (!TARGET_SSE4_1)
+       {
+         /* Unpack data such that we've got a source byte in each low byte
+            of each word.  We don't care what goes into the high byte of
+            each word.  Rather than trying to get zero in there, most
+            convenient is to let it be a copy of the low byte.  */
+         hop1 = copy_to_reg (qop1);
+         hop2 = copy_to_reg (qop2);
+         emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+         emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+         break;
+       }
        /* FALLTHRU */
      case ASHIFT:
+    case ASHIFTRT:
      case LSHIFTRT:
        hop1 = gen_reg_rtx (V8HImode);
        ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
-      /* vashr/vlshr/vashl  */
+      /* mult/vashr/vlshr/vashl  */
        if (op2vec)
         {
           hop2 = gen_reg_rtx (V8HImode);
@@ -23340,14 +23279,14 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    if (code != MULT && op2vec)
      {
        /* Expand vashr/vlshr/vashl.  */
-      hres = gen_reg_rtx (V8HImode);
-      emit_insn (gen_rtx_SET (hres,
+      hdest = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hdest,
                               simplify_gen_binary (code, V8HImode,
                                                    hop1, hop2)));
      }
    else
      /* Expand mult/ashr/lshr/ashl.  */
-    hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
                                 NULL_RTX, 1, OPTAB_DIRECT);
  
    if (TARGET_AVX512BW && TARGET_AVX512VL)
@@ -23357,19 +23296,18 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        else
         qdest = gen_reg_rtx (V8QImode);
  
-      emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
      }
    else
      {
        struct expand_vec_perm_d d;
-      rtx qres = gen_lowpart (V16QImode, hres);
+      rtx qres = gen_lowpart (V16QImode, hdest);
        bool ok;
        int i;
  
        /* Merge the data back into the right place.  */
        d.target = qdest;
-      d.op0 = qres;
-      d.op1 = qres;
+      d.op0 = d.op1 = qres;
        d.vmode = V16QImode;
        d.nelt = 16;
        d.one_operand_p = false;
@@ -23386,6 +23324,116 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
  }
  
+/* Emit instruction in 2x wider mode.  For example, optimize
+   vector MUL generation like
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   it would take less instructions than ix86_expand_vecop_qihi.
+   Return true if success.  */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  machine_mode wqimode;
+  rtx qop1, qop2, hop1, hop2, hdest;
+  rtx (*gen_truncate)(rtx, rtx) = NULL;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  if ((qimode == V16QImode && !TARGET_AVX2)
+      || (qimode == V32QImode && !TARGET_AVX512BW)
+      /* There are no V64HImode instructions.  */
+      || qimode == V64QImode)
+     return false;
+
+  /* Do not generate ymm/zmm instructions when
+     target prefers 128/256 bit vector width.  */
+  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+      || (qimode == V32QImode && TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V16QImode:
+      himode = V16HImode;
+      if (TARGET_AVX512VL)
+       gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  hop1 = gen_reg_rtx (himode);
+  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+  if (op2vec)
+    {
+      hop2 = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+    }
+  else
+    hop2 = qop2;
+
+    if (code != MULT && op2vec)
+      {
+       /* Expand vashr/vlshr/vashl.  */
+       hdest = gen_reg_rtx (himode);
+       emit_insn (gen_rtx_SET (hdest,
+                               simplify_gen_binary (code, himode,
+                                                    hop1, hop2)));
+      }
+    else
+      /* Expand mult/ashr/lshr/ashl.  */
+      hdest = expand_simple_binop (himode, code, hop1, hop2,
+                                  NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (gen_truncate)
+    emit_insn (gen_truncate (dest, hdest));
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx wqdest = gen_reg_rtx (wqimode);
+      rtx wqres = gen_lowpart (wqimode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place.  */
+      d.target = wqdest;
+      d.op0 = d.op1 = wqres;
+      d.vmode = wqimode;
+      d.nelt = GET_MODE_NUNITS (wqimode);
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+       d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+
+      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+    }
+
+  return true;
+}
+
  /* Expand a vector operation CODE for a V*QImode in terms of the
     same operation on V*HImode.  */
  
@@ -23400,7 +23448,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
    struct expand_vec_perm_d d;
    bool full_interleave = true;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
    bool ok;
    int i;
  
@@ -23409,9 +23457,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
      return;
  
-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (0 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
      return;
  
    switch (qimode)
@@ -23468,10 +23514,8 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        emit_insn (gen_ih (op1_h, op1, op1));
        break;
  
-    case ASHIFTRT:
-      uns_p = false;
-      /* FALLTHRU */
      case ASHIFT:
+    case ASHIFTRT:
      case LSHIFTRT:
        op1_l = gen_reg_rtx (himode);
        op1_h = gen_reg_rtx (himode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index 202abf0b39c169c67cb5589fad1cd399dfeb7c2a..e548d6588289ba01cd00adc4ee9ff61f75092ba7 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20497,75 +20497,109 @@ ix86_multiplication_cost (const struct processor_costs *cost,
      return  ix86_vec_cost (mode,
                            inner_mode == DFmode ? cost->mulsd : cost->mulss);
    else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-    switch (mode)
-      {
-      case V4QImode:
-      case V8QImode:
-       /* Partial V*QImode is emulated with 4-6 insns.  */
-       if (TARGET_AVX512BW && TARGET_AVX512VL)
-         return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-       else if (TARGET_AVX2)
-         return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 5);
-       else if (TARGET_XOP)
-         return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3)
-                 + cost->sse_load[2]);
-       else
-         return (ix86_vec_cost (mode, cost->mulss + cost->sse_op * 4)
-                 + cost->sse_load[2]);
-
-      case V16QImode:
-       /* V*QImode is emulated with 4-11 insns.  */
-       if (TARGET_AVX512BW && TARGET_AVX512VL)
-         return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-       else if (TARGET_AVX2)
-         return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 8);
-       else if (TARGET_XOP)
-         return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5)
-                 + cost->sse_load[2]);
-       else
-         return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
-                 + cost->sse_load[2]);
+    {
+      int nmults, nops;
+      /* Cost of reading the memory.  */
+      int extra;
  
-      case V32QImode:
-       if (TARGET_AVX512BW)
-         return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
-       else
-         return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 7)
-                 + cost->sse_load[3] * 2);
-
-      case V64QImode:
-       return (ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 9)
-               + cost->sse_load[3] * 2
-               + cost->sse_load[4] * 2);
-
-      case V4SImode:
-       /* pmulld is used in this case. No emulation is needed.  */
-       if (TARGET_SSE4_1)
-         goto do_native;
-       /* V4SImode is emulated with 7 insns.  */
-       else
-         return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
-
-      case V2DImode:
-      case V4DImode:
-       /* vpmullq is used in this case. No emulation is needed.  */
-       if (TARGET_AVX512DQ && TARGET_AVX512VL)
-         goto do_native;
-       /* V*DImode is emulated with 6-8 insns.  */
-       else if (TARGET_XOP && mode == V2DImode)
-         return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
-       /* FALLTHRU */
-      case V8DImode:
-       /* vpmullq is used in this case. No emulation is needed.  */
-       if (TARGET_AVX512DQ && mode == V8DImode)
-         goto do_native;
-       else
-         return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+      switch (mode)
+       {
+       case V4QImode:
+       case V8QImode:
+         /* Partial V*QImode is emulated with 4-6 insns.  */
+         nmults = 1;
+         nops = 3;
+         extra = 0;
  
-      default:
-      do_native:
-       return ix86_vec_cost (mode, cost->mulss);
-      }
+         if (TARGET_AVX512BW && TARGET_AVX512VL)
+           ;
+         else if (TARGET_AVX2)
+           nops += 2;
+         else if (TARGET_XOP)
+           extra += cost->sse_load[2];
+         else
+           {
+             nops += 1;
+             extra += cost->sse_load[2];
+           }
+         goto do_qimode;
+
+       case V16QImode:
+         /* V*QImode is emulated with 4-11 insns.  */
+         nmults = 1;
+         nops = 3;
+         extra = 0;
+
+         if (TARGET_AVX2 && !TARGET_PREFER_AVX128)
+           {
+             if (!(TARGET_AVX512BW && TARGET_AVX512VL))
+               nops += 3;
+           }
+         else if (TARGET_XOP)
+           {
+             nmults += 1;
+             nops += 2;
+             extra += cost->sse_load[2];
+           }
+         else
+           {
+             nmults += 1;
+             nops += 4;
+             extra += cost->sse_load[2];
+           }
+         goto do_qimode;
+
+       case V32QImode:
+         nmults = 1;
+         nops = 3;
+         extra = 0;
+
+         if (!TARGET_AVX512BW || TARGET_PREFER_AVX256)
+           {
+             nmults += 1;
+             nops += 4;
+             extra += cost->sse_load[3] * 2;
+           }
+         goto do_qimode;
+
+       case V64QImode:
+         nmults = 2;
+         nops = 9;
+         extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+
+       do_qimode:
+         return ix86_vec_cost (mode, cost->mulss * nmults
+                               + cost->sse_op * nops) + extra;
+
+       case V4SImode:
+         /* pmulld is used in this case. No emulation is needed.  */
+         if (TARGET_SSE4_1)
+           goto do_native;
+         /* V4SImode is emulated with 7 insns.  */
+         else
+           return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
+
+       case V2DImode:
+       case V4DImode:
+         /* vpmullq is used in this case. No emulation is needed.  */
+         if (TARGET_AVX512DQ && TARGET_AVX512VL)
+           goto do_native;
+         /* V*DImode is emulated with 6-8 insns.  */
+         else if (TARGET_XOP && mode == V2DImode)
+           return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
+         /* FALLTHRU */
+       case V8DImode:
+         /* vpmullq is used in this case. No emulation is needed.  */
+         if (TARGET_AVX512DQ && mode == V8DImode)
+           goto do_native;
+         else
+           return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+
+       default:
+       do_native:
+         return ix86_vec_cost (mode, cost->mulss);
+       }
+    }
    else
      return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
  }
@@ -20637,16 +20671,13 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
                 count = 2;
             }
           else if (TARGET_AVX512BW && TARGET_AVX512VL)
-           {
-             count = 3;
-             return ix86_vec_cost (mode, cost->sse_op * count);
-           }
+           return ix86_vec_cost (mode, cost->sse_op * 4);
           else if (TARGET_SSE4_1)
-           count = 4;
-         else if (code == ASHIFTRT)
             count = 5;
+         else if (code == ASHIFTRT)
+           count = 6;
           else
-           count = 4;
+           count = 5;
           return ix86_vec_cost (mode, cost->sse_op * count) + extra;
  
         case V16QImode:
@@ -20663,7 +20694,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
                 }
               else
                 {
-                 count = (code == ASHIFT) ? 2 : 3;
+                 count = (code == ASHIFT) ? 3 : 4;
                   return ix86_vec_cost (mode, cost->sse_op * count);
                 }
             }
@@ -20685,12 +20716,20 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
               else
                 count = 2;
             }
+         else if (TARGET_AVX512BW
+                  && ((mode == V32QImode && !TARGET_PREFER_AVX256)
+                      || (mode == V16QImode && TARGET_AVX512VL
+                          && !TARGET_PREFER_AVX128)))
+           return ix86_vec_cost (mode, cost->sse_op * 4);
+         else if (TARGET_AVX2
+                  && mode == V16QImode && !TARGET_PREFER_AVX128)
+           count = 6;
           else if (TARGET_SSE4_1)
-           count = 8;
-         else if (code == ASHIFTRT)
             count = 9;
+         else if (code == ASHIFTRT)
+           count = 10;
           else
-           count = 8;
+           count = 9;
           return ix86_vec_cost (mode, cost->sse_op * count) + extra;
  
         case V2DImode:
@@ -20704,6 +20743,8 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
                     count = TARGET_SSE4_2 ? 1 : 2;
                   else if (TARGET_XOP)
                     count = 2;
+                 else if (TARGET_SSE4_1)
+                   count = 3;
                   else
                     count = 4;
                 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c

index 5e9f4f2805c9ab5c87f13b4ad488b6afc9e3f68b..dc684a167c8ebebe399aebf5ef9885d7fcd571f0 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -1,8 +1,7 @@
  /* PR target/pr95488  */
  /* { dg-do compile } */
  /* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
-/* { dg-final { scan-assembler-times "vpmovzxbw" 4 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpunpcklbw" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 { target { ! ia32 } } } } */
  /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
  /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 { target { ! ia32 } } } } */
  /* { dg-final { scan-assembler-times "vpmovwb" 4 { target { ! ia32 } } } } */
author	Uros Bizjak <ubizjak@gmail.com>
	Thu, 25 May 2023 17:40:26 +0000 (19:40 +0200)
committer	Uros Bizjak <ubizjak@gmail.com>
	Thu, 25 May 2023 17:41:09 +0000 (19:41 +0200)
gcc/config/i386/i386-expand.cc		patch \| blob \| blame \| history
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c		patch \| blob \| blame \| history