]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Use x86 GFNI for vectorized constant byte shifts/rotates
authorAndi Kleen <ak@gcc.gnu.org>
Mon, 4 Aug 2025 00:35:39 +0000 (17:35 -0700)
committerAndi Kleen <ak@gcc.gnu.org>
Mon, 25 Aug 2025 07:12:10 +0000 (00:12 -0700)
The GFNI AVX gf2p8affineqb instruction can be used to implement
vectorized byte shifts or rotates. This patch uses them to implement
shift and rotate patterns to allow the vectorizer to use them.
Previously AVX couldn't do rotates (except with XOP) and had to handle
8 bit shifts with a half throughput 16 bit shift.

This is only implemented for constant shifts. In theory it could
be used with a lookup table for variable shifts, but it's unclear
if it's worth it.

The vectorizer cost model could be improved, but seems to work for now.
It doesn't model the true latencies of the instructions. Also it doesn't
account for the memory loading of the mask, assuming that for a loop
it will be loaded outside the loop.

The instructions would also support more complex patterns
(e.g. arbitary bit movement or inversions), so some of the tricks
applied to ternlog could be applied here too to collapse
more code. It's trickier because the input patterns
can be much longer since they can apply to every bit individually. I didn't
attempt any of this.

There's currently no test case for the masked/cond_ variants, they seem
to be difficult to trigger with the vectorizer. Suggestions for a test
case for them welcome.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_vgf2p8affine_shift_matrix):
New function to lookup shift/rotate matrixes for gf2p8affine.
* config/i386/i386-protos.h (ix86_vgf2p8affine_shift_matrix):
Declare new function.
* config/i386/i386.cc (ix86_shift_rotate_cost): Add cost model
for shift/rotate implemented using gf2p8affine.
* config/i386/sse.md (VI1_AVX512_3264): New mode iterator.
(<insn><mode>3): Add GFNI case for shift patterns.
(cond_<insn><mode>3): New pattern.
(<insn><mode>3<mask_name>): Dito.
(<insn>v16qi): New rotate pattern to handle XOP V16QI case
and GFNI.
(rotl<mode>3, rotr<mode>3): Exclude V16QI case.

gcc/testsuite/ChangeLog:

* gcc.target/i386/shift-gf2p8affine-1.c: New test
* gcc.target/i386/shift-gf2p8affine-2.c: New test
* gcc.target/i386/shift-gf2p8affine-3.c: New test
* gcc.target/i386/shift-v16qi-4.c: New test
* gcc.target/i386/shift-gf2p8affine-5.c: New test
* gcc.target/i386/shift-gf2p8affine-6.c: New test
* gcc.target/i386/shift-gf2p8affine-7.c: New test

gcc/config/i386/i386-expand.cc
gcc/config/i386/i386-protos.h
gcc/config/i386/i386.cc
gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/shift-v16qi-4.c [new file with mode: 0644]

index 12cec611c7b5ed13313a14e8174625ab26f1b178..ef6c12cd5697e1098bd843d3ae36526bd7728667 100644 (file)
@@ -27034,6 +27034,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   return target;
 }
 
+/* GF2P8AFFINEQB matrixes to implement shift and rotate.  */
+
+static const uint64_t matrix_ashift[8] =
+{
+  0,
+  0x0001020408102040, /* 1 l */
+  0x0000010204081020, /* 2 l */
+  0x0000000102040810, /* 3 l */
+  0x0000000001020408, /* 4 l */
+  0x0000000000010204, /* 5 l */
+  0x0000000000000102, /* 6 l */
+  0x0000000000000001  /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{
+  0,
+  0x0204081020408000, /* 1 r */
+  0x0408102040800000, /* 2 r */
+  0x0810204080000000, /* 3 r */
+  0x1020408000000000, /* 4 r */
+  0x2040800000000000, /* 5 r */
+  0x4080000000000000, /* 6 r */
+  0x8000000000000000  /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{
+  0,
+  0x0204081020408080, /* 1 r */
+  0x0408102040808080, /* 2 r */
+  0x0810204080808080, /* 3 r */
+  0x1020408080808080, /* 4 r */
+  0x2040808080808080, /* 5 r */
+  0x4080808080808080, /* 6 r */
+  0x8080808080808080  /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{
+  0,
+  0x8001020408102040, /* 1 rol8 */
+  0x4080010204081020, /* 2 rol8 */
+  0x2040800102040810, /* 3 rol8 */
+  0x1020408001020408, /* 4 rol8 */
+  0x0810204080010204, /* 5 rol8 */
+  0x0408102040800102, /* 6 rol8 */
+  0x0204081020408001  /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{
+  0,
+  0x0204081020408001, /* 1 ror8 */
+  0x0408102040800102, /* 2 ror8 */
+  0x0810204080010204, /* 3 ror8 */
+  0x1020408001020408, /* 4 ror8 */
+  0x2040800102040810, /* 5 ror8 */
+  0x4080010204081020, /* 6 ror8 */
+  0x8001020408102040  /* 7 ror8 */
+};
+
+/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
+   for CODE and shift count COUNT into register with vector of size of SRC.  */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+  machine_mode mode = GET_MODE (src);
+  const uint64_t *matrix;
+  unsigned shift = INTVAL (count) & 7;
+  gcc_assert (shift > 0 && shift < 8);
+
+  switch (code)
+    {
+    case ASHIFT:
+      matrix = matrix_ashift;
+      break;
+    case ASHIFTRT:
+      matrix = matrix_ashiftrt;
+      break;
+    case LSHIFTRT:
+      matrix = matrix_lshiftrt;
+      break;
+    case ROTATE:
+      matrix = matrix_rotate;
+      break;
+    case ROTATERT:
+      matrix = matrix_rotatert;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  int nelts = GET_MODE_NUNITS (mode);
+  rtvec vec = rtvec_alloc (nelts);
+  uint64_t ma = matrix[shift];
+  for (int i = 0; i < nelts; i++)
+    RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+  return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
 /* Trunc a vector to a narrow vector, like v4di -> v4si.  */
 
 void
index ee6b78b2c77cdf3f87771de2f74dfe3502445cc0..bdb8bb963b5dcba243c48071aebeadfc6112f4d8 100644 (file)
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
 /* In i386-expand.cc.  */
 bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
                                   HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
index 55c9b16dd388e1f7a29359047276108b6455dc0f..b4b84b9fecba004effb7246d6be6eb53287d2a2c 100644 (file)
@@ -22102,6 +22102,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
            }
          /* FALLTHRU */
        case V32QImode:
+         if (TARGET_GFNI && constant_op1)
+           {
+             /* Use vgf2p8affine. One extra load for the mask, but in a loop
+                with enough registers it will be moved out. So for now don't
+                account the constant mask load. This is not quite right
+                for non loop vectorization.  */
+             extra = 0;
+             return ix86_vec_cost (mode, cost->sse_op) + extra;
+           }
          if (TARGET_AVX2)
            /* Use vpbroadcast.  */
            extra = cost->sse_op;
@@ -22136,6 +22145,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
            count = 9;
          return ix86_vec_cost (mode, cost->sse_op * count) + extra;
 
+       case V64QImode:
+         /* Ignore the mask load for GF2P8AFFINEQB.  */
+         extra = 0;
+         return ix86_vec_cost (mode, cost->sse_op) + extra;
+
        case V2DImode:
        case V4DImode:
          /* V*DImode arithmetic right shift is emulated.  */
index ec74f93731d8c2006bcc4a3d20cfadf270ec3548..951ee54589f35c8fc433069f6dd99c9a1dbae6fb 100644 (file)
 (define_mode_iterator VI1_AVX512VL
   [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
 
+(define_mode_iterator VI1_AVX512_3264
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")])
+
 ;; All vector modes
 (define_mode_iterator V
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
 
 ;; XOP packed rotate instructions
 (define_expand "rotl<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-       (rotate:VI_128
-        (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+       (rotate:VI248_128
+        (match_operand:VI248_128 1 "nonimmediate_operand")
         (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
 })
 
 (define_expand "rotr<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-       (rotatert:VI_128
-        (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+       (rotatert:VI248_128
+        (match_operand:VI248_128 1 "nonimmediate_operand")
         (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
       int i;
 
       if (<CODE> != ASHIFT)
-       {
-         if (CONST_INT_P (operands[2]))
-           operands[2] = GEN_INT (-INTVAL (operands[2]));
-         else
-           negate = true;
-       }
+       {
+            if (CONST_INT_P (operands[2]))
+              operands[2] = GEN_INT (-INTVAL (operands[2]));
+            else
+              negate = true;
+          }
       par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
       tmp = lowpart_subreg (QImode, operands[2], SImode);
       for (i = 0; i < 16; i++)
-       XVECEXP (par, 0, i) = tmp;
+        XVECEXP (par, 0, i) = tmp;
 
       tmp = gen_reg_rtx (V16QImode);
       emit_insn (gen_vec_initv16qiqi (tmp, par));
 
       if (negate)
-       emit_insn (gen_negv16qi2 (tmp, tmp));
+        emit_insn (gen_negv16qi2 (tmp, tmp));
 
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2])
+           && (<MODE_SIZE> == 64
+               || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+                                                  <CODE>);
+      emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+                                           GEN_INT (0)));
+    }
   else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
 
+; not generated by vectorizer?
+(define_expand "cond_<insn><mode>"
+  [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+       (vec_merge:VI1_AVX512VL
+         (any_shift:VI1_AVX512VL
+           (match_operand:VI1_AVX512VL 2 "register_operand")
+           (match_operand:VI1_AVX512VL 3 "nonimmediate_or_const_vec_dup_operand"))
+         (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+       (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_GFNI && TARGET_AVX512F"
+{
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[1], matrix,
+               GEN_INT (0), operands[4], operands[1]));
+  DONE;
+})
+
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+       (any_rotate:VI1_AVX512_3264
+         (match_operand:VI1_AVX512_3264 1 "general_operand")
+         (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_GFNI"
+{
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+             GEN_INT (0)));
+  DONE;
+})
+
+(define_expand "<insn>v16qi3"
+  [(set (match_operand:V16QI 0 "register_operand")
+     (any_rotate:V16QI
+       (match_operand:V16QI 1 "nonimmediate_operand")
+       (match_operand:SI 2 "general_operand")))]
+  "TARGET_GFNI || TARGET_XOP"
+{
+  /* Handle the V16QI XOP case to avoid a conflict with the other expand.  */
+  if (TARGET_XOP)
+    {
+      if (! const_0_to_7_operand (operands[2], SImode))
+        {
+          rtvec vs = rtvec_alloc (16);
+          rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+          rtx reg = gen_reg_rtx (V16QImode);
+          rtx op2 = operands[2];
+          int i;
+
+          if (GET_MODE (op2) != QImode)
+            {
+              op2 = gen_reg_rtx (QImode);
+              convert_move (op2, operands[2], false);
+            }
+
+          for (i = 0; i < 16; i++)
+            RTVEC_ELT (vs, i) = op2;
+
+          emit_insn (gen_vec_initv16qiqi (reg, par));
+          if (<CODE> == ROTATERT)
+            {
+              rtx neg = gen_reg_rtx (V16QImode);
+              emit_insn (gen_negv16qi2 (neg, reg));
+              emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg));
+              reg = neg;
+            }
+          emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+          DONE;
+       }
+    }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], operands[1], matrix,
+                 GEN_INT (0)));
+      DONE;
+    }
+  else
+    FAIL;
+})
+
 (define_expand "ashrv2di3"
   [(set (match_operand:V2DI 0 "register_operand")
        (ashiftrt:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
new file mode 100644 (file)
index 0000000..e5be3a3
--- /dev/null
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 14 } } */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+ubyteshiftr (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteshiftl_mask (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    if (a[i] & 1)
+      a[i] <<= N;
+}
+
+void
+sbyteshiftl (signed char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+sbyteshiftr (signed char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteror (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] << N | a[i] >> (8 - N);
+}
+
+void
+ubyterol (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] >> N | a[i] << (8 - N);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
new file mode 100644 (file)
index 0000000..098361a
--- /dev/null
@@ -0,0 +1,196 @@
+/* { dg-do run } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3 -Wno-shift-count-negative" } */
+
+#include <string.h>
+
+#ifndef N1
+#define N1 5
+#endif
+
+#ifndef N2
+#define N2 3
+#endif
+
+#ifndef N3
+#define N3 1
+#endif
+
+#ifndef N4
+#define N4 7
+#endif
+
+#ifndef N5
+#define N5 -3
+#endif
+
+#ifndef FILLER
+#define FILLER 0xab
+#endif
+
+#define FUNC(N)                                                        \
+  void ubyteshiftl##N(unsigned char *a, int len)               \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] <<= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteshiftr##N(unsigned char *a, int len)               \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] >>= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteshiftl_mask##N(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      if (a[i] & 1)                                            \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void sbyteshiftl##N(signed char *a, int len)                 \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] <<= N;                                              \
+  }                                                            \
+                                                               \
+  void sbyteshiftr##N(signed char *a, int len)                 \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] >>= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteror##N(unsigned char *a, int len)                  \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] = a[i] << N | a[i] >> (8-N);                                \
+  }                                                            \
+                                                               \
+  void ubyterol##N(unsigned char *a, int len)                  \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] = a[i] >> N | a[i] << (8-N);                                \
+  }                                                            \
+  void ubyteshiftl##N##ref(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteshiftr##N##ref(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] >>= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteshiftl_mask##N##ref(unsigned char *a, int len)     \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       if (a[i] & 1)                                           \
+         a[i] <<= N;                                           \
+  }                                                            \
+                                                               \
+  void sbyteshiftl##N##ref(signed char *a, int len)            \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void sbyteshiftr##N##ref(signed char *a, int len)            \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] >>= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteror##N##ref(unsigned char *a, int len)             \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] = a[i] << N | a[i] >> (8-N);                       \
+  }                                                            \
+                                                               \
+  void ubyterol##N##ref(unsigned char *a, int len)             \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] = a[i] >> N | a[i] << (8-N);                       \
+  }
+
+FUNC (N1)
+FUNC (N2)
+FUNC (N3)
+FUNC (N4)
+FUNC (N5)
+
+#define TEST(N, func)                                  \
+  memset (array, filler, len);                         \
+  func##N (array, len);                                        \
+  memset (array2, filler, len);                                \
+  func##N##ref (array2, len);                          \
+  if (memcmp (array, array2, len)) __builtin_abort ()
+
+int main ()
+{
+  __builtin_cpu_init ();
+  if (!__builtin_cpu_supports ("gfni"))
+    return 0;
+
+  const unsigned long len = 256;
+  char array[len], array2[len];
+  unsigned char filler = FILLER;
+
+  TEST (N1, ubyteshiftl);
+  TEST (N1, ubyteshiftl_mask);
+  TEST (N1, sbyteshiftl);
+  TEST (N1, sbyteshiftr);
+  TEST (N1, ubyteror);
+  TEST (N1, ubyterol);
+
+  TEST (N2, ubyteshiftl);
+  TEST (N2, ubyteshiftl_mask);
+  TEST (N2, sbyteshiftl);
+  TEST (N2, sbyteshiftr);
+  TEST (N2, ubyteror);
+  TEST (N2, ubyterol);
+
+  TEST (N3, ubyteshiftl);
+  TEST (N3, ubyteshiftl_mask);
+  TEST (N3, sbyteshiftl);
+  TEST (N3, sbyteshiftr);
+  TEST (N3, ubyteror);
+  TEST (N3, ubyterol);
+
+  TEST (N4, ubyteshiftl);
+  TEST (N4, ubyteshiftl_mask);
+  TEST (N4, sbyteshiftl);
+  TEST (N4, sbyteshiftr);
+  TEST (N4, ubyteror);
+  TEST (N4, ubyterol);
+
+  TEST (N5, ubyteshiftl);
+  TEST (N5, ubyteshiftl_mask);
+  TEST (N5, sbyteshiftl);
+  TEST (N5, sbyteshiftr);
+  TEST (N5, ubyteror);
+  TEST (N5, ubyterol);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c
new file mode 100644 (file)
index 0000000..9e5ae5d
--- /dev/null
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 12 } } */
+
+/* Based on a test case from Andrew Pinski */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+ubyteshiftr (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+void
+sbyteshiftl (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+sbyteshiftr (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+static inline unsigned char rol8(unsigned char v, int c)
+{
+  return (v >> c) | (v << (8-c));
+}
+
+static inline unsigned char ror8(unsigned char v, int c)
+{
+  return (v << c) | (v >> (8-c));
+}
+
+void
+ubyterol (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? rol8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? rol8(a[i] ^ b[i], N) : a[i];
+    }
+}
+
+void
+ubyteror (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? ror8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? ror8(a[i] ^ b[i], N) : a[i];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c
new file mode 100644 (file)
index 0000000..65fb692
--- /dev/null
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 31 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c
new file mode 100644 (file)
index 0000000..3391deb
--- /dev/null
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 0 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c
new file mode 100644 (file)
index 0000000..37ba0c8
--- /dev/null
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 53 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-v16qi-4.c b/gcc/testsuite/gcc.target/i386/shift-v16qi-4.c
new file mode 100644 (file)
index 0000000..edc2b21
--- /dev/null
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 1 } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi
+foo (v16qi a)
+{
+  return a >> 7;
+}