git.ipfire.org Git - thirdparty/gcc.git/commitdiff
i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions
author Haochen Jiang <haochen.jiang@intel.com>
Mon, 2 Sep 2024 02:24:29 +0000 (10:24 +0800)
committer Haochen Jiang <haochen.jiang@intel.com>
Mon, 2 Sep 2024 02:24:29 +0000 (10:24 +0800)
gcc/ChangeLog:

* config/i386/sse.md (VI1_AVX512VNNIBW): New.
(VI2_AVX10_2): Ditto.
(sdot_prod<mode>): Add AVX10.2 support for auto-vectorization and
fold in the 512-bit variant by switching to VI1_AVX512VNNIBW.
(udot_prod<mode>): Ditto.
(sdot_prodv64qi): Removed.
(udot_prodv64qi): Ditto.
(usdot_prod<mode>): Add AVX10.2 to auto vectorize; use VI2_AVX10_2.
(udot_prod<mode>): Ditto, for the VI2_AVX10_2 (HImode vector) expander.

gcc/testsuite/ChangeLog:

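* gcc.target/i386/vnniint16-auto-vectorize-2.c: Only define
AVXVNNIINT16 when AVX10_2 is not defined, only define N when not
already defined, and include CHECK and the -1.c test before the
fallback definition of TEST.
* gcc.target/i386/vnniint8-auto-vectorize-2.c: Ditto, for AVXVNNIINT8.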
* gcc.target/i386/vnniint16-auto-vectorize-3.c: New test.
* gcc.target/i386/vnniint16-auto-vectorize-4.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-3.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-4.c: Ditto.

gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-2.c
gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-2.c
gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c [new file with mode: 0644]

index da91d39cf8ebc449c46a72d1f642e082246cc5a5..442ac93afa2bf008e8812ff8fc82c547f30e5c80 100644 (file)
 (define_mode_iterator VI1_AVX512VNNI
   [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
+(define_mode_iterator VI1_AVX512VNNIBW
+  [(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
+   (V32QI "TARGET_AVX2") V16QI])
+
 (define_mode_iterator VI12_256_512_AVX512VL
   [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
    (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")])
   [(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
    (V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI2_AVX10_2
+  [(V32HI "TARGET_AVX10_2_512") V16HI V8HI])
+
 (define_mode_iterator VI4_AVX
   [(V8SI "TARGET_AVX") V4SI])
 
 
 (define_expand "sdot_prod<mode>"
   [(match_operand:<ssedvecmode> 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
    (match_operand:<ssedvecmode> 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
+      || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
     {
       operands[1] = lowpart_subreg (<ssedvecmode>mode,
                                    force_reg (<MODE>mode, operands[1]),
   DONE;
 })
 
-(define_expand "sdot_prodv64qi"
-  [(match_operand:V16SI 0 "register_operand")
-   (match_operand:V64QI 1 "register_operand")
-   (match_operand:V64QI 2 "register_operand")
-   (match_operand:V16SI 3 "register_operand")]
-  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
-{
-  /* Emulate with vpdpwssd.  */
-  rtx op1_lo = gen_reg_rtx (V32HImode);
-  rtx op1_hi = gen_reg_rtx (V32HImode);
-  rtx op2_lo = gen_reg_rtx (V32HImode);
-  rtx op2_hi = gen_reg_rtx (V32HImode);
-
-  emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1]));
-  emit_insn (gen_vec_unpacks_lo_v64qi (op2_lo, operands[2]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op1_hi, operands[1]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op2_hi, operands[2]));
-
-  rtx res1 = gen_reg_rtx (V16SImode);
-  rtx res2 = gen_reg_rtx (V16SImode);
-  rtx sum = gen_reg_rtx (V16SImode);
-
-  emit_move_insn (sum, CONST0_RTX (V16SImode));
-  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
-  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
-
-  emit_insn (gen_addv16si3 (operands[0], res1, res2));
-  DONE;
-})
-
 (define_expand "udot_prod<mode>"
   [(match_operand:<ssedvecmode> 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
    (match_operand:<ssedvecmode> 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
+      || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
     {
       operands[1] = lowpart_subreg (<ssedvecmode>mode,
                                    force_reg (<MODE>mode, operands[1]),
   DONE;
 })
 
-(define_expand "udot_prodv64qi"
-  [(match_operand:V16SI 0 "register_operand")
-   (match_operand:V64QI 1 "register_operand")
-   (match_operand:V64QI 2 "register_operand")
-   (match_operand:V16SI 3 "register_operand")]
-  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
-{
-  /* Emulate with vpdpwssd.  */
-  rtx op1_lo = gen_reg_rtx (V32HImode);
-  rtx op1_hi = gen_reg_rtx (V32HImode);
-  rtx op2_lo = gen_reg_rtx (V32HImode);
-  rtx op2_hi = gen_reg_rtx (V32HImode);
-
-  emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1]));
-  emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2]));
-  emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1]));
-  emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2]));
-
-  rtx res1 = gen_reg_rtx (V16SImode);
-  rtx res2 = gen_reg_rtx (V16SImode);
-  rtx sum = gen_reg_rtx (V16SImode);
-
-  emit_move_insn (sum, CONST0_RTX (V16SImode));
-  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
-  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
-
-  emit_insn (gen_addv16si3 (operands[0], res1, res2));
-  DONE;
-})
-
 (define_insn "vpdp<vpdotprodtype>_<mode>"
   [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
        (unspec:VI4_AVX
 
 (define_expand "usdot_prod<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")
-   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:VI2_AVX10_2 1 "register_operand")
+   (match_operand:VI2_AVX10_2 2 "register_operand")
    (match_operand:<sseunpackmode> 3 "register_operand")]
-  "TARGET_AVXVNNIINT16"
+  "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
 {
   operands[1] = lowpart_subreg (<sseunpackmode>mode,
                                 force_reg (<MODE>mode, operands[1]),
 
 (define_expand "udot_prod<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")
-   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:VI2_AVX10_2 1 "register_operand")
+   (match_operand:VI2_AVX10_2 2 "register_operand")
    (match_operand:<sseunpackmode> 3 "register_operand")]
-  "TARGET_AVXVNNIINT16"
+  "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
 {
   operands[1] = lowpart_subreg (<sseunpackmode>mode,
                                 force_reg (<MODE>mode, operands[1]),
index 90dc0eade7ef2b82c4cd5a17ddcd0869688af1a8..1bd1dfbd3a3fef33f12eb52f2af41513ef64a49c 100644 (file)
@@ -2,19 +2,24 @@
 /* { dg-options "-O2 -mavxvnniint16" } */
 /* { dg-require-effective-target avxvnniint16 } */
 
+#ifndef AVX10_2
 #define AVXVNNIINT16
+#endif
+
 #ifndef CHECK
 #define CHECK "avx-check.h"
 #endif
 
+#include CHECK
+#include "vnniint16-auto-vectorize-1.c"
+
 #ifndef TEST
 #define TEST avx_test
 #endif
 
-#include CHECK
-#include "vnniint16-auto-vectorize-1.c"
-
+#ifndef N
 #define N 256
+#endif
 
 short a_i16[N];
 unsigned short b_u16[N], c_u16[N], d_u16[N];
diff --git a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c
new file mode 100644 (file)
index 0000000..85dd80e
--- /dev/null
@@ -0,0 +1,6 @@
+/* { dg-do compile } */                                     
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler "vpdpwusd\t" } } */
+/* { dg-final { scan-assembler "vpdpwuud\t" } } */
+
+#include "vnniint16-auto-vectorize-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c
new file mode 100644 (file)
index 0000000..beaab18
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx10.2-512" } */
+/* { dg-require-effective-target avx10_2_512 } */
+
+#define N 512
+
+#define AVX10_2
+#define AVX10_2_512
+#define AVX10_512BIT
+#define AVX512F_LEN 512
+
+#define TEST test_512
+
+#ifndef CHECK
+#define CHECK "avx10-check.h"
+#endif
+
+#include "vnniint16-auto-vectorize-2.c"
index 99853e6c3b7604056f3060f47fddb68f84432fe0..5a791f0f59e52950211a302362ab329c4b0cb849 100644 (file)
@@ -2,19 +2,25 @@
 /* { dg-options "-O2 -mavxvnniint8" } */
 /* { dg-require-effective-target avxvnniint8 } */
 
+#ifndef AVX10_2
 #define AVXVNNIINT8
+#endif
+
 #ifndef CHECK
 #define CHECK "avx-check.h"
 #endif
 
+#include CHECK
+#include "vnniint8-auto-vectorize-1.c"
+
 #ifndef TEST
 #define TEST avx_test
 #endif
 
-#include CHECK
-#include "vnniint8-auto-vectorize-1.c"
-
+#ifndef N
 #define N 256
+#endif
+
 char a_i8[N], b_i8[N];
 unsigned char c_u8[N], d_u8[N];
 int i8_exp, i8_ref;
diff --git a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c
new file mode 100644 (file)
index 0000000..bbb49e8
--- /dev/null
@@ -0,0 +1,6 @@
+/* { dg-do compile } */                                     
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler "vpdpbssd\t" } } */
+/* { dg-final { scan-assembler "vpdpbuud\t" } } */
+
+#include "vnniint8-auto-vectorize-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c
new file mode 100644 (file)
index 0000000..70cd80c
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx10.2-512" } */
+/* { dg-require-effective-target avx10_2_512 } */
+
+#define N 512
+
+#define AVX10_2
+#define AVX10_2_512
+#define AVX10_512BIT
+#define AVX512F_LEN 512
+
+#define TEST test_512
+
+#ifndef CHECK
+#define CHECK "avx10-check.h"
+#endif
+
+#include "vnniint8-auto-vectorize-2.c"