]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
vect: Use sdot for a fallback implementation of usdot
authorRichard Sandiford <richard.sandiford@arm.com>
Tue, 5 Jul 2022 07:53:10 +0000 (08:53 +0100)
committerRichard Sandiford <richard.sandiford@arm.com>
Tue, 5 Jul 2022 07:53:10 +0000 (08:53 +0100)
Following a suggestion from Tamar, this patch adds a fallback
implementation of usdot using sdot.  Specifically, for 8-bit
input types:

   acc_2 = DOT_PROD_EXPR <a_unsigned, b_signed, acc_1>;

becomes:

   tmp_1 = DOT_PROD_EXPR <64, b_signed, acc_1>;
   tmp_2 = DOT_PROD_EXPR <64, b_signed, tmp_1>;
   acc_2 = DOT_PROD_EXPR <a_unsigned - 128, b_signed, tmp_2>;

on the basis that (x-128)*y + 64*y + 64*y.  Doing the two 64*y
operations first should give more time for x to be calculated,
on the off chance that that's useful.

gcc/
* tree-vect-patterns.cc (vect_convert_input): Expect the input
type to be signed for optab_vector_mixed_sign.  Update the vectype
at the same time as type.
(vect_recog_dot_prod_pattern): Update accordingly.  If usdot isn't
available, try sdot instead.
* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): New function.
(vect_model_reduction_cost): Model the cost of implementing usdot
using sdot.
(vectorizable_reduction): Likewise.  Skip target support test
for lane reductions.
(vect_emulate_mixed_dot_prod): New function.
(vect_transform_reduction): Use it to emulate usdot via sdot.

gcc/testsuite/
* gcc.dg/vect/vect-reduc-dot-9.c: Reduce target requirements
from i8mm to dotprod.
* gcc.dg/vect/vect-reduc-dot-10.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-11.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-12.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-13.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-14.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-15.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-16.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-17.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-18.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-19.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-20.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-21.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-22.c: Likewise.

16 files changed:
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
gcc/tree-vect-loop.cc
gcc/tree-vect-patterns.cc

index 7ce86965ea97d37c43d96b4d2271df667dcb2aae..34e25ab7fb0ecf6e7a76403ee44d6a9ef0b73ef7 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 0f7cbbb87ef028f166366aea55bc4ef49d2f8e9b..3af8df54cf94ad350102feadac8e3396fdb81426 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 08412614fc67045d3067b5b55ba032d297595237..77ceef3643b1459321f7d84cdc0b9218898b6231 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 7ee0f45f64296442204ee13d5f880f4b7716fb85..d3c0c86f529203e1f82b7d99647539f8fe76a8ed 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 2de1434528b87f0c32c54150b16791f3f2a469b5..86a5c85753c0ae68bd577eb90fe5a9f6b0b9d278 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index dc48f95a32bf76c54a906ee81ddee99b16aea84a..25de0940a65354be2f0ffa856b9912291de462b1 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index aec628789366673321aea88c60316a68fe16cbc5..4a1dec0677e5d81c1d1d13bc80e9d2fca1d92bdc 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 38f86fe458adcc7ebbbae22f5cc1e720928f2d48..90d21188b76060095de344bd75c8f8c632399f49 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 2e86ebe3c6c6a0da9ac242868592f30028ed2155..81ecb158d29e97f04f16e5d497629e892b23b2a9 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index d00f24aae4c7ffbf213dc248faeeae96cd401411..cbcd4f120a505c14145ef822c99acd1bd6946e83 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
index 17adbca83a0c97e76db8e15c0ff376608fd5d1bd..e81ed1da5a4f5e389d0f95da30e4d99a89a2de89 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
index 6cc6a4f2e92ed21fe2e71c0cd842c80d44b6db9f..81ce5cdaffbd0ec8d67dd5f4bd7ca05e3f3b0580 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
index e13d3d5c4da7b14df48fa9a2c7ad457c5ccbc89c..b8c9d3ca53b07654680de192a52257a9687cec9c 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
index d1049c96bf1febfc8933622e292b44cc8dd129cc..e0b132f6b3598979e5ad14fb316809b6a7dac788 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
index 78dfe8519aa0fb115613cb26f5ec086b5aba4efd..3a70c15b59305ec3ddf84a7e349289b5f86f0037 100644 (file)
@@ -4566,6 +4566,31 @@ have_whole_vector_shift (machine_mode mode)
   return true;
 }
 
+/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
+   multiplication operands have differing signs and (b) we intend
+   to emulate the operation using a series of signed DOT_PROD_EXPRs.
+   See vect_emulate_mixed_dot_prod for the actual sequence used.  */
+
+static bool
+vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
+                                stmt_vec_info stmt_info)
+{
+  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
+  if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
+    return false;
+
+  tree rhs1 = gimple_assign_rhs1 (assign);
+  tree rhs2 = gimple_assign_rhs2 (assign);
+  if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
+    return false;
+
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+  gcc_assert (reduc_info->is_reduc_info);
+  return !directly_supported_p (DOT_PROD_EXPR,
+                               STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
+                               optab_vector_mixed_sign);
+}
+
 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
    functions. Design better to avoid maintenance issues.  */
 
@@ -4601,6 +4626,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -4628,11 +4655,20 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
     }
   else
     {
-      /* Add in cost for initial definition.
-        For cond reduction we have four vectors: initial index, step,
-        initial result of the data reduction, initial value of the index
-        reduction.  */
-      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
+      /* Add in the cost of the initial definitions.  */
+      int prologue_stmts;
+      if (reduction_type == COND_REDUCTION)
+       /* For cond reductions we have four vectors: initial index, step,
+          initial result of the data reduction, initial value of the index
+          reduction.  */
+       prologue_stmts = 4;
+      else if (emulated_mixed_dot_prod)
+       /* We need the initial reduction value and two invariants:
+          one that contains the minimum signed value and one that
+          contains half of its negative.  */
+       prologue_stmts = 3;
+      else
+       prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
                                         vect_prologue);
@@ -6797,11 +6833,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
                            || op.code == WIDEN_SUM_EXPR
                            || op.code == SAD_EXPR);
-  enum optab_subtype optab_query_kind = optab_vector;
-  if (op.code == DOT_PROD_EXPR
-      && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
-         != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
-    optab_query_kind = optab_vector_mixed_sign;
 
   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
       && !SCALAR_FLOAT_TYPE_P (op.type))
@@ -7328,9 +7359,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       /* 4. Supportable by target?  */
       bool ok = true;
 
-      /* 4.1. check support for the operation in the loop  */
+      /* 4.1. check support for the operation in the loop
+
+        This isn't necessary for the lane reduction codes, since they
+        can only be produced by pattern matching, and it's up to the
+        pattern matcher to test for support.  The main reason for
+        specifically skipping this step is to avoid rechecking whether
+        mixed-sign dot-products can be implemented using signed
+        dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
+      if (!lane_reduc_code_p
+         && !directly_supported_p (op.code, vectype_in))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
@@ -7398,7 +7437,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      vect_transform_reduction.  Otherwise this is costed by the
      separate vectorizable_* routines.  */
   if (single_defuse_cycle || lane_reduc_code_p)
-    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
+    {
+      int factor = 1;
+      if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
+       /* Three dot-products and a subtraction.  */
+       factor = 4;
+      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
+                       stmt_info, 0, vect_body);
+    }
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -7457,6 +7503,81 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* STMT_INFO is a dot-product reduction whose multiplication operands
+   have different signs.  Emit a sequence to emulate the operation
+   using a series of signed DOT_PROD_EXPRs and return the last
+   statement generated.  VEC_DEST is the result of the vector operation
+   and VOP lists its inputs.  */
+
+static gassign *
+vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+                            gimple_stmt_iterator *gsi, tree vec_dest,
+                            tree vop[3])
+{
+  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
+  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
+  tree narrow_elttype = TREE_TYPE (narrow_vectype);
+  gimple *new_stmt;
+
+  /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
+  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
+    std::swap (vop[0], vop[1]);
+
+  /* Convert all inputs to signed types.  */
+  for (int i = 0; i < 3; ++i)
+    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
+      {
+       tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
+       new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
+       vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+       vop[i] = tmp;
+      }
+
+  /* In the comments below we assume 8-bit inputs for simplicity,
+     but the approach works for any full integer type.  */
+
+  /* Create a vector of -128.  */
+  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
+  tree min_narrow = build_vector_from_val (narrow_vectype,
+                                          min_narrow_elttype);
+
+  /* Create a vector of 64.  */
+  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
+  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
+  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
+
+  /* Emit: SUB_RES = VOP[0] - 128.  */
+  tree sub_res = make_ssa_name (narrow_vectype);
+  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Emit:
+
+       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
+       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
+       STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
+
+     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
+     Doing the two 64 * y steps first allows more time to compute x.  */
+  tree stage1 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
+                                 vop[1], half_narrow, vop[2]);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage2 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
+                                 vop[1], half_narrow, stage1);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage3 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
+                                 sub_res, vop[1], stage2);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Convert STAGE3 to the reduction type.  */
+  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
+}
+
 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
    value.  */
 
@@ -7563,12 +7684,17 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
                                        : &vec_oprnds2));
     }
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
       gimple *new_stmt;
       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
       if (masked_loop_p && !mask_by_cond_expr)
        {
+         /* No conditional ifns have been defined for dot-product yet.  */
+         gcc_assert (code != DOT_PROD_EXPR);
+
          /* Make sure that the reduction accumulator is vop[0].  */
          if (reduc_index == 1)
            {
@@ -7597,8 +7723,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
              build_vect_cond_expr (code, vop, mask, gsi);
            }
 
-         new_stmt = gimple_build_assign (vec_dest, code,
-                                         vop[0], vop[1], vop[2]);
+         if (emulated_mixed_dot_prod)
+           new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
+                                                   vec_dest, vop);
+         else
+           new_stmt = gimple_build_assign (vec_dest, code,
+                                           vop[0], vop[1], vop[2]);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, new_temp);
          vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
index 8f624863971392c891fde7278949c8818f646576..dfbfb71b3c69a0205ccc1b287cb50fa02a70942e 100644 (file)
@@ -760,12 +760,16 @@ vect_convert_input (vec_info *vinfo, stmt_vec_info stmt_info, tree type,
                    vect_unpromoted_value *unprom, tree vectype,
                    enum optab_subtype subtype = optab_default)
 {
-
   /* Update the type if the signs differ.  */
-  if (subtype == optab_vector_mixed_sign
-      && TYPE_SIGN (type) != TYPE_SIGN (TREE_TYPE (unprom->op)))
-    type = build_nonstandard_integer_type (TYPE_PRECISION (type),
-                                          TYPE_SIGN (unprom->type));
+  if (subtype == optab_vector_mixed_sign)
+    {
+      gcc_assert (!TYPE_UNSIGNED (type));
+      if (TYPE_UNSIGNED (TREE_TYPE (unprom->op)))
+       {
+         type = unsigned_type_for (type);
+         vectype = unsigned_type_for (vectype);
+       }
+    }
 
   /* Check for a no-op conversion.  */
   if (types_compatible_p (type, TREE_TYPE (unprom->op)))
@@ -1139,16 +1143,34 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
      is signed; otherwise, the result has the same sign as the operands.  */
   if (TYPE_PRECISION (unprom_mult.type) != TYPE_PRECISION (type)
       && (subtype == optab_vector_mixed_sign
-       ? TYPE_UNSIGNED (unprom_mult.type)
-       : TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
+         ? TYPE_UNSIGNED (unprom_mult.type)
+         : TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
     return NULL;
 
   vect_pattern_detected ("vect_recog_dot_prod_pattern", last_stmt);
 
+  /* If the inputs have mixed signs, canonicalize on using the signed
+     input type for analysis.  This also helps when emulating mixed-sign
+     operations using signed operations.  */
+  if (subtype == optab_vector_mixed_sign)
+    half_type = signed_type_for (half_type);
+
   tree half_vectype;
   if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type,
                                        type_out, &half_vectype, subtype))
-    return NULL;
+    {
+      /* We can emulate a mixed-sign dot-product using a sequence of
+        signed dot-products; see vect_emulate_mixed_dot_prod for details.  */
+      if (subtype != optab_vector_mixed_sign
+         || !vect_supportable_direct_optab_p (vinfo, signed_type_for (type),
+                                              DOT_PROD_EXPR, half_type,
+                                              type_out, &half_vectype,
+                                              optab_vector))
+       return NULL;
+
+      *type_out = signed_or_unsigned_type_for (TYPE_UNSIGNED (type),
+                                              *type_out);
+    }
 
   /* Get the inputs in the appropriate types.  */
   tree mult_oprnd[2];