Add emulated scatter capability to the vectorizer
author     Richard Biener <rguenther@suse.de>
           Wed, 18 Jan 2023 09:59:52 +0000 (10:59 +0100)
committer  Richard Biener <rguenther@suse.de>
           Fri, 28 Apr 2023 12:40:48 +0000 (14:40 +0200)
This adds a scatter vectorization capability to the vectorizer
without target support by decomposing the offset and data vectors
and then performing scalar stores in the order of vector lanes.
This is aimed at cases where vectorizing the rest of the loop
offsets the cost of vectorizing the scatter.

The offset load is still vectorized and costed as such, but as with
emulated gather, those loads will be turned back into scalar loads
by forwprop.
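
For illustration (the example below is not part of the commit; the
names are made up), a kernel of the following shape previously
required native target scatter support for the store to be
vectorized, and can now use the emulated scatter instead:

	/* Illustrative scatter-store kernel: the indirect store
	   a[idx[i]] becomes an emulated scatter on targets without
	   scatter instructions.  */
	void
	foo (double *restrict a, const int *restrict idx,
	     const double *restrict c, int n)
	{
	  for (int i = 0; i < n; ++i)
	    a[idx[i]] = c[i] + 1.0;
	}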

gcc/ChangeLog:

	* tree-vect-data-refs.cc (vect_analyze_data_refs): Always
	consider scatters.
	* tree-vect-stmts.cc (vect_model_store_cost): Pass in the
	gather-scatter info and cost emulated scatters accordingly.
	(get_load_store_type): Support emulated scatters.
	(vectorizable_store): Likewise.  Emulate them by extracting
	scalar offsets and data, doing scalar stores.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/pr25413a.c: Un-XFAIL everywhere.
	* gcc.dg/vect/vect-71.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-s4113.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-s491.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-vas.c: Likewise.

gcc/testsuite/gcc.dg/vect/pr25413a.c
gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
gcc/testsuite/gcc.dg/vect/vect-71.c
gcc/tree-vect-data-refs.cc
gcc/tree-vect-stmts.cc

diff --git a/gcc/testsuite/gcc.dg/vect/pr25413a.c b/gcc/testsuite/gcc.dg/vect/pr25413a.c
index e444b2c3e8ee24c4374453c95fb4e892323a3897..ffb517c9ce063591e2bf3c0f3d39e12ac2ad2ae8 100644
--- a/gcc/testsuite/gcc.dg/vect/pr25413a.c
+++ b/gcc/testsuite/gcc.dg/vect/pr25413a.c
@@ -123,7 +123,6 @@ int main (void)
   return 0;
 } 
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_scatter_store } } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_scatter_store } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "vector alignment may not be reachable" 1 "vect" { target { ! vector_alignment_reachable  } } } } */
 /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { ! vector_alignment_reachable } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
index b64682a65df9a206036e24c76d9fcb346b64e795..ddb7e9dc0e8ad7cfb8b1379b21eeac78e28093b4 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve }  } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
index 8465e137070c8f575964fdfd6e88a2a117f619ac..29e90ff0affc7cb65b9115f450ea45ae3b23129d 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve }  } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
index 5ff38851f434c467b54b6fc76299c237645026f5..b72ee21a9a3a2f4837ae2eba0e20b575e39565d2 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve }  } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-71.c b/gcc/testsuite/gcc.dg/vect/vect-71.c
index f15521176df55bef45fde880221bdfcd7aa17dbc..581473fa4a1dcf1a7ee570336693ada765d429f3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-71.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-71.c
@@ -36,4 +36,4 @@ int main (void)
   return main1 ();
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { ! vect_scatter_store } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index c03ffb3aaf1c3ae3bc9fb3af4287b7820d474634..6721ab6efc4f029be8e2315c31ba87d94230cda5 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -4464,9 +4464,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
              && !TREE_THIS_VOLATILE (DR_REF (dr));
          bool maybe_scatter
            = DR_IS_WRITE (dr)
-             && !TREE_THIS_VOLATILE (DR_REF (dr))
-             && (targetm.vectorize.builtin_scatter != NULL
-                 || supports_vec_scatter_store_p ());
+             && !TREE_THIS_VOLATILE (DR_REF (dr));
 
          /* If target supports vector gather loads or scatter stores,
             see if they can't be used.  */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index dc2dc2cfa7e94cfc6b9ef7679d9ca3d9a0b58f4c..c71e28737ee3d9f3b874c01572e971dd1dfd6b12 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -942,6 +942,7 @@ cfun_returns (tree decl)
 static void
 vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
                       vect_memory_access_type memory_access_type,
+                      gather_scatter_info *gs_info,
                       dr_alignment_support alignment_support_scheme,
                       int misalignment,
                       vec_load_store_type vls_type, slp_tree slp_node,
@@ -997,8 +998,16 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
   if (memory_access_type == VMAT_ELEMENTWISE
       || memory_access_type == VMAT_GATHER_SCATTER)
     {
-      /* N scalar stores plus extracting the elements.  */
       unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
+      if (memory_access_type == VMAT_GATHER_SCATTER
+         && gs_info->ifn == IFN_LAST && !gs_info->decl)
+       /* For emulated scatter N offset vector element extracts
+          (we assume the scalar scaling and ptr + offset add is consumed by
+          the load).  */
+       inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
+                                        vec_to_scalar, stmt_info, 0,
+                                        vect_body);
+      /* N scalar stores plus extracting the elements.  */
       inside_cost += record_stmt_cost (cost_vec,
                                       ncopies * assumed_nunits,
                                       scalar_store, stmt_info, 0, vect_body);
@@ -1008,7 +1017,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
                         misalignment, &inside_cost, cost_vec);
 
   if (memory_access_type == VMAT_ELEMENTWISE
-      || memory_access_type == VMAT_STRIDED_SLP)
+      || memory_access_type == VMAT_STRIDED_SLP
+      || (memory_access_type == VMAT_GATHER_SCATTER
+         && gs_info->ifn == IFN_LAST && !gs_info->decl))
     {
       /* N scalar stores plus extracting the elements.  */
       unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
@@ -2503,19 +2514,11 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
        }
       else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
        {
-         if (vls_type != VLS_LOAD)
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "unsupported emulated scatter.\n");
-             return false;
-           }
-         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
-                  || !TYPE_VECTOR_SUBPARTS
-                        (gs_info->offset_vectype).is_constant ()
-                  || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
-                                             (gs_info->offset_vectype),
-                                           TYPE_VECTOR_SUBPARTS (vectype)))
+         if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
+             || !TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype).is_constant ()
+             || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
+                                        (gs_info->offset_vectype),
+                                      TYPE_VECTOR_SUBPARTS (vectype)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7824,6 +7827,15 @@ vectorizable_store (vec_info *vinfo,
                             "unsupported access type for masked store.\n");
          return false;
        }
+      else if (memory_access_type == VMAT_GATHER_SCATTER
+              && gs_info.ifn == IFN_LAST
+              && !gs_info.decl)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported masked emulated scatter.\n");
+         return false;
+       }
     }
   else
     {
@@ -7887,7 +7899,8 @@ vectorizable_store (vec_info *vinfo,
 
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       vect_model_store_cost (vinfo, stmt_info, ncopies,
-                            memory_access_type, alignment_support_scheme,
+                            memory_access_type, &gs_info,
+                            alignment_support_scheme,
                             misalignment, vls_type, slp_node, cost_vec);
       return true;
     }
@@ -8527,12 +8540,9 @@ vectorizable_store (vec_info *vinfo,
              dataref_offset = build_int_cst (ref_type, 0);
            }
          else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-           {
-             vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
-                                          slp_node, &gs_info, &dataref_ptr,
-                                          &vec_offsets);
-             vec_offset = vec_offsets[0];
-           }
+           vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+                                        slp_node, &gs_info, &dataref_ptr,
+                                        &vec_offsets);
          else
            dataref_ptr
              = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -8558,9 +8568,7 @@ vectorizable_store (vec_info *vinfo,
          if (dataref_offset)
            dataref_offset
              = int_const_binop (PLUS_EXPR, dataref_offset, bump);
-         else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-           vec_offset = vec_offsets[j];
-         else
+         else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
            dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
                                           stmt_info, bump);
        }
@@ -8648,8 +8656,11 @@ vectorizable_store (vec_info *vinfo,
                final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
                                               final_mask, vec_mask, gsi);
 
-             if (memory_access_type == VMAT_GATHER_SCATTER)
+             if (memory_access_type == VMAT_GATHER_SCATTER
+                 && gs_info.ifn != IFN_LAST)
                {
+                 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+                   vec_offset = vec_offsets[vec_num * j + i];
                  tree scale = size_int (gs_info.scale);
                  gcall *call;
                  if (final_mask)
@@ -8665,6 +8676,60 @@ vectorizable_store (vec_info *vinfo,
                  new_stmt = call;
                  break;
                }
+             else if (memory_access_type == VMAT_GATHER_SCATTER)
+               {
+                 /* Emulated scatter.  */
+                 gcc_assert (!final_mask);
+                 unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+                 unsigned HOST_WIDE_INT const_offset_nunits
+                   = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+                       .to_constant ();
+                 vec<constructor_elt, va_gc> *ctor_elts;
+                 vec_alloc (ctor_elts, const_nunits);
+                 gimple_seq stmts = NULL;
+                 tree elt_type = TREE_TYPE (vectype);
+                 unsigned HOST_WIDE_INT elt_size
+                   = tree_to_uhwi (TYPE_SIZE (elt_type));
+                 /* We support offset vectors with more elements
+                    than the data vector for now.  */
+                 unsigned HOST_WIDE_INT factor
+                   = const_offset_nunits / const_nunits;
+                 vec_offset = vec_offsets[j / factor];
+                 unsigned elt_offset = (j % factor) * const_nunits;
+                 tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+                 tree scale = size_int (gs_info.scale);
+                 align = get_object_alignment (DR_REF (first_dr_info->dr));
+                 tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
+                 for (unsigned k = 0; k < const_nunits; ++k)
+                   {
+                     /* Compute the offsetted pointer.  */
+                     tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+                                             bitsize_int (k + elt_offset));
+                     tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+                                              idx_type, vec_offset,
+                                              TYPE_SIZE (idx_type), boff);
+                     idx = gimple_convert (&stmts, sizetype, idx);
+                     idx = gimple_build (&stmts, MULT_EXPR,
+                                         sizetype, idx, scale);
+                     tree ptr = gimple_build (&stmts, PLUS_EXPR,
+                                              TREE_TYPE (dataref_ptr),
+                                              dataref_ptr, idx);
+                     ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+                     /* Extract the element to be stored.  */
+                     tree elt = gimple_build (&stmts, BIT_FIELD_REF,
+                                              TREE_TYPE (vectype), vec_oprnd,
+                                              TYPE_SIZE (elt_type),
+                                              bitsize_int (k * elt_size));
+                     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+                     stmts = NULL;
+                     tree ref = build2 (MEM_REF, ltype, ptr,
+                                        build_int_cst (ref_type, 0));
+                     new_stmt = gimple_build_assign (ref, elt);
+                     vect_finish_stmt_generation (vinfo, stmt_info,
+                                                  new_stmt, gsi);
+                   }
+                 break;
+               }
 
              if (i > 0)
                /* Bump the vector pointer.  */
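
For reference, the emulated-scatter expansion added to
vectorizable_store above behaves like the following self-contained C
sketch (one scalar store per lane, in lane order; the function and
parameter names are illustrative, not from the patch):

	#include <stddef.h>

	/* Rough model of the emulated scatter: for each lane k,
	   extract the k-th offset, scale it, add it to the base
	   pointer, and store the k-th data element with a scalar
	   store.  */
	static void
	emulated_scatter_store (char *base, const long *offset_vec,
				const double *data_vec,
				unsigned nunits, size_t scale)
	{
	  for (unsigned k = 0; k < nunits; ++k)
	    {
	      size_t byte_off = (size_t) offset_vec[k] * scale;
	      *(double *) (base + byte_off) = data_vec[k];
	    }
	}

The actual expansion additionally copes with offset vectors that have
more lanes than the data vector, stepping through them in groups (the
factor computation in the hunk above), and uses BIT_FIELD_REFs rather
than array indexing to extract the offset and data elements.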