vect: Cost adjacent vector loads/stores together [PR111784]

author Kewen Lin <linkw@linux.ibm.com>

Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)

committer Kewen Lin <linkw@linux.ibm.com>

Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)
author Kewen Lin <linkw@linux.ibm.com>
Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)
committer Kewen Lin <linkw@linux.ibm.com>
Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc

index 337b24c51f953667ad4146002052ce40f34b7529..99ba75e98c0d185edd78c7b8b9947618d18576cc 100644 (file)
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8626,6 +8626,9 @@ vectorizable_store (vec_info *vinfo,
        alias_off = build_int_cst (ref_type, 0);
        stmt_vec_info next_stmt_info = first_stmt_info;
        auto_vec<tree> vec_oprnds (ncopies);
+      /* For costing some adjacent vector stores, we'd like to cost with
+        the total number of them once instead of costing each one by one.  */
+      unsigned int n_adjacent_stores = 0;
        for (g = 0; g < group_size; g++)
         {
           running_off = offvar;
@@ -8683,10 +8686,7 @@ vectorizable_store (vec_info *vinfo,
                          store to avoid ICE like 110776.  */
                       if (VECTOR_TYPE_P (ltype)
                           && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-                       vect_get_store_cost (vinfo, stmt_info, 1,
-                                            alignment_support_scheme,
-                                            misalignment, &inside_cost,
-                                            cost_vec);
+                       n_adjacent_stores++;
                       else
                         inside_cost
                           += record_stmt_cost (cost_vec, 1, scalar_store,
@@ -8743,11 +8743,18 @@ vectorizable_store (vec_info *vinfo,
             break;
         }
  
-      if (costing_p && dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "vect_model_store_cost: inside_cost = %d, "
-                        "prologue_cost = %d .\n",
-                        inside_cost, prologue_cost);
+      if (costing_p)
+       {
+         if (n_adjacent_stores > 0)
+           vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+                                alignment_support_scheme, misalignment,
+                                &inside_cost, cost_vec);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "vect_model_store_cost: inside_cost = %d, "
+                            "prologue_cost = %d .\n",
+                            inside_cost, prologue_cost);
+       }
  
        return true;
      }
@@ -8854,6 +8861,9 @@ vectorizable_store (vec_info *vinfo,
      {
        gcc_assert (!slp && grouped_store);
        unsigned inside_cost = 0, prologue_cost = 0;
+      /* For costing some adjacent vector stores, we'd like to cost with
+        the total number of them once instead of costing each one by one.  */
+      unsigned int n_adjacent_stores = 0;
        for (j = 0; j < ncopies; j++)
         {
           gimple *new_stmt;
@@ -8919,10 +8929,7 @@ vectorizable_store (vec_info *vinfo,
  
           if (costing_p)
             {
-             for (i = 0; i < vec_num; i++)
-               vect_get_store_cost (vinfo, stmt_info, 1,
-                                    alignment_support_scheme, misalignment,
-                                    &inside_cost, cost_vec);
+             n_adjacent_stores += vec_num;
               continue;
             }
  
@@ -9012,11 +9019,18 @@ vectorizable_store (vec_info *vinfo,
           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
         }
  
-      if (costing_p && dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "vect_model_store_cost: inside_cost = %d, "
-                        "prologue_cost = %d .\n",
-                        inside_cost, prologue_cost);
+      if (costing_p)
+       {
+         if (n_adjacent_stores > 0)
+           vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+                                alignment_support_scheme, misalignment,
+                                &inside_cost, cost_vec);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "vect_model_store_cost: inside_cost = %d, "
+                            "prologue_cost = %d .\n",
+                            inside_cost, prologue_cost);
+       }
  
        return true;
      }
@@ -9235,6 +9249,9 @@ vectorizable_store (vec_info *vinfo,
               || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
  
    unsigned inside_cost = 0, prologue_cost = 0;
+  /* For costing some adjacent vector stores, we'd like to cost with
+     the total number of them once instead of costing each one by one.  */
+  unsigned int n_adjacent_stores = 0;
    auto_vec<tree> result_chain (group_size);
    auto_vec<tree, 1> vec_oprnds;
    for (j = 0; j < ncopies; j++)
@@ -9396,9 +9413,7 @@ vectorizable_store (vec_info *vinfo,
  
           if (costing_p)
             {
-             vect_get_store_cost (vinfo, stmt_info, 1,
-                                  alignment_support_scheme, misalignment,
-                                  &inside_cost, cost_vec);
+             n_adjacent_stores++;
  
               if (!slp)
                 {
@@ -9568,6 +9583,11 @@ vectorizable_store (vec_info *vinfo,
  
    if (costing_p)
      {
+      if (n_adjacent_stores > 0)
+       vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+                            alignment_support_scheme, misalignment,
+                            &inside_cost, cost_vec);
+
        /* When vectorizing a store into the function result assign
          a penalty if the function returns in a multi-register location.
          In this case we assume we'll end up with having to spill the
@@ -10275,6 +10295,9 @@ vectorizable_load (vec_info *vinfo,
        unsigned HOST_WIDE_INT
         elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
        unsigned int n_groups = 0;
+      /* For costing some adjacent vector loads, we'd like to cost with
+        the total number of them once instead of costing each one by one.  */
+      unsigned int n_adjacent_loads = 0;
        for (j = 0; j < ncopies; j++)
         {
           if (nloads > 1 && !costing_p)
@@ -10288,10 +10311,7 @@ vectorizable_load (vec_info *vinfo,
                      avoid ICE, see PR110776.  */
                   if (VECTOR_TYPE_P (ltype)
                       && memory_access_type != VMAT_ELEMENTWISE)
-                   vect_get_load_cost (vinfo, stmt_info, 1,
-                                       alignment_support_scheme, misalignment,
-                                       false, &inside_cost, nullptr, cost_vec,
-                                       cost_vec, true);
+                   n_adjacent_loads++;
                   else
                     inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
                                                      stmt_info, 0, vect_body);
@@ -10385,11 +10405,19 @@ vectorizable_load (vec_info *vinfo,
                                           false, &n_perms);
         }
  
-      if (costing_p && dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "vect_model_load_cost: inside_cost = %u, "
-                        "prologue_cost = 0 .\n",
-                        inside_cost);
+      if (costing_p)
+       {
+         if (n_adjacent_loads > 0)
+           vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+                               alignment_support_scheme, misalignment, false,
+                               &inside_cost, nullptr, cost_vec, cost_vec,
+                               true);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "vect_model_load_cost: inside_cost = %u, "
+                            "prologue_cost = 0 .\n",
+                            inside_cost);
+       }
  
        return true;
      }
@@ -10694,6 +10722,9 @@ vectorizable_load (vec_info *vinfo,
        gcc_assert (grouped_load && !slp);
  
        unsigned int inside_cost = 0, prologue_cost = 0;
+      /* For costing some adjacent vector loads, we'd like to cost with
+        the total number of them once instead of costing each one by one.  */
+      unsigned int n_adjacent_loads = 0;
        for (j = 0; j < ncopies; j++)
         {
           if (costing_p)
@@ -10725,9 +10756,7 @@ vectorizable_load (vec_info *vinfo,
                                           true);
                     }
                 }
-             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
-                                 misalignment, false, &inside_cost,
-                                 &prologue_cost, cost_vec, cost_vec, true);
+             n_adjacent_loads++;
               continue;
             }
  
@@ -10829,11 +10858,19 @@ vectorizable_load (vec_info *vinfo,
           *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
         }
  
-      if (costing_p && dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "vect_model_load_cost: inside_cost = %u, "
-                        "prologue_cost = %u .\n",
-                        inside_cost, prologue_cost);
+      if (costing_p)
+       {
+         if (n_adjacent_loads > 0)
+           vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+                               alignment_support_scheme, misalignment, false,
+                               &inside_cost, &prologue_cost, cost_vec,
+                               cost_vec, true);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "vect_model_load_cost: inside_cost = %u, "
+                            "prologue_cost = %u .\n",
+                            inside_cost, prologue_cost);
+       }
  
        return true;
      }
@@ -11177,6 +11214,9 @@ vectorizable_load (vec_info *vinfo,
  
    poly_uint64 group_elt = 0;
    unsigned int inside_cost = 0, prologue_cost = 0;
+  /* For costing some adjacent vector loads, we'd like to cost with
+     the total number of them once instead of costing each one by one.  */
+  unsigned int n_adjacent_loads = 0;
    for (j = 0; j < ncopies; j++)
      {
        /* 1. Create the vector or array pointer update chain.  */
@@ -11571,10 +11611,18 @@ vectorizable_load (vec_info *vinfo,
                   || memory_access_type == VMAT_CONTIGUOUS_REVERSE
                   || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
                       && (!grouped_load || first_stmt_info_p)))
-               vect_get_load_cost (vinfo, stmt_info, 1,
-                                   alignment_support_scheme, misalignment,
-                                   add_realign_cost, &inside_cost,
-                                   &prologue_cost, cost_vec, cost_vec, true);
+               {
+                 /* Leave realign cases alone to keep them simple.  */
+                 if (alignment_support_scheme == dr_explicit_realign_optimized
+                     || alignment_support_scheme == dr_explicit_realign)
+                   vect_get_load_cost (vinfo, stmt_info, 1,
+                                       alignment_support_scheme, misalignment,
+                                       add_realign_cost, &inside_cost,
+                                       &prologue_cost, cost_vec, cost_vec,
+                                       true);
+                 else
+                   n_adjacent_loads++;
+               }
             }
           else
             {
@@ -11745,6 +11793,11 @@ vectorizable_load (vec_info *vinfo,
        gcc_assert (memory_access_type == VMAT_CONTIGUOUS
                   || memory_access_type == VMAT_CONTIGUOUS_REVERSE
                   || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
+      if (n_adjacent_loads > 0)
+       vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+                           alignment_support_scheme, misalignment, false,
+                           &inside_cost, &prologue_cost, cost_vec, cost_vec,
+                           true);
        if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                          "vect_model_load_cost: inside_cost = %u, "
author	Kewen Lin <linkw@linux.ibm.com>
	Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)
committer	Kewen Lin <linkw@linux.ibm.com>
	Mon, 23 Oct 2023 02:18:40 +0000 (21:18 -0500)