vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
&vec_masks, mask_vectype);
}
+
tree vec_mask = NULL_TREE;
+ if (memory_access_type == VMAT_LOAD_STORE_LANES)
+ {
+ gcc_assert (alignment_support_scheme == dr_aligned
+ || alignment_support_scheme == dr_unaligned_supported);
+ gcc_assert (grouped_load && !slp);
+
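+ /* For VMAT_LOAD_STORE_LANES each copy is handled by a single
+ (masked) LOAD_LANES call whose array result is split into
+ VEC_NUM vectors. */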
+ unsigned int inside_cost = 0, prologue_cost = 0;
+ for (j = 0; j < ncopies; j++)
+ {
+ if (costing_p)
+ {
+ /* An IFN_LOAD_LANES will load all its vector results,
+ regardless of which ones we actually need. Account
+ for the cost of unused results. */
+ if (first_stmt_info == stmt_info)
+ {
+ unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info next_stmt_info = first_stmt_info;
+ do
+ {
+ gaps -= 1;
+ next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+ }
+ while (next_stmt_info);
+ if (gaps)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_load_cost: %d "
+ "unused vectors.\n",
+ gaps);
+ vect_get_load_cost (vinfo, stmt_info, gaps,
+ alignment_support_scheme,
+ misalignment, false, &inside_cost,
+ &prologue_cost, cost_vec, cost_vec,
+ true);
+ }
+ }
+ vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
+ misalignment, false, &inside_cost,
+ &prologue_cost, cost_vec, cost_vec, true);
+ continue;
+ }
+
+ /* 1. Create the vector or array pointer update chain. */
+ if (j == 0)
+ dataref_ptr
+ = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+ at_loop, offset, &dummy, gsi,
+ &ptr_incr, false, bump);
+ else
+ {
+ gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+ stmt_info, bump);
+ }
+ if (mask)
+ vec_mask = vec_masks[j];
+
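+ /* VEC_ARRAY is an array variable holding the VEC_NUM vectors
+ produced by the LOAD_LANES call. */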
+ tree vec_array = create_vector_array (vectype, vec_num);
+
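+ /* Combine the loop control mask (if any) with the mask of a
+ conditional load. */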
+ tree final_mask = NULL_TREE;
+ if (loop_masks)
+ final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+ ncopies, vectype, j);
+ if (vec_mask)
+ final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
+ vec_mask, gsi);
+
+ gcall *call;
+ if (final_mask)
+ {
+ /* Emit:
+ VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+ VEC_MASK). */
+ unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+ tree alias_ptr = build_int_cst (ref_type, align);
+ call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
+ dataref_ptr, alias_ptr,
+ final_mask);
+ }
+ else
+ {
+ /* Emit:
+ VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
+ data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
+ call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
+ }
+ gimple_call_set_lhs (call, vec_array);
+ gimple_call_set_nothrow (call, true);
+ vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+
+ dr_chain.create (vec_num);
+ /* Extract each vector into an SSA_NAME. */
+ for (i = 0; i < vec_num; i++)
+ {
+ new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
+ vec_array, i);
+ dr_chain.quick_push (new_temp);
+ }
+
+ /* Record the mapping between SSA_NAMEs and statements. */
+ vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+
+ /* Record that VEC_ARRAY is now dead. */
+ vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
+
+ dr_chain.release ();
+
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ }
+
+ if (costing_p && dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_load_cost: inside_cost = %u, "
+ "prologue_cost = %u .\n",
+ inside_cost, prologue_cost);
+
+ return true;
+ }
+
poly_uint64 group_elt = 0;
unsigned int inside_cost = 0, prologue_cost = 0;
for (j = 0; j < ncopies; j++)
dr_chain.create (vec_num);
gimple *new_stmt = NULL;
- if (memory_access_type == VMAT_LOAD_STORE_LANES)
+ for (i = 0; i < vec_num; i++)
{
- if (costing_p)
- {
- /* An IFN_LOAD_LANES will load all its vector results,
- regardless of which ones we actually need. Account
- for the cost of unused results. */
- if (grouped_load && first_stmt_info == stmt_info)
- {
- unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
- stmt_vec_info next_stmt_info = first_stmt_info;
- do
- {
- gaps -= 1;
- next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
- }
- while (next_stmt_info);
- if (gaps)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "vect_model_load_cost: %d "
- "unused vectors.\n",
- gaps);
- vect_get_load_cost (vinfo, stmt_info, gaps,
- alignment_support_scheme,
- misalignment, false, &inside_cost,
- &prologue_cost, cost_vec, cost_vec,
- true);
- }
- }
- vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
- misalignment, false, &inside_cost,
- &prologue_cost, cost_vec, cost_vec, true);
- continue;
- }
- tree vec_array;
-
- vec_array = create_vector_array (vectype, vec_num);
-
tree final_mask = NULL_TREE;
- if (loop_masks)
- final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
- ncopies, vectype, j);
- if (vec_mask)
- final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
- final_mask, vec_mask, gsi);
-
- gcall *call;
- if (final_mask)
- {
- /* Emit:
- VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
- VEC_MASK). */
- unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
- tree alias_ptr = build_int_cst (ref_type, align);
- call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
- dataref_ptr, alias_ptr,
- final_mask);
- }
- else
+ tree final_len = NULL_TREE;
+ tree bias = NULL_TREE;
+ if (!costing_p)
{
- /* Emit:
- VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
- data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
- call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
- }
- gimple_call_set_lhs (call, vec_array);
- gimple_call_set_nothrow (call, true);
- vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
- new_stmt = call;
+ if (loop_masks)
+ final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
+ if (vec_mask)
+ final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+ final_mask, vec_mask, gsi);
- /* Extract each vector into an SSA_NAME. */
- for (i = 0; i < vec_num; i++)
- {
- new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
- vec_array, i);
- dr_chain.quick_push (new_temp);
+ if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+ gsi, stmt_info, bump);
}
- /* Record the mapping between SSA_NAMEs and statements. */
- vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
-
- /* Record that VEC_ARRAY is now dead. */
- vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- }
- else
- {
- for (i = 0; i < vec_num; i++)
+ /* 2. Create the vector-load in the loop. */
+ switch (alignment_support_scheme)
{
- tree final_mask = NULL_TREE;
- tree final_len = NULL_TREE;
- tree bias = NULL_TREE;
- if (!costing_p)
- {
- if (loop_masks)
- final_mask
- = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
- vec_num * ncopies, vectype,
- vec_num * j + i);
- if (vec_mask)
- final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
- final_mask, vec_mask, gsi);
-
- if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
- dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
- gsi, stmt_info, bump);
- }
+ case dr_aligned:
+ case dr_unaligned_supported:
+ {
+ unsigned int misalign;
+ unsigned HOST_WIDE_INT align;
- /* 2. Create the vector-load in the loop. */
- switch (alignment_support_scheme)
- {
- case dr_aligned:
- case dr_unaligned_supported:
+ if (memory_access_type == VMAT_GATHER_SCATTER
+ && gs_info.ifn != IFN_LAST)
{
- unsigned int misalign;
- unsigned HOST_WIDE_INT align;
-
- if (memory_access_type == VMAT_GATHER_SCATTER
- && gs_info.ifn != IFN_LAST)
+ if (costing_p)
{
- if (costing_p)
- {
- unsigned int cnunits
- = vect_nunits_for_cost (vectype);
- inside_cost
- = record_stmt_cost (cost_vec, cnunits,
- scalar_load, stmt_info, 0,
- vect_body);
- break;
- }
- if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
- vec_offset = vec_offsets[vec_num * j + i];
- tree zero = build_zero_cst (vectype);
- tree scale = size_int (gs_info.scale);
-
- if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
- {
- if (loop_lens)
- final_len
- = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
- vec_num * ncopies, vectype,
- vec_num * j + i, 1);
- else
- final_len = build_int_cst (sizetype,
- TYPE_VECTOR_SUBPARTS (
- vectype));
- signed char biasval
- = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
- bias = build_int_cst (intQI_type_node, biasval);
- if (!final_mask)
- {
- mask_vectype = truth_type_for (vectype);
- final_mask = build_minus_one_cst (mask_vectype);
- }
- }
-
- gcall *call;
- if (final_len && final_mask)
- call = gimple_build_call_internal (
- IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
- vec_offset, scale, zero, final_mask, final_len,
- bias);
- else if (final_mask)
- call = gimple_build_call_internal
- (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
- vec_offset, scale, zero, final_mask);
- else
- call = gimple_build_call_internal
- (IFN_GATHER_LOAD, 4, dataref_ptr,
- vec_offset, scale, zero);
- gimple_call_set_nothrow (call, true);
- new_stmt = call;
- data_ref = NULL_TREE;
+ unsigned int cnunits = vect_nunits_for_cost (vectype);
+ inside_cost
+ = record_stmt_cost (cost_vec, cnunits, scalar_load,
+ stmt_info, 0, vect_body);
break;
}
- else if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ vec_offset = vec_offsets[vec_num * j + i];
+ tree zero = build_zero_cst (vectype);
+ tree scale = size_int (gs_info.scale);
+
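+ /* MASK_LEN_GATHER_LOAD also takes length and bias operands;
+ default to the full vector length if there are no loop
+ lengths and to an all-ones mask if no mask was computed. */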
+ if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
{
- /* Emulated gather-scatter. */
- gcc_assert (!final_mask);
- unsigned HOST_WIDE_INT const_nunits
- = nunits.to_constant ();
- if (costing_p)
- {
- /* For emulated gathers N offset vector element
- offset add is consumed by the load). */
- inside_cost
- = record_stmt_cost (cost_vec, const_nunits,
- vec_to_scalar, stmt_info, 0,
- vect_body);
- /* N scalar loads plus gathering them into a
- vector. */
- inside_cost
- = record_stmt_cost (cost_vec, const_nunits,
- scalar_load, stmt_info, 0,
- vect_body);
- inside_cost
- = record_stmt_cost (cost_vec, 1, vec_construct,
- stmt_info, 0, vect_body);
- break;
- }
- unsigned HOST_WIDE_INT const_offset_nunits
- = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
- .to_constant ();
- vec<constructor_elt, va_gc> *ctor_elts;
- vec_alloc (ctor_elts, const_nunits);
- gimple_seq stmts = NULL;
- /* We support offset vectors with more elements
- than the data vector for now. */
- unsigned HOST_WIDE_INT factor
- = const_offset_nunits / const_nunits;
- vec_offset = vec_offsets[j / factor];
- unsigned elt_offset = (j % factor) * const_nunits;
- tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
- tree scale = size_int (gs_info.scale);
- align
- = get_object_alignment (DR_REF (first_dr_info->dr));
- tree ltype = build_aligned_type (TREE_TYPE (vectype),
- align);
- for (unsigned k = 0; k < const_nunits; ++k)
+ if (loop_lens)
+ final_len
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i, 1);
+ else
+ final_len
+ = build_int_cst (sizetype,
+ TYPE_VECTOR_SUBPARTS (vectype));
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ if (!final_mask)
{
- tree boff = size_binop (MULT_EXPR,
- TYPE_SIZE (idx_type),
- bitsize_int
- (k + elt_offset));
- tree idx = gimple_build (&stmts, BIT_FIELD_REF,
- idx_type, vec_offset,
- TYPE_SIZE (idx_type),
- boff);
- idx = gimple_convert (&stmts, sizetype, idx);
- idx = gimple_build (&stmts, MULT_EXPR,
- sizetype, idx, scale);
- tree ptr = gimple_build (&stmts, PLUS_EXPR,
- TREE_TYPE (dataref_ptr),
- dataref_ptr, idx);
- ptr = gimple_convert (&stmts, ptr_type_node, ptr);
- tree elt = make_ssa_name (TREE_TYPE (vectype));
- tree ref = build2 (MEM_REF, ltype, ptr,
- build_int_cst (ref_type, 0));
- new_stmt = gimple_build_assign (elt, ref);
- gimple_set_vuse (new_stmt,
- gimple_vuse (gsi_stmt (*gsi)));
- gimple_seq_add_stmt (&stmts, new_stmt);
- CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+ mask_vectype = truth_type_for (vectype);
+ final_mask = build_minus_one_cst (mask_vectype);
}
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
- new_stmt = gimple_build_assign (NULL_TREE,
- build_constructor
- (vectype, ctor_elts));
- data_ref = NULL_TREE;
- break;
}
- if (costing_p)
- break;
-
- align =
- known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
- if (alignment_support_scheme == dr_aligned)
- misalign = 0;
- else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
- {
- align = dr_alignment
- (vect_dr_behavior (vinfo, first_dr_info));
- misalign = 0;
- }
+ gcall *call;
+ if (final_len && final_mask)
+ call = gimple_build_call_internal (
+ IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+ scale, zero, final_mask, final_len, bias);
+ else if (final_mask)
+ call
+ = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
+ dataref_ptr, vec_offset,
+ scale, zero, final_mask);
else
- misalign = misalignment;
- if (dataref_offset == NULL_TREE
- && TREE_CODE (dataref_ptr) == SSA_NAME)
- set_ptr_info_alignment (get_ptr_info (dataref_ptr),
- align, misalign);
- align = least_bit_hwi (misalign | align);
-
- /* Compute IFN when LOOP_LENS or final_mask valid. */
- machine_mode vmode = TYPE_MODE (vectype);
- machine_mode new_vmode = vmode;
- internal_fn partial_ifn = IFN_LAST;
- if (loop_lens)
+ call
+ = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
+ dataref_ptr, vec_offset,
+ scale, zero);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+ break;
+ }
+ else if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ /* Emulated gather-scatter. */
+ gcc_assert (!final_mask);
+ unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+ if (costing_p)
{
- opt_machine_mode new_ovmode
- = get_len_load_store_mode (vmode, true,
- &partial_ifn);
- new_vmode = new_ovmode.require ();
- unsigned factor = (new_ovmode == vmode)
- ? 1
- : GET_MODE_UNIT_SIZE (vmode);
- final_len
- = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
- vec_num * ncopies, vectype,
- vec_num * j + i, factor);
+ /* For emulated gathers we need N offset vector element
+ extracts (the offset add is consumed by the load). */
+ inside_cost
+ = record_stmt_cost (cost_vec, const_nunits,
+ vec_to_scalar, stmt_info, 0,
+ vect_body);
+ /* N scalar loads plus gathering them into a
+ vector. */
+ inside_cost = record_stmt_cost (cost_vec, const_nunits,
+ scalar_load, stmt_info,
+ 0, vect_body);
+ inside_cost
+ = record_stmt_cost (cost_vec, 1, vec_construct,
+ stmt_info, 0, vect_body);
+ break;
}
- else if (final_mask)
+ unsigned HOST_WIDE_INT const_offset_nunits
+ = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+ .to_constant ();
+ vec<constructor_elt, va_gc> *ctor_elts;
+ vec_alloc (ctor_elts, const_nunits);
+ gimple_seq stmts = NULL;
+ /* We support offset vectors with more elements
+ than the data vector for now. */
+ unsigned HOST_WIDE_INT factor
+ = const_offset_nunits / const_nunits;
+ vec_offset = vec_offsets[j / factor];
+ unsigned elt_offset = (j % factor) * const_nunits;
+ tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+ tree scale = size_int (gs_info.scale);
+ align = get_object_alignment (DR_REF (first_dr_info->dr));
+ tree ltype
+ = build_aligned_type (TREE_TYPE (vectype), align);
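+ /* Emit CONST_NUNITS scalar loads from DATAREF_PTR plus the
+ scaled offset vector elements and collect the results into
+ a CONSTRUCTOR for the vector value. */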
+ for (unsigned k = 0; k < const_nunits; ++k)
{
- if (!can_vec_mask_load_store_p (
- vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
- &partial_ifn))
- gcc_unreachable ();
+ tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+ bitsize_int (k + elt_offset));
+ tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+ idx_type, vec_offset,
+ TYPE_SIZE (idx_type), boff);
+ idx = gimple_convert (&stmts, sizetype, idx);
+ idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
+ scale);
+ tree ptr = gimple_build (&stmts, PLUS_EXPR,
+ TREE_TYPE (dataref_ptr),
+ dataref_ptr, idx);
+ ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+ tree elt = make_ssa_name (TREE_TYPE (vectype));
+ tree ref = build2 (MEM_REF, ltype, ptr,
+ build_int_cst (ref_type, 0));
+ new_stmt = gimple_build_assign (elt, ref);
+ gimple_set_vuse (new_stmt,
+ gimple_vuse (gsi_stmt (*gsi)));
+ gimple_seq_add_stmt (&stmts, new_stmt);
+ CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
}
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ new_stmt = gimple_build_assign (
+ NULL_TREE, build_constructor (vectype, ctor_elts));
+ data_ref = NULL_TREE;
+ break;
+ }
- if (partial_ifn == IFN_MASK_LEN_LOAD)
+ if (costing_p)
+ break;
+
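+ /* Compute the alignment we can assume for the access and
+ record it on the data reference pointer. */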
+ align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
+ if (alignment_support_scheme == dr_aligned)
+ misalign = 0;
+ else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
+ {
+ align
+ = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
+ misalign = 0;
+ }
+ else
+ misalign = misalignment;
+ if (dataref_offset == NULL_TREE
+ && TREE_CODE (dataref_ptr) == SSA_NAME)
+ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+ misalign);
+ align = least_bit_hwi (misalign | align);
+
+ /* Compute IFN when LOOP_LENS or final_mask valid. */
+ machine_mode vmode = TYPE_MODE (vectype);
+ machine_mode new_vmode = vmode;
+ internal_fn partial_ifn = IFN_LAST;
+ if (loop_lens)
+ {
+ opt_machine_mode new_ovmode
+ = get_len_load_store_mode (vmode, true, &partial_ifn);
+ new_vmode = new_ovmode.require ();
+ unsigned factor
+ = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
+ final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i, factor);
+ }
+ else if (final_mask)
+ {
+ if (!can_vec_mask_load_store_p (
+ vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
+ &partial_ifn))
+ gcc_unreachable ();
+ }
+
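+ /* MASK_LEN_LOAD requires both a length and a mask; supply
+ the full vector length and an all-ones mask for whichever
+ operand is missing. */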
+ if (partial_ifn == IFN_MASK_LEN_LOAD)
+ {
+ if (!final_len)
{
- if (!final_len)
- {
- /* Pass VF value to 'len' argument of
- MASK_LEN_LOAD if LOOP_LENS is invalid. */
- final_len
- = size_int (TYPE_VECTOR_SUBPARTS (vectype));
- }
- if (!final_mask)
- {
- /* Pass all ones value to 'mask' argument of
- MASK_LEN_LOAD if final_mask is invalid. */
- mask_vectype = truth_type_for (vectype);
- final_mask = build_minus_one_cst (mask_vectype);
- }
+ /* Pass VF value to 'len' argument of
+ MASK_LEN_LOAD if LOOP_LENS is invalid. */
+ final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
}
- if (final_len)
+ if (!final_mask)
{
- signed char biasval
- = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-
- bias = build_int_cst (intQI_type_node, biasval);
+ /* Pass all ones value to 'mask' argument of
+ MASK_LEN_LOAD if final_mask is invalid. */
+ mask_vectype = truth_type_for (vectype);
+ final_mask = build_minus_one_cst (mask_vectype);
}
+ }
+ if (final_len)
+ {
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
- if (final_len)
+ bias = build_int_cst (intQI_type_node, biasval);
+ }
+
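+ /* Emit a length-controlled load, either MASK_LEN_LOAD or
+ LEN_LOAD. */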
+ if (final_len)
+ {
+ tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+ gcall *call;
+ if (partial_ifn == IFN_MASK_LEN_LOAD)
+ call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
+ dataref_ptr, ptr,
+ final_mask, final_len,
+ bias);
+ else
+ call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
+ dataref_ptr, ptr,
+ final_len, bias);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+
+ /* Need conversion if it's wrapped with VnQI. */
+ if (vmode != new_vmode)
{
- tree ptr
- = build_int_cst (ref_type, align * BITS_PER_UNIT);
- gcall *call;
- if (partial_ifn == IFN_MASK_LEN_LOAD)
- call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
- 5, dataref_ptr,
- ptr, final_mask,
- final_len, bias);
- else
- call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
- dataref_ptr, ptr,
- final_len, bias);
- gimple_call_set_nothrow (call, true);
- new_stmt = call;
- data_ref = NULL_TREE;
-
- /* Need conversion if it's wrapped with VnQI. */
- if (vmode != new_vmode)
- {
- tree new_vtype = build_vector_type_for_mode (
- unsigned_intQI_type_node, new_vmode);
- tree var = vect_get_new_ssa_name (new_vtype,
- vect_simple_var);
- gimple_set_lhs (call, var);
- vect_finish_stmt_generation (vinfo, stmt_info, call,
- gsi);
- tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
- new_stmt
- = gimple_build_assign (vec_dest,
- VIEW_CONVERT_EXPR, op);
- }
+ tree new_vtype = build_vector_type_for_mode (
+ unsigned_intQI_type_node, new_vmode);
+ tree var
+ = vect_get_new_ssa_name (new_vtype, vect_simple_var);
+ gimple_set_lhs (call, var);
+ vect_finish_stmt_generation (vinfo, stmt_info, call,
+ gsi);
+ tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
+ new_stmt = gimple_build_assign (vec_dest,
+ VIEW_CONVERT_EXPR, op);
}
- else if (final_mask)
+ }
+ else if (final_mask)
+ {
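+ /* Emit: MASK_LOAD (DATAREF_PTR, ALIAS_PTR, FINAL_MASK); the
+ LHS is set in the common code below. */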
+ tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+ gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+ dataref_ptr, ptr,
+ final_mask);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+ }
+ else
+ {
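+ /* Plain contiguous (or reversed) load through a MEM_REF;
+ with an SLP gap we may load just the lower half of the
+ vector, see below. */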
+ tree ltype = vectype;
+ tree new_vtype = NULL_TREE;
+ unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
+ unsigned int vect_align
+ = vect_known_alignment_in_bytes (first_dr_info, vectype);
+ unsigned int scalar_dr_size
+ = vect_get_scalar_dr_size (first_dr_info);
+ /* If there's no peeling for gaps but we have a gap
+ with slp loads then load the lower half of the
+ vector only. See get_group_load_store_type for
+ when we apply this optimization. */
+ if (slp
+ && loop_vinfo
+ && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
+ && known_eq (nunits, (group_size - gap) * 2)
+ && known_eq (nunits, group_size)
+ && gap >= (vect_align / scalar_dr_size))
+ {
+ tree half_vtype;
+ new_vtype
+ = vector_vector_composition_type (vectype, 2,
+ &half_vtype);
+ if (new_vtype != NULL_TREE)
+ ltype = half_vtype;
+ }
+ tree offset
+ = (dataref_offset ? dataref_offset
+ : build_int_cst (ref_type, 0));
+ if (ltype != vectype
+ && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
{
- tree ptr = build_int_cst (ref_type,
- align * BITS_PER_UNIT);
- gcall *call
- = gimple_build_call_internal (IFN_MASK_LOAD, 3,
- dataref_ptr, ptr,
- final_mask);
- gimple_call_set_nothrow (call, true);
- new_stmt = call;
- data_ref = NULL_TREE;
+ unsigned HOST_WIDE_INT gap_offset
+ = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
+ tree gapcst = build_int_cst (ref_type, gap_offset);
+ offset = size_binop (PLUS_EXPR, offset, gapcst);
}
+ data_ref
+ = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+ if (alignment_support_scheme == dr_aligned)
+ ;
else
+ TREE_TYPE (data_ref)
+ = build_aligned_type (TREE_TYPE (data_ref),
+ align * BITS_PER_UNIT);
+ if (ltype != vectype)
{
- tree ltype = vectype;
- tree new_vtype = NULL_TREE;
- unsigned HOST_WIDE_INT gap
- = DR_GROUP_GAP (first_stmt_info);
- unsigned int vect_align
- = vect_known_alignment_in_bytes (first_dr_info,
- vectype);
- unsigned int scalar_dr_size
- = vect_get_scalar_dr_size (first_dr_info);
- /* If there's no peeling for gaps but we have a gap
- with slp loads then load the lower half of the
- vector only. See get_group_load_store_type for
- when we apply this optimization. */
- if (slp
- && loop_vinfo
- && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- && gap != 0
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && gap >= (vect_align / scalar_dr_size))
+ vect_copy_ref_info (data_ref,
+ DR_REF (first_dr_info->dr));
+ tree tem = make_ssa_name (ltype);
+ new_stmt = gimple_build_assign (tem, data_ref);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+ gsi);
+ data_ref = NULL;
+ vec<constructor_elt, va_gc> *v;
+ vec_alloc (v, 2);
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
{
- tree half_vtype;
- new_vtype
- = vector_vector_composition_type (vectype, 2,
- &half_vtype);
- if (new_vtype != NULL_TREE)
- ltype = half_vtype;
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+ build_zero_cst (ltype));
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
}
- tree offset
- = (dataref_offset ? dataref_offset
- : build_int_cst (ref_type, 0));
- if (ltype != vectype
- && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ else
{
- unsigned HOST_WIDE_INT gap_offset
- = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
- tree gapcst = build_int_cst (ref_type, gap_offset);
- offset = size_binop (PLUS_EXPR, offset, gapcst);
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+ build_zero_cst (ltype));
}
- data_ref
- = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
- if (alignment_support_scheme == dr_aligned)
- ;
+ gcc_assert (new_vtype != NULL_TREE);
+ if (new_vtype == vectype)
+ new_stmt = gimple_build_assign (
+ vec_dest, build_constructor (vectype, v));
else
- TREE_TYPE (data_ref)
- = build_aligned_type (TREE_TYPE (data_ref),
- align * BITS_PER_UNIT);
- if (ltype != vectype)
{
- vect_copy_ref_info (data_ref,
- DR_REF (first_dr_info->dr));
- tree tem = make_ssa_name (ltype);
- new_stmt = gimple_build_assign (tem, data_ref);
+ tree new_vname = make_ssa_name (new_vtype);
+ new_stmt = gimple_build_assign (
+ new_vname, build_constructor (new_vtype, v));
vect_finish_stmt_generation (vinfo, stmt_info,
new_stmt, gsi);
- data_ref = NULL;
- vec<constructor_elt, va_gc> *v;
- vec_alloc (v, 2);
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
- {
- CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
- build_zero_cst (ltype));
- CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
- }
- else
- {
- CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
- CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
- build_zero_cst (ltype));
- }
- gcc_assert (new_vtype != NULL_TREE);
- if (new_vtype == vectype)
- new_stmt = gimple_build_assign (
- vec_dest, build_constructor (vectype, v));
- else
- {
- tree new_vname = make_ssa_name (new_vtype);
- new_stmt = gimple_build_assign (
- new_vname, build_constructor (new_vtype, v));
- vect_finish_stmt_generation (vinfo, stmt_info,
- new_stmt, gsi);
- new_stmt = gimple_build_assign (
- vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
- new_vname));
- }
+ new_stmt = gimple_build_assign (
+ vec_dest,
+ build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
}
}
- break;
}
- case dr_explicit_realign:
- {
- if (costing_p)
- break;
- tree ptr, bump;
-
- tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+ break;
+ }
+ case dr_explicit_realign:
+ {
+ if (costing_p)
+ break;
+ tree ptr, bump;
- if (compute_in_loop)
- msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
- &realignment_token,
- dr_explicit_realign,
- dataref_ptr, NULL);
+ tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
- if (TREE_CODE (dataref_ptr) == SSA_NAME)
- ptr = copy_ssa_name (dataref_ptr);
- else
- ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
- // For explicit realign the target alignment should be
- // known at compile time.
- unsigned HOST_WIDE_INT align =
- DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
- new_stmt = gimple_build_assign
- (ptr, BIT_AND_EXPR, dataref_ptr,
- build_int_cst
- (TREE_TYPE (dataref_ptr),
- -(HOST_WIDE_INT) align));
- vect_finish_stmt_generation (vinfo, stmt_info,
- new_stmt, gsi);
- data_ref
- = build2 (MEM_REF, vectype, ptr,
- build_int_cst (ref_type, 0));
- vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
- vec_dest = vect_create_destination_var (scalar_dest,
- vectype);
- new_stmt = gimple_build_assign (vec_dest, data_ref);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_assign_set_lhs (new_stmt, new_temp);
- gimple_move_vops (new_stmt, stmt_info->stmt);
- vect_finish_stmt_generation (vinfo, stmt_info,
- new_stmt, gsi);
- msq = new_temp;
-
- bump = size_binop (MULT_EXPR, vs,
- TYPE_SIZE_UNIT (elem_type));
- bump = size_binop (MINUS_EXPR, bump, size_one_node);
- ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
- stmt_info, bump);
- new_stmt = gimple_build_assign
- (NULL_TREE, BIT_AND_EXPR, ptr,
- build_int_cst
- (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
- if (TREE_CODE (ptr) == SSA_NAME)
- ptr = copy_ssa_name (ptr, new_stmt);
- else
- ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
- gimple_assign_set_lhs (new_stmt, ptr);
- vect_finish_stmt_generation (vinfo, stmt_info,
- new_stmt, gsi);
- data_ref
- = build2 (MEM_REF, vectype, ptr,
- build_int_cst (ref_type, 0));
- break;
- }
- case dr_explicit_realign_optimized:
- {
- if (costing_p)
- break;
- if (TREE_CODE (dataref_ptr) == SSA_NAME)
- new_temp = copy_ssa_name (dataref_ptr);
- else
- new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
- // We should only be doing this if we know the target
- // alignment at compile time.
- unsigned HOST_WIDE_INT align =
- DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
- new_stmt = gimple_build_assign
- (new_temp, BIT_AND_EXPR, dataref_ptr,
- build_int_cst (TREE_TYPE (dataref_ptr),
- -(HOST_WIDE_INT) align));
- vect_finish_stmt_generation (vinfo, stmt_info,
- new_stmt, gsi);
- data_ref
- = build2 (MEM_REF, vectype, new_temp,
- build_int_cst (ref_type, 0));
- break;
- }
- default:
- gcc_unreachable ();
- }
+ if (compute_in_loop)
+ msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
+ &realignment_token,
+ dr_explicit_realign,
+ dataref_ptr, NULL);
+
+ if (TREE_CODE (dataref_ptr) == SSA_NAME)
+ ptr = copy_ssa_name (dataref_ptr);
+ else
+ ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
+ // For explicit realign the target alignment should be
+ // known at compile time.
+ unsigned HOST_WIDE_INT align
+ = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+ new_stmt = gimple_build_assign (
+ ptr, BIT_AND_EXPR, dataref_ptr,
+ build_int_cst (TREE_TYPE (dataref_ptr),
+ -(HOST_WIDE_INT) align));
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ data_ref
+ = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+ vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ new_stmt = gimple_build_assign (vec_dest, data_ref);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_temp);
+ gimple_move_vops (new_stmt, stmt_info->stmt);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ msq = new_temp;
+
+ bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
+ bump = size_binop (MINUS_EXPR, bump, size_one_node);
+ ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
+ bump);
+ new_stmt = gimple_build_assign (
+ NULL_TREE, BIT_AND_EXPR, ptr,
+ build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
+ if (TREE_CODE (ptr) == SSA_NAME)
+ ptr = copy_ssa_name (ptr, new_stmt);
+ else
+ ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
+ gimple_assign_set_lhs (new_stmt, ptr);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ data_ref
+ = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+ break;
+ }
+ case dr_explicit_realign_optimized:
+ {
+ if (costing_p)
+ break;
+ if (TREE_CODE (dataref_ptr) == SSA_NAME)
+ new_temp = copy_ssa_name (dataref_ptr);
+ else
+ new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
+ // We should only be doing this if we know the target
+ // alignment at compile time.
+ unsigned HOST_WIDE_INT align
+ = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+ new_stmt = gimple_build_assign (
+ new_temp, BIT_AND_EXPR, dataref_ptr,
+ build_int_cst (TREE_TYPE (dataref_ptr),
+ -(HOST_WIDE_INT) align));
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ data_ref = build2 (MEM_REF, vectype, new_temp,
+ build_int_cst (ref_type, 0));
+ break;
+ }
+ default:
+ gcc_unreachable ();
+ }
- /* One common place to cost the above vect load for different
- alignment support schemes. */
- if (costing_p)
- {
- /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
- only need to take care of the first stmt, whose
- stmt_info is first_stmt_info, vec_num iterating on it
- will cover the cost for the remaining, it's consistent
- with transforming. For the prologue cost for realign,
- we only need to count it once for the whole group. */
- bool first_stmt_info_p = first_stmt_info == stmt_info;
- bool add_realign_cost = first_stmt_info_p && i == 0;
- if (memory_access_type == VMAT_CONTIGUOUS
- || memory_access_type == VMAT_CONTIGUOUS_REVERSE
- || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
- && (!grouped_load || first_stmt_info_p)))
- vect_get_load_cost (vinfo, stmt_info, 1,
- alignment_support_scheme, misalignment,
- add_realign_cost, &inside_cost,
- &prologue_cost, cost_vec, cost_vec,
- true);
- }
- else
+ /* One common place to cost the above vect load for different
+ alignment support schemes. */
+ if (costing_p)
+ {
+ /* For a grouped VMAT_CONTIGUOUS_PERMUTE load we only need
+ to cost the first stmt, whose stmt_info is
+ first_stmt_info; iterating vec_num times on it covers
+ the cost for the remaining stmts, consistent with the
+ transform. The prologue cost for realign only needs to
+ be counted once for the whole group. */
+ bool first_stmt_info_p = first_stmt_info == stmt_info;
+ bool add_realign_cost = first_stmt_info_p && i == 0;
+ if (memory_access_type == VMAT_CONTIGUOUS
+ || memory_access_type == VMAT_CONTIGUOUS_REVERSE
+ || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
+ && (!grouped_load || first_stmt_info_p)))
+ vect_get_load_cost (vinfo, stmt_info, 1,
+ alignment_support_scheme, misalignment,
+ add_realign_cost, &inside_cost,
+ &prologue_cost, cost_vec, cost_vec, true);
+ }
+ else
+ {
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ /* DATA_REF is null if we've already built the statement. */
+ if (data_ref)
{
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- /* DATA_REF is null if we've already built the statement. */
- if (data_ref)
- {
- vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
- new_stmt = gimple_build_assign (vec_dest, data_ref);
- }
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_set_lhs (new_stmt, new_temp);
- vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+ new_stmt = gimple_build_assign (vec_dest, data_ref);
}
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ }
- /* 3. Handle explicit realignment if necessary/supported.
- Create in loop:
- vec_dest = realign_load (msq, lsq, realignment_token) */
- if (!costing_p
- && (alignment_support_scheme == dr_explicit_realign_optimized
- || alignment_support_scheme == dr_explicit_realign))
- {
- lsq = gimple_assign_lhs (new_stmt);
- if (!realignment_token)
- realignment_token = dataref_ptr;
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
- msq, lsq, realignment_token);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_assign_set_lhs (new_stmt, new_temp);
- vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ /* 3. Handle explicit realignment if necessary/supported.
+ Create in loop:
+ vec_dest = realign_load (msq, lsq, realignment_token) */
+ if (!costing_p
+ && (alignment_support_scheme == dr_explicit_realign_optimized
+ || alignment_support_scheme == dr_explicit_realign))
+ {
+ lsq = gimple_assign_lhs (new_stmt);
+ if (!realignment_token)
+ realignment_token = dataref_ptr;
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
+ lsq, realignment_token);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
- if (alignment_support_scheme == dr_explicit_realign_optimized)
- {
- gcc_assert (phi);
- if (i == vec_num - 1 && j == ncopies - 1)
- add_phi_arg (phi, lsq,
- loop_latch_edge (containing_loop),
- UNKNOWN_LOCATION);
- msq = lsq;
- }
+ if (alignment_support_scheme == dr_explicit_realign_optimized)
+ {
+ gcc_assert (phi);
+ if (i == vec_num - 1 && j == ncopies - 1)
+ add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
+ UNKNOWN_LOCATION);
+ msq = lsq;
}
+ }
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ {
+ if (costing_p)
+ inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
+ stmt_info, 0, vect_body);
+ else
{
- if (costing_p)
- inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
- stmt_info, 0, vect_body);
- else
- {
- tree perm_mask = perm_mask_for_reverse (vectype);
- new_temp
- = permute_vec_elements (vinfo, new_temp, new_temp,
- perm_mask, stmt_info, gsi);
- new_stmt = SSA_NAME_DEF_STMT (new_temp);
- }
+ tree perm_mask = perm_mask_for_reverse (vectype);
+ new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
+ perm_mask, stmt_info, gsi);
+ new_stmt = SSA_NAME_DEF_STMT (new_temp);
}
+ }
- /* Collect vector loads and later create their permutation in
- vect_transform_grouped_load (). */
- if (!costing_p && (grouped_load || slp_perm))
- dr_chain.quick_push (new_temp);
+ /* Collect vector loads and later create their permutation in
+ vect_transform_grouped_load (). */
+ if (!costing_p && (grouped_load || slp_perm))
+ dr_chain.quick_push (new_temp);
- /* Store vector loads in the corresponding SLP_NODE. */
- if (!costing_p && slp && !slp_perm)
- slp_node->push_vec_def (new_stmt);
+ /* Store vector loads in the corresponding SLP_NODE. */
+ if (!costing_p && slp && !slp_perm)
+ slp_node->push_vec_def (new_stmt);
- /* With SLP permutation we load the gaps as well, without
- we need to skip the gaps after we manage to fully load
- all elements. group_gap_adj is DR_GROUP_SIZE here. */
- group_elt += nunits;
- if (!costing_p
- && maybe_ne (group_gap_adj, 0U)
- && !slp_perm
- && known_eq (group_elt, group_size - group_gap_adj))
- {
- poly_wide_int bump_val
- = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
- * group_gap_adj);
- if (tree_int_cst_sgn
- (vect_dr_behavior (vinfo, dr_info)->step) == -1)
- bump_val = -bump_val;
- tree bump = wide_int_to_tree (sizetype, bump_val);
- dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
- gsi, stmt_info, bump);
- group_elt = 0;
- }
- }
- /* Bump the vector pointer to account for a gap or for excess
- elements loaded for a permuted SLP load. */
+ /* With SLP permutation we load the gaps as well, without
+ we need to skip the gaps after we manage to fully load
+ all elements. group_gap_adj is DR_GROUP_SIZE here. */
+ group_elt += nunits;
if (!costing_p
&& maybe_ne (group_gap_adj, 0U)
- && slp_perm)
+ && !slp_perm
+ && known_eq (group_elt, group_size - group_gap_adj))
{
poly_wide_int bump_val
- = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
- * group_gap_adj);
- if (tree_int_cst_sgn
- (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+ = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+ if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
+ == -1)
bump_val = -bump_val;
tree bump = wide_int_to_tree (sizetype, bump_val);
dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
stmt_info, bump);
+ group_elt = 0;
}
}
+ /* Bump the vector pointer to account for a gap or for excess
+ elements loaded for a permuted SLP load. */
+ if (!costing_p
+ && maybe_ne (group_gap_adj, 0U)
+ && slp_perm)
+ {
+ poly_wide_int bump_val
+ = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+ if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+ bump_val = -bump_val;
+ tree bump = wide_int_to_tree (sizetype, bump_val);
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+ stmt_info, bump);
+ }
if (slp && !slp_perm)
continue;
}
}
else
- {
- if (grouped_load)
- {
- if (memory_access_type != VMAT_LOAD_STORE_LANES)
+ {
+ if (grouped_load)
+ {
+ gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
+ /* We assume that the cost of a single load-lanes instruction
+ is equivalent to the cost of DR_GROUP_SIZE separate loads.
+ If a grouped access is instead being provided by a
+ load-and-permute operation, include the cost of the
+ permutes. */
+ if (costing_p && first_stmt_info == stmt_info)
{
- gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
- /* We assume that the cost of a single load-lanes instruction
- is equivalent to the cost of DR_GROUP_SIZE separate loads.
- If a grouped access is instead being provided by a
- load-and-permute operation, include the cost of the
- permutes. */
- if (costing_p && first_stmt_info == stmt_info)
- {
- /* Uses an even and odd extract operations or shuffle
- operations for each needed permute. */
- int group_size = DR_GROUP_SIZE (first_stmt_info);
- int nstmts = ceil_log2 (group_size) * group_size;
- inside_cost
- += record_stmt_cost (cost_vec, nstmts, vec_perm,
- stmt_info, 0, vect_body);
+ /* Uses an even and odd extract operations or shuffle
+ operations for each needed permute. */
+ int group_size = DR_GROUP_SIZE (first_stmt_info);
+ int nstmts = ceil_log2 (group_size) * group_size;
+ inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
+ stmt_info, 0, vect_body);
- if (dump_enabled_p ())
- dump_printf_loc (
- MSG_NOTE, vect_location,
- "vect_model_load_cost: strided group_size = %d .\n",
- group_size);
- }
- else if (!costing_p)
- vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
- group_size, gsi);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_load_cost:"
+ "strided group_size = %d .\n",
+ group_size);
+ }
+ else if (!costing_p)
+ {
+ vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
+ group_size, gsi);
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
- if (!costing_p)
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
else if (!costing_p)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
{
gcc_assert (memory_access_type != VMAT_INVARIANT
&& memory_access_type != VMAT_ELEMENTWISE
- && memory_access_type != VMAT_STRIDED_SLP);
+ && memory_access_type != VMAT_STRIDED_SLP
+ && memory_access_type != VMAT_LOAD_STORE_LANES);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"vect_model_load_cost: inside_cost = %u, "